示例#1
0
def main():
    """Prepare the parser's output directories, then process cases forever.

    On every pass the contents of ``constants.SCRAPED_DIR`` are fetched with
    ``get_cases`` and handed to ``run``; the loop then sleeps for ten minutes.
    The function never returns on its own.
    """
    logger = Logger('parser.log', constants.LOG_DIR).get_logger()
    logger.info("parser is starting")

    # One directory per outcome; all of them must exist before the first pass.
    for directory in (
            constants.PARSED_SUCCESS_DIR,
            constants.PARSED_FAILED_DIR,
            constants.PARSED_FAILED_VALIDATION_DIR,
    ):
        create_dir(directory)

    while True:
        run(logger, get_cases(constants.SCRAPED_DIR))
        call_sleep(logger=logger, minutes=10)
def main():
    """Initialise the Elasticsearch index and, on success, loop forever.

    When ``init_index`` reports failure the function returns immediately.
    Otherwise each pass calls ``run`` and ``save_all`` on the ``Elastic``
    instance and then sleeps for ten minutes.
    """
    logger = Logger('elasticsearch.log', constants.LOG_DIR).get_logger()
    elastic = Elastic(logger)

    # Nothing to do without an index.
    if not elastic.init_index():
        return

    logger.info(
        f"{constants.ELASTIC_INDEX_NAME} index created successfully")
    while True:
        elastic.run()
        elastic.save_all()
        call_sleep(logger=logger, minutes=10)
import threading
from concurrent.futures import ThreadPoolExecutor
from hcva.utils import constants
from hcva.scraper import scraper
from hcva.utils.database import Database
from hcva.utils.logger import Logger
from hcva.utils.path import create_dir

# Module-level logger built from the 'app.log' name and constants.LOG_DIR.
logger = Logger('app.log', constants.LOG_DIR).get_logger()
# Database handle created at import time and used by the functions below.
db = Database()


def scrape(date):
    """Scrape a single date and record the outcome in the database.

    :param date: mapping that holds the date to scrape under the ``'date'`` key
    :return: None. The date's status is set to ``'done'`` on success and to
        ``'error'`` when anything in the scrape or the status update raises.
    """
    day = date['date']
    worker = threading.current_thread().name
    logger.info(f'starting thread #{worker} for date: {day}')
    try:
        scraper.get(day)
        logger.info(f'saving {day} cases to filesystem')
        db.update_status(day, 'done')
    except Exception as e:
        # Log at error level with the traceback; an INFO line alone hides
        # where the scrape actually failed.
        logger.exception(f'failed to scrape date: {day}, reason: {e}')
        db.update_status(day, 'error')
    logger.info(f'thread #{worker} finished')


def main():
    create_dir(constants.SCRAPED_DIR)
    db.init_collection(constants.DB_NAME, constants.COLLECTION_NAME)
    dates = db.get_dates()
    threads = int(constants.NUM_OF_CRAWLERS)
    with ThreadPoolExecutor(max_workers=threads) as executor:
class Database:
    """MongoDB-backed store of scrape dates and their status.

    Documents carry a ``'date'`` and a ``'status'`` field. ``init_collection``
    must be called before any method that touches ``self.collection``.
    """

    # Created once at class-definition time and shared by all instances.
    logger = Logger('db.log', constants.LOG_DIR).get_logger()
    # Bound by init_collection(); None until then.
    collection = None

    def __init__(self):
        """Create the Mongo client for ``constants.DB_URI``."""
        self.client = MongoClient(constants.DB_URI)
        self.get_connection()

    def get_connection(self):
        """Return the client, or None if a server-selection timeout is raised.

        NOTE(review): only an attribute is read inside the ``try``, so the
        timeout branch looks unreachable as written - confirm whether an
        explicit server check was intended here.
        """
        try:
            self.logger.info('db trying to connect...')
            connection = self.client
            self.logger.info('db connected')
        except ServerSelectionTimeoutError as err:
            message = 'db connection Timeout - check for if this machine ip is on whitelist'
            if self.logger is not None:
                self.logger.exception(f'{message} {err}')
            else:
                print(f'{message} {err}')
            connection = None
        return connection

    def get_db(self, name):
        """Return the database called ``name`` from the client."""
        self.logger.info(f'db trying to get db: {name}')
        db = self.client[name]
        self.logger.info(f'got db: {db.name}')
        return db

    def get_collection(self, db, collection_name):
        """Return the collection ``collection_name`` of the database ``db``."""
        collection = db.get_collection(collection_name)
        self.logger.info(f'got collection: {collection_name}')
        return collection

    # date format: %d-%m-%Y
    def update_status(self, date, status):
        """Set the ``'status'`` field of the document whose ``'date'`` matches."""
        self.logger.info(f'setting {date} status to: {status}')
        self.collection.update_one(
            {'date': date},
            {'$set': {'status': status}},
        )

    def create_date(self, date):
        """Insert a new document for ``date`` with status ``'available'``."""
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.collection.insert_one({
            'date': date,
            'status': 'available'
        })

    def get_dates(self):
        """Return a cursor over the documents whose status is ``'available'``.

        The ``'_id'`` and ``'status'`` fields are projected out of the
        returned documents.
        """
        query = {'status': 'available'}
        res = self.collection.find(query, {'_id': 0, 'status': 0})
        # Cursor.count() was removed in PyMongo 4; count on the collection
        # with the same filter instead.
        self.logger.info(f'found {self.collection.count_documents(query)} dates')
        return res

    def create_collection(self):
        """Fill the bound collection with the documents from ``create_docs``."""
        docs = create_docs()
        self.collection.insert_many(docs)

    def init_collection(self, db_name, collection_name):
        """Bind ``self.collection`` and populate it when it is empty.

        :param db_name: name of the database that holds the collection
        :param collection_name: name of the collection to bind
        :return: the bound collection
        """
        self.collection = self.client[db_name].get_collection(collection_name)
        # Collection.count() was removed in PyMongo 4.
        if self.collection.count_documents({}) == 0:
            self.logger.info(f'initializing collection: {collection_name}')
            self.create_collection()

        self.logger.info(f'collection @{collection_name} initialized')

        return self.collection
import time
from hcva.scraper.crawler import Crawler
from hcva.utils import constants
from hcva.utils.json import save_data
from hcva.utils.time import call_sleep
from hcva.utils.logger import Logger

# Module-level logger built from the 'scraper.log' name and constants.LOG_DIR.
logger = Logger('scraper.log', constants.LOG_DIR).get_logger()
# Search page URL with its fixed query parameters; date parameters are appended to it.
BASE_URL = 'https://supreme.court.gov.il/Pages/SearchJudgments.aspx?&DateType=2&freeText=null&CaseNumber=null'


def build_url(date):
    """Return the search URL restricted to a single day.

    The same ``date`` value is used for both the ``COpenDate`` and the
    ``CEndDate`` query parameters appended to ``BASE_URL``.
    """
    date_range = f'&COpenDate={date}&CEndDate={date}'
    return f'{BASE_URL}{date_range}'


def get_frame(crawler, elem_type, string):
    """Locate a frame through ``crawler`` and switch into it.

    :param crawler: object offering ``find_elem`` and ``switch_frame``
    :param elem_type: locator type passed on to ``crawler.find_elem``
    :param string: locator value passed on to ``crawler.find_elem``
    :return: the result of ``crawler.switch_frame`` when the frame is found,
        otherwise False
    """
    target = crawler.find_elem(elem_type, string)
    if target is None:
        print(f'could not switch to frame: {string}')
        return False

    return crawler.switch_frame(target)


def get_num_cases(crawler):
    case_num_loc = ['/html/body/div[2]/div/form/section/div/div']
    crawler.switch_to_default_content()
    get_frame(crawler, 'id', 'serviceFram')
    for location in case_num_loc:
        elem = crawler.find_elem('xpath', location, delay=8)
        if elem is not None:
示例#6
0
def main():
    """Index into Elasticsearch in an endless loop, pausing between passes.

    A new ``Elastic_5_5_3`` object is built for every pass; after
    ``start_index`` returns, the loop sleeps for ten minutes.
    """
    log_dir = get_path(n=0) + f'logs{sep}'
    _logger = Logger('elasticsearch.log', log_dir).get_logger()
    while True:
        indexer = Elastic_5_5_3(_logger)
        indexer.start_index()  # start indexing into the elastic DB
        # wait a bit once the pass is finished
        call_sleep(logger=_logger, minutes=10)
示例#7
0
 def __init__(self, url):
     """Create a logger named after the current thread, start the browser and load ``url``."""
     log_name = f'crawler_{threading.current_thread().name}.log'
     self._logger = Logger(log_name, constants.LOG_DIR).get_logger()
     browser = self.get_browser()
     self._driver = browser
     browser.get(url)
     self._logger.info('crawler created')
示例#8
0
class Crawler:
    """Wrapper around a web driver that logs every action it performs.

    The class-level values below are defaults; ``__init__`` replaces
    ``_logger`` and ``_driver`` on each instance.
    """
    _driver = None  # Web Driver
    _delay = 2  # Timer for finding web element as int
    _url = None  # starting page as string
    _text_query = None  # latest text get as string
    _logger = None  # logging log class

    def __init__(self, url):
        """Create a per-thread logger, start the browser and load ``url``.

        :param url: page opened right after the browser starts; also kept in
            ``self._url``
        :raises Exception: whatever the driver raises while loading ``url``;
            the browser is shut down before the error propagates
        """
        self._logger = Logger(f'crawler_{threading.current_thread().name}.log',
                              constants.LOG_DIR).get_logger()
        self._url = url
        self._driver = self.get_browser()
        try:
            self._driver.get(url)
        except Exception:
            # The caller never receives a half-built crawler, so nobody else
            # could close this browser.
            self._logger.exception(f'could not open: {url}')
            if self._driver is not None:
                self._driver.quit()
            raise
        self._logger.info('crawler created')

    def get_browser(self):
        """Start the web driver selected by ``constants.BROWSER_TYPE``.

        The driver binary is taken from
        ``<ROOT_DIR>/hcva/scraper/web_drivers/<os_type>/``; on Windows the
        file name gets an ``.exe`` suffix. The browser runs headless when
        ``constants.HEADLESS`` equals the string ``'true'``.

        :return: a Chrome or Firefox driver, or None when the configured
            browser type is neither ``'chrome'`` nor ``'firefox'``
        """
        browser = constants.BROWSER_TYPE
        os_type = get_os_type()
        driver_dir = constants.ROOT_DIR + f'/hcva/scraper/web_drivers/{os_type}'
        suffix = '.exe' if os_type == 'windows' else ''

        self._logger.debug(f'attempting to open browser: {browser}')
        if browser == 'chrome':
            options = webdriver.ChromeOptions()
            if constants.HEADLESS == 'true':
                options.add_argument("--headless")

            # `options=` is the supported keyword; `chrome_options=` is deprecated.
            return webdriver.Chrome(
                options=options,
                executable_path=f'{driver_dir}/chromedriver{suffix}')

        if browser == 'firefox':
            options = webdriver.FirefoxOptions()
            if constants.HEADLESS == 'true':
                options.add_argument("--headless")

            # `options=` is the supported keyword; `firefox_options=` is deprecated.
            return webdriver.Firefox(
                options=options,
                executable_path=f'{driver_dir}/geckodriver{suffix}')

        self._logger.error(f'browser type is invalid: {browser}')
        return None

    # input - update as boolean
    # output - return string if true, else None
    # do - return the last text that got scraped
    def get_text_query(self, update=True):
        if update:
            return self._text_query
        return None

    # output - return True if successful
    # do - close web driver
    def close(self):
        self._driver.quit()  # close the browser
        message = 'Closing browser'
        self._logger.info(message)
        return True

    # output - return True if successful
    # do - return to previous page
    def go_back(self):
        self._driver.back()
        message = 'went to previous page'
        self._logger.info(message)
        return True

    # output - return True if successful
    # do - refresh page loaded on crawler
    def refresh(self):
        self._driver.refresh()
        message = 'Refresh page'
        self._logger.info(message)
        return True

    # input - frame as web element
    # output - return True once the switch is done
    # do - switch the driver's focus into the given frame
    def switch_frame(self, frame):
        self._driver.switch_to.frame(frame)
        message = 'switch to frame'
        self._logger.info(message)
        return True

    # do - switch windows handle after case was clicked
    def switch_window_handle(self, index=0):
        window = self._driver.window_handles[index]
        self._driver.switch_to.window(window)
        message = 'switch window handle'
        self._logger.info(message)
        return True

    # do - switch to default content
    def switch_to_default_content(self):
        self._driver.switch_to.default_content()
        message = 'switch to default content'
        self._logger.info(message)
        return True

    def get_page_source(self):
        page_source = self._driver.page_source
        message = 'Got page Source'
        self._logger.info(message)
        return page_source

    # input - elem_type as string, string as string
    # output - return element if found in <delay> seconds, None otherwise
    def find_elem(self,
                  elem_type,
                  string,
                  single_element=True,
                  driver=None,
                  delay=2,
                  raise_error=True):
        """Wait until an element is present, then return it.

        :param elem_type: locator type - one of 'xpath', 'id', 'tag', 'name',
            'link_text', 'partial_link_text', 'css' or 'class_name'
        :param string: locator value for the chosen type
        :param single_element: True returns the first match, False returns
            the list of all matches
        :param driver: object to search from; defaults to the crawler's driver
        :param delay: seconds to wait for the element to be present
        :param raise_error: True logs a failed lookup with its traceback,
            False logs only the failure message
        :return: the element (or list of elements); None when the locator
            type is unknown or the lookup fails
        """
        driver = self._driver if driver is None else driver
        locators = {
            'xpath': By.XPATH,
            'id': By.ID,
            'tag': By.TAG_NAME,
            'name': By.NAME,
            'link_text': By.LINK_TEXT,
            'partial_link_text': By.PARTIAL_LINK_TEXT,
            'css': By.CSS_SELECTOR,
            'class_name': By.CLASS_NAME,
        }
        by = locators.get(elem_type)
        if by is None:
            self._logger.error(
                f'find_element function do not have: {elem_type}')
            return None

        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((by, string)))
            if single_element:
                elem = driver.find_element(by, string)
            else:
                elem = driver.find_elements(by, string)

        except TimeoutException:  # did not find elem in time
            self._log_find_failure(
                f'Did not find elem: {string}, type: {elem_type}, delay: {delay} in time',
                raise_error)
            return None

        except ElementNotVisibleException:
            self._log_find_failure(
                f'Elem is not visible: {string}, type: {elem_type}',
                raise_error)
            return None

        except NoSuchElementException:
            self._log_find_failure(
                f'No Such elem: {string}, type: {elem_type}', raise_error)
            return None

        self._logger.info(f'found elem: {string}, type: {elem_type}')
        return elem

    def _log_find_failure(self, message, with_traceback):
        """Log a failed lookup; must be called from inside an ``except`` block."""
        if with_traceback:
            self._logger.exception(message)
        else:
            self._logger.error(message)

    # input - driver as web driver, elem as web element
    # output - return True if successful, otherwise False
    # do - hover the elem
    def hover_elem(self, driver, elem):
        """Move the pointer over ``elem``.

        :param driver: web driver the action chain is built on
        :param elem: web element to hover
        :return: True on success, False when any exception is raised
        """
        try:
            ActionChains(driver).move_to_element(elem).perform()
            self._logger.info('elem got hovered')
            return True

        except Exception:
            self._logger.exception('Could not hover that')
            return False

    # input - elem as web element, option as string (the visible text to select)
    # output - return True if successful, otherwise False
    # do - select the option in elem
    def select_elem(self, elem, option):
        message = None
        if type(option) is not str:  # if value is not string
            message = f'option should be string and not {type(option)}'
        try:
            select = Select(elem)  # select the elem
            select.select_by_visible_text(option)  # select by text
            message = 'elem got selected'
            self._logger.info(message)
            return True

        except ElementNotSelectableException as _:
            if message is None:
                message = 'Could not select that'
            self._logger.error(message)
            return False

    # input - elem as web element
    # output - return True if successful, otherwise False
    # do - click the elem
    def click_elem(self, elem):
        try:
            if elem is not None:
                elem.click()  # click the elem
                message = 'element got clicked'
            else:
                message = 'didnt got element to click - got None instead'
            self._logger.info(message)
            return True

        except ElementClickInterceptedException as _:
            message = 'Element Click Intercepted'
            self._logger.exception(message)
            return False

        except ElementNotInteractableException as _:
            message = 'Element Not Interactable'
            self._logger.exception(message)
            return False

    # input - elem as web element, data as string
    # output - return True if successful, otherwise False
    # do - send the text box elem string
    def send_data_to_elem(self, elem, data, to_clear=True):
        try:
            message = ''
            if to_clear:
                elem.clear()  # clear text box
                message = 'text box got cleared'
            elem.send_keys(data)  # type sting into text box
            message += ', element got the data'
            self._logger.info(message)
            return True

        except Exception as _:
            message = 'Could not send elem this data'
            self._logger.exception(message)
            return False

    # input - elem as web element
    # output - return True if successful, otherwise False
    # do - get the elem text
    def read_elem_text(self, elem):
        try:
            text = elem.text  # get elem text
            self._text_query = text
            message = f'Got text from elem: {text}'
            self._logger.info(message)
            return True

        except Exception as _:
            self._text_query = None
            message = 'Could not get elem text'
            self._logger.exception(message)
            return False

    # input - elem as web element
    # output - return True if elem is not None, otherwise False
    # do - scroll the page so that elem is in view
    @staticmethod
    def scroll_to_elem(elem):
        if elem is not None:
            elem.location_once_scrolled_into_view  # don't put ()
            return True
        return False

    # input - driver as web driver, elem as web element, string as string (the script)
    # output - return True if successful, otherwise False
    # do - execute the script with elem as its argument
    def exec_script(self, driver, elem, string):
        try:
            driver.execute_script(string, elem)
            message = 'Script executed'
            self._logger.info(message)
            return True

        except JavascriptException as _:
            message = 'Could not execute script'
            self._logger.exception(message)
            return False

    # input - driver as web driver (defaults to the crawler's own), result as string
    # output - return True if an alert was handled, False if no alert is present
    # do - store the alert message; accept the alert if result is 'accept', dismiss it if result is 'dismiss'
    def alert_handle(self, driver=None, result=None):
        driver = self._driver if driver is None else driver
        try:
            obj = driver.switch_to.alert  # driver focus on alert window
            text = f'alert massage says: {obj.text}'  # take alert window massage

            if result is not None:
                if result == 'accept':
                    obj.accept()  # accept alert window
                elif result == 'dismiss':
                    obj.dismiss()  # dismiss alert window

            driver.switch_to.default_content()  # return to main window

            self._text_query = text
            message = f'Alert say: {text}'
            self._logger.info(message)
            return True

        except NoAlertPresentException as _:
            message = 'Did not find any alert'
            self._logger.error(message)
            return False