def main():
    """Entry point of the parser service.

    Builds the parser logger, makes sure the three output directories for
    parsed cases exist, then loops forever: collect the cases found under
    ``constants.SCRAPED_DIR``, hand them to ``run`` and call ``call_sleep``
    with ``minutes=10`` before the next pass. The function never returns.
    """
    log = Logger('parser.log', constants.LOG_DIR).get_logger()
    log.info("parser is starting")

    # Output locations: parsed OK, failed to parse, failed validation.
    for output_dir in (
        constants.PARSED_SUCCESS_DIR,
        constants.PARSED_FAILED_DIR,
        constants.PARSED_FAILED_VALIDATION_DIR,
    ):
        create_dir(output_dir)

    while True:
        run(log, get_cases(constants.SCRAPED_DIR))
        call_sleep(logger=log, minutes=10)
def main():
    """Entry point of the Elasticsearch indexing service.

    Creates the logger and the ``Elastic`` helper, initialises the index
    (logging a message when ``init_index`` reports that it created one) and
    then loops forever: ``run``, ``save_all``, then ``call_sleep`` with
    ``minutes=10``. The function never returns.
    """
    log = Logger('elasticsearch.log', constants.LOG_DIR).get_logger()
    elastic = Elastic(log)

    if elastic.init_index():
        log.info(
            f"{constants.ELASTIC_INDEX_NAME} index created successfully")

    while True:
        elastic.run()
        elastic.save_all()
        call_sleep(logger=log, minutes=10)
import threading from concurrent.futures import ThreadPoolExecutor from hcva.utils import constants from hcva.scraper import scraper from hcva.utils.database import Database from hcva.utils.logger import Logger from hcva.utils.path import create_dir logger = Logger('app.log', constants.LOG_DIR).get_logger() db = Database() def scrape(date): date = date['date'] logger.info(f'starting thread #{threading.current_thread().name} for date: {date}') try: scraper.get(date) logger.info(f'saving {date} cases to filesystem') db.update_status(date, 'done') except Exception as e: logger.info(f'failed to scrape date: {date}, reason: {e}') db.update_status(date, 'error') logger.info(f'thread #{threading.current_thread().name} finished') def main(): create_dir(constants.SCRAPED_DIR) db.init_collection(constants.DB_NAME, constants.COLLECTION_NAME) dates = db.get_dates() threads = int(constants.NUM_OF_CRAWLERS) with ThreadPoolExecutor(max_workers=threads) as executor:
class Database:
    """Small wrapper around a MongoDB collection of per-date scraping statuses.

    Documents handled here have the form ``{'date': ..., 'status': ...}``.
    ``init_collection`` must be called before any method that uses
    ``self.collection``.
    """

    # Shared by every instance; built once when the class body is executed.
    logger = Logger('db.log', constants.LOG_DIR).get_logger()
    # Collection all status operations act on; assigned by init_collection().
    collection = None

    def __init__(self):
        """Create the client for ``constants.DB_URI`` and probe the server."""
        self.client = MongoClient(constants.DB_URI)
        self.get_connection()

    def get_connection(self):
        """Check that the server is reachable.

        Returns:
            The client when the server answered, ``None`` when server
            selection timed out (the failure is logged with its traceback).
        """
        try:
            self.logger.info('db trying to connect...')
            # MongoClient connects lazily: without an actual round trip the
            # timeout below could never be raised here.
            self.client.admin.command('ping')
            connection = self.client
            self.logger.info('db connected')
        except ServerSelectionTimeoutError as err:
            message = 'db connection Timeout - check for if this machine ip is on whitelist'
            self.logger.exception(f'{message} {err}')
            connection = None
        return connection

    def get_db(self, name):
        """Return the database called ``name`` from the client."""
        self.logger.info(f'db trying to get db: {name}')
        db = self.client[name]
        self.logger.info(f'got db: {db.name}')
        return db

    def get_collection(self, db, collection_name):
        """Return ``collection_name`` from the given database object."""
        collection = db.get_collection(collection_name)
        self.logger.info(f'got collection: {collection_name}')
        return collection

    # date format: %d-%m-%Y
    def update_status(self, date, status):
        """Set the ``status`` field of the document whose ``date`` matches."""
        self.logger.info(f'setting {date} status to: {status}')
        self.collection.update_one({'date': date}, {'$set': {'status': status}})

    def create_date(self, date):
        """Insert a new document for ``date`` with status ``'available'``."""
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.collection.insert_one({'date': date, 'status': 'available'})

    def get_dates(self):
        """Return a cursor over the documents whose status is ``'available'``.

        ``_id`` and ``status`` are projected out of the returned documents.
        """
        query = {'status': 'available'}
        res = self.collection.find(query, {'_id': 0, 'status': 0})
        # Cursor.count() was removed in PyMongo 4; count with a separate query
        # so the returned cursor is left untouched.
        self.logger.info(f'found {self.collection.count_documents(query)} dates')
        return res

    def create_collection(self):
        """Fill the collection with the documents produced by ``create_docs``."""
        docs = create_docs()
        if not docs:
            # insert_many rejects an empty document list.
            self.logger.warning('no documents to insert')
            return
        self.collection.insert_many(docs)

    def init_collection(self, db_name, collection_name):
        """Bind ``self.collection`` and populate it when it is empty.

        Returns:
            The bound collection.
        """
        self.collection = self.client[db_name].get_collection(collection_name)
        # limit=1: only emptiness matters, so stop after the first match.
        if self.collection.count_documents({}, limit=1) == 0:
            self.logger.info(f'initializing collection: {collection_name}')
            self.create_collection()
        self.logger.info(f'collection @{collection_name} initialized')
        return self.collection
import time from hcva.scraper.crawler import Crawler from hcva.utils import constants from hcva.utils.json import save_data from hcva.utils.time import call_sleep from hcva.utils.logger import Logger logger = Logger('scraper.log', constants.LOG_DIR).get_logger() BASE_URL = 'https://supreme.court.gov.il/Pages/SearchJudgments.aspx?&DateType=2&freeText=null&CaseNumber=null' def build_url(date): return f'{BASE_URL}&COpenDate={date}&CEndDate={date}' def get_frame(crawler, elem_type, string): frame = crawler.find_elem(elem_type, string) if frame is not None: return crawler.switch_frame(frame) print(f'could not switch to frame: {string}') return False def get_num_cases(crawler): case_num_loc = ['/html/body/div[2]/div/form/section/div/div'] crawler.switch_to_default_content() get_frame(crawler, 'id', 'serviceFram') for location in case_num_loc: elem = crawler.find_elem('xpath', location, delay=8) if elem is not None:
def main():
    """Entry point of the Elasticsearch 5.5.3 indexing loop.

    Builds a logger that writes ``elasticsearch.log`` under the ``logs``
    directory next to ``get_path(n=0)``, then loops forever: create an
    ``Elastic_5_5_3`` instance, call its ``start_index`` and then
    ``call_sleep`` with ``minutes=10``. The function never returns.
    """
    log_dir = get_path(n=0) + f'logs{sep}'
    log = Logger('elasticsearch.log', log_dir).get_logger()

    while True:
        # A fresh indexer is built on every pass.
        indexer = Elastic_5_5_3(log)
        indexer.start_index()
        # Pause between passes.
        call_sleep(logger=log, minutes=10)
def __init__(self, url):
    """Open a browser and load ``url``.

    A logger named after the current thread is created first, then the
    browser is obtained from ``get_browser`` and pointed at ``url``.

    Raises:
        Exception: whatever the driver raises while loading ``url``; the
            browser is quit before the error propagates so it is not left
            running.
    """
    self._logger = Logger(f'crawler_{threading.current_thread().name}.log', constants.LOG_DIR).get_logger()
    self._driver = self.get_browser()
    self._url = url
    try:
        self._driver.get(url)
    except Exception:
        # Do not leak the browser that was just started when the first
        # navigation fails.
        self._logger.exception(f'could not open start page: {url}')
        self._driver.quit()
        raise
    self._logger.info('crawler created')
class Crawler:
    """Logging wrapper around a Selenium web driver.

    Most action methods report the outcome as a boolean instead of raising:
    ``True`` when the action went through, ``False`` when one of the
    exceptions they handle was raised. Text read from the page or from an
    alert is kept in ``_text_query`` and read back with ``get_text_query``.
    """

    _driver = None  # Web Driver
    _delay = 2  # Timer for finding web element as int
    _url = None  # starting page as string
    _text_query = None  # latest text get as string
    _logger = None  # logging log class

    # elem_type accepted by find_elem -> attribute name on selenium's ``By``.
    # Kept as names so that ``By`` is only looked up when a search runs.
    _LOCATOR_ATTRS = {
        'xpath': 'XPATH',
        'id': 'ID',
        'tag': 'TAG_NAME',
        'name': 'NAME',
        'link_text': 'LINK_TEXT',
        'partial_link_text': 'PARTIAL_LINK_TEXT',
        'css': 'CSS_SELECTOR',
        'class_name': 'CLASS_NAME',
    }

    def __init__(self, url):
        """Open a browser and load ``url``.

        Raises:
            ValueError: ``constants.BROWSER_TYPE`` is not supported.
            Exception: whatever the driver raises while loading ``url``; the
                browser is quit before the error propagates.
        """
        self._logger = Logger(f'crawler_{threading.current_thread().name}.log', constants.LOG_DIR).get_logger()
        self._driver = self.get_browser()
        self._url = url
        try:
            self._driver.get(url)
        except Exception:
            # Do not leak the browser that was just started.
            self._logger.exception(f'could not open start page: {url}')
            self._driver.quit()
            raise
        self._logger.info('crawler created')

    def get_browser(self):
        """Start the browser selected by ``constants.BROWSER_TYPE``.

        The driver executable is looked up under
        ``<ROOT_DIR>/hcva/scraper/web_drivers/<os_type>`` and gets an ``.exe``
        suffix on windows. ``--headless`` is passed when
        ``constants.HEADLESS`` equals ``'true'``.

        Returns:
            The started Chrome or Firefox web driver.

        Raises:
            ValueError: the configured browser is neither ``'chrome'`` nor
                ``'firefox'``.
        """
        browser = constants.BROWSER_TYPE
        os_type = get_os_type()
        driver_dir = constants.ROOT_DIR + f'/hcva/scraper/web_drivers/{os_type}'
        exe_suffix = '.exe' if os_type == 'windows' else ''
        headless = constants.HEADLESS == 'true'

        self._logger.debug(f'attempting to open browser: {browser}')
        # NOTE(review): chrome_options / firefox_options / executable_path are
        # the legacy Selenium keyword names - confirm against the pinned
        # Selenium version before upgrading.
        if browser == 'chrome':
            options = webdriver.ChromeOptions()
            if headless:
                options.add_argument("--headless")
            return webdriver.Chrome(chrome_options=options,
                                    executable_path=driver_dir + '/chromedriver' + exe_suffix)
        if browser == 'firefox':
            options = webdriver.FirefoxOptions()
            if headless:
                options.add_argument("--headless")
            return webdriver.Firefox(firefox_options=options,
                                     executable_path=driver_dir + '/geckodriver' + exe_suffix)

        # Fail here with a clear message rather than handing back None and
        # crashing later on the first driver call.
        message = f'browser type is invalid: {browser}'
        self._logger.error(message)
        raise ValueError(message)

    def get_text_query(self, update=True):
        """Return the last stored text when ``update`` is true, else ``None``."""
        if update:
            return self._text_query
        return None

    def close(self):
        """Quit the web driver. Returns ``True``."""
        self._driver.quit()  # close the browser
        self._logger.info('Closing browser')
        return True

    def go_back(self):
        """Navigate to the previous page. Returns ``True``."""
        self._driver.back()
        self._logger.info('went to previous page')
        return True

    def refresh(self):
        """Reload the current page. Returns ``True``."""
        self._driver.refresh()
        self._logger.info('Refresh page')
        return True

    def switch_frame(self, frame):
        """Switch the driver into ``frame``. Returns ``True``."""
        self._driver.switch_to.frame(frame)
        self._logger.info('switch to frame')
        return True

    def switch_window_handle(self, index=0):
        """Switch to the window at position ``index`` of the driver's handles.

        Returns ``True``.
        """
        window = self._driver.window_handles[index]
        self._driver.switch_to.window(window)
        self._logger.info('switch window handle')
        return True

    def switch_to_default_content(self):
        """Switch the driver back to the default content. Returns ``True``."""
        self._driver.switch_to.default_content()
        self._logger.info('switch to default content')
        return True

    def get_page_source(self):
        """Return the page source of the currently loaded page."""
        page_source = self._driver.page_source
        self._logger.info('Got page Source')
        return page_source

    def _log_find_failure(self, message, with_traceback):
        """Log a failed lookup; must be called from inside an ``except`` block."""
        if with_traceback:
            self._logger.exception(message)
        else:
            self._logger.error(message)

    def find_elem(self, elem_type, string, single_element=True, driver=None, delay=2, raise_error=True):
        """Wait for an element to be present and return it.

        Args:
            elem_type: locator kind, one of ``xpath``, ``id``, ``tag``,
                ``name``, ``link_text``, ``partial_link_text``, ``css``,
                ``class_name``.
            string: the locator value.
            single_element: return one element when true, otherwise the list
                of all matching elements.
            driver: search context; defaults to the crawler's own driver.
            delay: seconds to wait for the element to be present.
            raise_error: when true a failed lookup is logged together with
                its traceback, otherwise as a plain error line. Nothing is
                raised either way.

        Returns:
            The element (or list of elements), or ``None`` when ``elem_type``
            is unknown or the lookup failed.
        """
        driver = self._driver if driver is None else driver

        by_attr = self._LOCATOR_ATTRS.get(elem_type)
        if by_attr is None:
            self._logger.info(f'find_element function do not have: {elem_type}')
            return None
        locator = (getattr(By, by_attr), string)

        try:
            WebDriverWait(driver, delay).until(EC.presence_of_element_located(locator))
            # find_element(s)(by, value) exists in Selenium 3 and 4, unlike
            # the find_element_by_* helpers.
            if single_element:
                elem = driver.find_element(*locator)
            else:
                elem = driver.find_elements(*locator)
        except TimeoutException:  # did not find elem in time
            self._log_find_failure(
                f'Did not find elem: {string}, type: {elem_type}, delay: {delay} in time', raise_error)
            return None
        except ElementNotVisibleException:
            self._log_find_failure(f'Elem is not visible: {string}, type: {elem_type}', raise_error)
            return None
        except NoSuchElementException:
            self._log_find_failure(f'No Such elem: {string}, type: {elem_type}', raise_error)
            return None

        self._logger.info(f'found elem: {string}, type: {elem_type}')
        return elem

    def hover_elem(self, driver, elem):
        """Move the mouse over ``elem``. Returns ``True`` on success, else ``False``."""
        try:
            ActionChains(driver).move_to_element(elem).perform()
            self._logger.info('elem got hovered')
            return True
        except Exception:
            self._logger.exception('Could not hover that')
            return False

    def select_elem(self, elem, option):
        """Select the entry of ``elem`` whose visible text equals ``option``.

        Returns:
            ``True`` when the option was selected; ``False`` when ``option``
            is not a string, is not offered by ``elem`` or cannot be selected.
        """
        if not isinstance(option, str):
            self._logger.error(f'option should be string and not {type(option)}')
            return False
        try:
            Select(elem).select_by_visible_text(option)
        except (ElementNotSelectableException, NoSuchElementException):
            # NoSuchElementException: no option carries that visible text.
            self._logger.error('Could not select that')
            return False
        self._logger.info('elem got selected')
        return True

    def click_elem(self, elem):
        """Click ``elem``.

        Returns:
            ``True`` when the click went through; ``False`` when ``elem`` is
            ``None`` or the click was intercepted / not interactable.
        """
        if elem is None:
            # Nothing was clicked, so this must not be reported as a success.
            self._logger.error('didnt got element to click - got None instead')
            return False
        try:
            elem.click()
        except ElementClickInterceptedException:
            self._logger.exception('Element Click Intercepted')
            return False
        except ElementNotInteractableException:
            self._logger.exception('Element Not Interactable')
            return False
        self._logger.info('element got clicked')
        return True

    def send_data_to_elem(self, elem, data, to_clear=True):
        """Type ``data`` into ``elem``, clearing it first when ``to_clear`` is true.

        Returns ``True`` on success, else ``False``.
        """
        try:
            steps = []
            if to_clear:
                elem.clear()  # clear text box
                steps.append('text box got cleared')
            elem.send_keys(data)  # type string into text box
            steps.append('element got the data')
            self._logger.info(', '.join(steps))
            return True
        except Exception:
            self._logger.exception('Could not send elem this data')
            return False

    def read_elem_text(self, elem):
        """Store the text of ``elem`` for ``get_text_query``.

        Returns ``True`` on success; on failure the stored text is reset to
        ``None`` and ``False`` is returned.
        """
        try:
            text = elem.text
            self._text_query = text
            self._logger.info(f'Got text from elem: {text}')
            return True
        except Exception:
            self._text_query = None
            self._logger.exception('Could not get elem text')
            return False

    @staticmethod
    def scroll_to_elem(elem):
        """Bring ``elem`` into view.

        Returns ``True`` when an element was given, ``False`` for ``None``.
        """
        if elem is not None:
            # Reading this property is what triggers the scroll - it is not a call.
            elem.location_once_scrolled_into_view
            return True
        return False

    def exec_script(self, driver, elem, string):
        """Run the script ``string`` on ``driver`` with ``elem`` as its argument.

        Returns ``True`` on success, ``False`` on a JavaScript error.
        """
        try:
            driver.execute_script(string, elem)
            self._logger.info('Script executed')
            return True
        except JavascriptException:
            self._logger.exception('Could not execute script')
            return False

    def alert_handle(self, driver=None, result=None):
        """Read the open alert and optionally accept or dismiss it.

        Args:
            driver: driver showing the alert; defaults to the crawler's own.
            result: ``'accept'`` or ``'dismiss'`` to close the alert; any
                other value leaves it open.

        Returns:
            ``True`` when an alert was found (its text is then available via
            ``get_text_query``), ``False`` when no alert is present.
        """
        driver = self._driver if driver is None else driver
        try:
            obj = driver.switch_to.alert  # driver focus on alert window
            text = f'alert massage says: {obj.text}'  # read before closing the alert
            if result == 'accept':
                obj.accept()
            elif result == 'dismiss':
                obj.dismiss()
            if result in ('accept', 'dismiss'):
                # Go back to the main window only once the alert is closed.
                driver.switch_to.default_content()
            self._text_query = text
            self._logger.info(f'Alert say: {text}')
            return True
        except NoAlertPresentException:
            self._logger.error('Did not find any alert')
            return False