def export_data_json(self, filename='posts_content.json', overwrite=False, indent_level=3, sort_keys=False):
    """Write the scraped post content to a JSON file.

    Args:
        filename: Destination JSON file name.
        overwrite: Replace an existing file when True.
        indent_level: Indentation width passed to the JSON writer.
        sort_keys: Sort dictionary keys in the output when True.

    Logs an error (and writes a structured error entry) instead of
    raising when no content has been scraped yet.
    """
    if self.posts_content is None:
        # Nothing scraped yet -- record the failure and notify the user.
        error_log = {
            'error_type': 'ValueError',
            'message': 'No data to export'
        }
        Logger.write_messages_json(error_log)  # Log Error
        Logger.error('Export failed, Check log file')
        return
    Writer.dict_to_json(json_filename=filename,
                        content=self.posts_content,
                        overwrite=overwrite,
                        indent_level=indent_level,
                        sort_keys=sort_keys)
def __get_data__(self):
    """Scrape the content of every post URL listed in ``self.metadata``.

    Populates ``self.posts_content`` (reset to an empty dict first) by
    calling ``self.__get_post_content__`` for each URL, topic by topic,
    with progress logging per post.

    Returns:
        None. Also returns None early (after logging) when no metadata
        has been collected yet.
    """
    if self.metadata is None:
        error_log = {
            'error_type': 'ValueError',
            # Fixed typo: was 'Not urls ...'; now consistent with the
            # 'No urls to export' message used by export_metadata_csv.
            'message': 'No urls to iterate through'
        }
        Logger.write_messages_json(error_log)  # Log Error
        Logger.error(initialization_error_msg)
        return None
    self.posts_content = dict()
    for topic, metadata in self.metadata.items():
        urls = metadata['url']
        n_post = len(urls)
        Logger.info(f'Begin Scraping : {topic}')
        # enumerate instead of range(len(...)): idiomatic and avoids
        # repeated indexing into the url list.
        for i, url in enumerate(urls, start=1):
            Logger.info_r(f'scraped content : {i}/{n_post}')
            self.__get_post_content__(url=url)
        Logger.info(f'End Scraping : {topic}')
        Logger.set_line(length=50)
def scrape_content_from_file(self, metadata_filename='posts_metadata.json', export_json=True, export_csv=True, export_overwrite=True, timeout_export=False, set_quit=True):
    """Load previously exported post metadata from a JSON file and scrape
    the content of each listed post.

    Args:
        metadata_filename: JSON file produced by a prior metadata export.
        export_json: Export scraped content as JSON when True.
        export_csv: Export scraped content as CSV when True.
        export_overwrite: Overwrite existing export files when True.
        timeout_export: Stored on the instance; consulted by the
            timeout-export path elsewhere in the class.
        set_quit: Shut the driver down when finished (or on error).
    """
    try:
        self.metadata = Reader.json_to_dict(metadata_filename)
        if not hasattr(self, 'posts_content'):
            self.posts_content = dict()
        total_posts = self.get_posts_count()
        Logger.info('No. of posts :', str(total_posts))
        if total_posts > 0:
            # Remember the export preferences so interrupted runs can
            # still flush their partial results.
            self.timeout_export = timeout_export
            self.export_json = export_json
            self.export_csv = export_csv
            self.__get_data__()
            if export_json:
                self.export_data_json(filename='posts_content.json',
                                      overwrite=export_overwrite,
                                      indent_level=3,
                                      sort_keys=False)
            if export_csv:
                self.export_data_csv(filename='posts_content.csv',
                                     overwrite=export_overwrite)
    except (WebDriverException, ScraperException) as exc:
        Logger.error(exc)  # Log error
        if set_quit:
            self.quit()
    else:
        if set_quit:
            self.quit()
def run(self, scrape_content=False, export_metadata_json=True, export_metadata_csv=True, export_data_json=True, export_data_csv=True, export_overwrite=True, set_quit=True):
    """Execute the scraping pipeline end to end.

    Collects post metadata, optionally exports it (JSON/CSV), then --
    when ``scrape_content`` is True -- scrapes each post's content and
    optionally exports that as well.

    Args:
        scrape_content: Also scrape post bodies, not just metadata.
        export_metadata_json / export_metadata_csv: Metadata export toggles.
        export_data_json / export_data_csv: Content export toggles
            (only consulted when ``scrape_content`` is True).
        export_overwrite: Overwrite existing export files when True.
        set_quit: Shut the driver down when finished (or on error).
    """
    try:
        self.__get_posts_metadata__()
        Logger.info('No. of posts :', str(self.get_posts_count()))
        if export_metadata_json:
            self.export_metadata_json(filename='posts_metadata.json',
                                      overwrite=export_overwrite,
                                      indent_level=3,
                                      sort_keys=False)
        if export_metadata_csv:
            self.export_metadata_csv(filename='posts_metadata.csv',
                                     overwrite=export_overwrite)
        if scrape_content:
            self.__get_data__()
            if export_data_json:
                self.export_data_json(filename='posts_content.json',
                                      overwrite=export_overwrite,
                                      indent_level=3,
                                      sort_keys=False)
            if export_data_csv:
                self.export_data_csv(filename='posts_content.csv',
                                     overwrite=export_overwrite)
    except (WebDriverException, ScraperException) as exc:
        Logger.error(exc)  # Log error
        if set_quit:
            self.quit()
    else:
        if set_quit:
            self.quit()
def export_data_csv(self, filename='posts_content.csv', overwrite=False):
    """Write the scraped post content to a CSV file (via pandas).

    Args:
        filename: Destination CSV file name.
        overwrite: Replace an existing file when True.

    Logs an error (and writes a structured error entry) instead of
    raising when no content has been scraped yet.
    """
    if self.posts_content is None:
        # Nothing scraped yet -- record the failure and notify the user.
        error_log = {
            'error_type': 'ValueError',
            'message': 'No data to export'
        }
        Logger.write_messages_json(error_log)  # Log Error
        Logger.error('Export failed, Check log file')
        return
    Writer.dict_to_csv(csv_filename=filename,
                       content=self.posts_content,
                       overwrite=overwrite,
                       use_pandas=True)
def init_model(self, set_quit=True):
    """Initialise the scraping model and report the number of topics found.

    Args:
        set_quit: Shut the driver down after a failure, or after a
            successful initialisation.
    """
    try:
        self.__init__model__()
        Logger.info('No. of topics :', str(len(self.topics_urls)))
        Logger.set_line(length=50)
    except (WebDriverException, ScraperException) as exc:
        Logger.error(exc)  # Log error
        if set_quit:
            self.quit()
    else:
        if set_quit:
            self.quit()
def __init_web_driver__(self):
    """Locate the browser driver executable and start the Selenium driver.

    Dispatches on ``self.browser`` ('chrome' or 'firefox'); any other
    value is logged as an error (previously it was silently ignored).
    Sets ``self.driver_path`` and ``self.driver`` as side effects.
    """
    self.__init_driver_options__()
    if self.browser == 'chrome':
        self.driver = self.__build_driver__('/chromedriver$', webdriver.Chrome, 'Chrome')
    elif self.browser == 'firefox':
        self.driver = self.__build_driver__('/geckodriver$', webdriver.Firefox, 'Firefox')
    else:
        # Was a bare `pass` with a '# Log Error' TODO -- surface the
        # misconfiguration instead of failing later with no driver.
        Logger.error(initialization_error_msg
                     + 'Unsupported browser : ' + str(self.browser))

def __build_driver__(self, pattern, driver_cls, browser_name):
    """Find the driver binary matching *pattern* and return a started driver.

    Factored out of the duplicated chrome/firefox branches. Guards the
    empty-result case: the original indexed ``locate_file(...)[0]``
    unconditionally, so an empty match list raised IndexError before the
    ``is None`` check could ever run.
    """
    matches = self.__os_process__.locate_file(pattern=pattern,
                                              params='-i --regexp',
                                              updatedb=self.__updatedb__)
    self.driver_path = matches[0] if matches else None
    if self.driver_path is None:
        Logger.error(initialization_error_msg
                     + "Can't Find " + browser_name + " Driver Executable File")
    # NOTE(review): on a missing binary the original also proceeded to
    # construct the driver with a None path; behavior preserved.
    return driver_cls(options=self.options, executable_path=self.driver_path)
def export_metadata_csv(self, filename='posts_urls.csv', overwrite=False):
    """Flatten the per-topic metadata into columns and write them as CSV.

    The column set is taken from the first topic's metadata keys, plus a
    'topic' column repeating each topic's name once per URL.

    Args:
        filename: Destination CSV file name.
        overwrite: Replace an existing file when True.

    Logs an error (and writes a structured error entry) instead of
    raising when no metadata has been collected yet.
    """
    if self.metadata is None:
        error_log = {
            'error_type': 'ValueError',
            'message': 'No urls to export'
        }
        Logger.write_messages_json(error_log)  # Log Error
        Logger.error('Export failed, Check log file')
        return
    # Seed the columns from the first topic's field names.
    first_topic = next(iter(self.metadata))
    columns = {field: [] for field in self.metadata[first_topic].keys()}
    columns.setdefault('topic', [])
    for topic, fields in self.metadata.items():
        # One 'topic' entry per URL so every row carries its topic name.
        columns['topic'].extend([topic] * len(fields['url']))
        for field, values in fields.items():
            columns[field].extend(values)
    Writer.dict_to_csv(csv_filename=filename,
                       content=columns,
                       overwrite=overwrite,
                       use_pandas=True)
def get(self, url):
    """Navigate the driver to *url*, retrying on page-load timeouts.

    Retries up to ``self.reload_page_count`` times; when the limit is
    reached the user is prompted interactively to either retry with new
    timeout settings, give up, or abort. On success, records the page's
    scroll height in ``self.scroll_height``.
    """
    self.driver.set_page_load_timeout(time_to_wait=self.time_to_wait)
    if 'script_timeout' in self.kwargs.keys():
        self.driver.set_script_timeout(
            time_to_wait=self.kwargs['script_timeout'])
    else:
        # No explicit script timeout configured -- use a near-zero default.
        self.driver.set_script_timeout(0.001)
    for i in range(self.reload_page_count):
        try:
            self.driver.get(url=url)
            break  # page loaded -- stop retrying
        except TimeoutException as error:
            if i < self.reload_page_count - 1:
                # Attempts remain: report and let the loop reload the page.
                Logger.fail(
                    str(i + 1) + ': timeout::page has been reloaded')
                Logger.set_line(length=60)
            else:
                # Retry limit hit: ask the user what to do next.
                Logger.fail(
                    str(i + 1) + ': timeout::page reload Limit has been exceed\n'
                    '\tdo you want to try again - [y/n]: ', end='')
                ok = input()
                Logger.set_line(length=60)
                if ok.lower() == 'y':
                    # Re-prompt for fresh settings, then retry recursively.
                    self.time_to_wait = float(input('time to wait :'))
                    self.reload_page_count = int(input('reload count :'))
                    self.get(url)
                elif ok.lower() == 'n':
                    # Give up: flush any partial results, then log the error.
                    # NOTE(review): three leading underscores here -- verify
                    # this matches the timeout-export helper's actual name.
                    self.___timeout_export__()
                    Logger.error(error)
                else:
                    # Any other answer: treat as an abort.
                    self.___timeout_export__()
                    Logger.fail('Abort')
                    Logger.error(error)
    self.scroll_height = self.driver.execute_script(
        "return document.body.scrollHeight")