def __init__(self, settings=None, gui_mode=False, lock=None, stats=True):
    self.url_queue = queue.Queue()
    self.data_queue = queue.Queue(maxsize=25)
    self.gui_url_queue = []
    self.gui_mode = gui_mode
    self.lock = lock
    self.stats = stats
    self.list_mode_urls = None
    self.url_attempts = {}
    self.retries = 5

    self.settings = settings
    self.gf = gf(self.settings, columns=None)

    self.crawl_running = Event()
    self.crawl_completed = Event()
    self.crawl_timed_out = Event()

    self.worker_status = []
    self.db_file = None
    self.rate_limit_delay = 0
    self.current_urls_per_second = 0
    self.urls_crawled = 0
    self.urls_total = 0

    self.HEADERS = ""
    self.robots_txt = ""
    self.columns = None

    self.consumer_thread = None
    self.session = None
    self.header_only = False
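
# Architecture note (inferred from the attributes and calls in this file, not
# stated in the original source): the crawler appears to follow a
# producer/consumer pattern. Worker threads started via spawn_threads() pull
# URLs from url_queue and push extracted results onto the bounded data_queue,
# while a single consumer thread (start_consumer) drains data_queue into the
# crawl database; the crawl_running, crawl_completed and crawl_timed_out
# Events coordinate shutdown between the GUI, the workers and the consumer.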
def resume_crawl(self):
    print("Resuming crawl ...")
    self.init_crawl_headers()
    # Reinit session
    self.init_session()
    self.reset_crawl()

    db = self._connect_to_db()
    self.urls_crawled = db.get_urls_crawled()
    self.urls_total = db.get_total_urls()

    # Reset response object
    self.gf = gf(self.settings, columns=db.get_columns())

    if self.settings['MODE'] != 'List':
        response = self.request_robots_txt(
            self.settings.get('STARTING_URL'))
        if response == 'SKIP_ME':
            self.crawl_timed_out.set()
            self.crawl_running.set()
            db.close()
            return

    # Reinit URL queue
    self.add_to_url_queue(db.get_url_queue(), count=False)

    db.commit()
    db.close()

    self.start_consumer()
    Thread(target=self.spawn_threads).start()
def resume_crawl(self) -> None:
    """Resumes a crawl using the settings from the connected database."""
    print('Resuming crawl ...')
    self.reset_crawl()

    db = self._connect_to_db()
    self.urls_crawled = db.get_urls_crawled()
    self.urls_total = db.get_total_urls()

    # Create a new response object with the columns from the loaded database
    self.gf = gf(self.settings, columns=db.get_columns())

    if self.settings['MODE'] != 'List':
        response = self.request_robots_txt(
            self.settings.get('STARTING_URL'))
        if isinstance(response, str):
            self.crawl_timed_out.set()
            self.crawl_running.set()
            db.close()
            return

    # Reinit URL queue
    self.add_to_url_queue(db.get_url_queue(), count=False)
    db.close()

    self.start_consumer()
    Thread(target=self.spawn_threads).start()
def start_crawl(self):
    print("Crawl started")
    self.init_crawl_headers()
    self.init_session()

    # Set speed limit
    if int(self.settings.get("URLS_PER_SECOND", 0)) > 0:
        self.parallel_requests_limit = (
            1 / int(self.settings["URLS_PER_SECOND"])) * int(
                self.settings["THREADS"])

    db = self._connect_to_db()
    db.create()

    # Reset response object
    self.gf = gf(self.settings, columns=None)
    self.columns = self.gf.all_items = db.get_columns()

    if self.settings["MODE"] == "Spider":
        self.settings['STARTING_URL'] = self.gf.url_components_to_str(
            self.gf.parse_url(self.settings['STARTING_URL']))
        self.settings["ROOT_DOMAIN"] = self.gf.get_domain(
            self.settings['STARTING_URL'])
        response = self.crawl_url(self.settings['STARTING_URL'])

        # Check if we are dealing with a reachable host
        if response == 'SKIP_ME':
            self.crawl_timed_out.set()
            self.crawl_running.set()
            db.close()
            return

        data = self.response_to_data(response)
        self.add_to_data_queue(data)
        self.request_robots_txt(data['url'])

    elif self.settings["MODE"] == "List":
        if len(self.list_mode_urls) > 0:
            self.add_to_url_queue(self.list_mode_urls)
            db.insert_new_urls(self.list_mode_urls)
        else:
            print("ERROR: No urls to list crawl found!")

    db.commit()
    db.close()

    self.start_consumer()
    Thread(target=self.spawn_threads).start()
def start_crawl(self) -> None:
    """Starts a new crawl using the config from self.settings."""
    print('Crawl started')
    self.init_crawl_headers()
    self.init_session()

    # Set speed limit
    if int(self.settings.get('URLS_PER_SECOND', 0)) > 0:
        self.parallel_requests_limit = (
            1 / int(self.settings['URLS_PER_SECOND'])) * int(
                self.settings['THREADS'])

    db = self._connect_to_db()
    db.create()
    db.insert_config(self.settings)

    # Reset response object
    self.gf = gf(self.settings, columns=None)
    self.columns = self.gf.all_items = db.get_columns()

    if self.settings['MODE'] == 'Spider':
        self.settings['ROOT_DOMAIN'] = self.gf.get_domain(
            self.settings['STARTING_URL'])
        response = self.crawl_url(self.settings['STARTING_URL'])

        # Check if we are dealing with a reachable host
        if isinstance(response, str):
            self.crawl_timed_out.set()
            self.crawl_running.set()
            db.close()
            return

        self.request_robots_txt(response.url)
        data = self.response_to_data(response)
        self.add_to_data_queue(data)

    elif self.settings['MODE'] == 'List':
        if len(self.list_mode_urls) > 0:
            self.add_to_url_queue(self.list_mode_urls)
            db.insert_new_urls(self.list_mode_urls)
        else:
            print('ERROR: No urls to list crawl found!')

    db.close()

    self.start_consumer()
    Thread(target=self.spawn_threads).start()
def reset_crawl(self):
    # Reset queue
    if self.settings['MODE'] != 'List':
        self.data_queue = queue.Queue(maxsize=25)
        self.url_queue = queue.Queue()
        self.gui_url_queue = []

    self.url_attempts = {}
    self.gf = gf(self.settings, columns=None)

    self.crawl_running.clear()
    self.crawl_completed.clear()
    self.crawl_timed_out.clear()

    self.urls_crawled = 0
    self.urls_total = 0
def reset_crawl(self) -> None:
    """Reset crawl to default state in preparation for a new crawl."""
    # Reset queue
    if self.settings['MODE'] != 'List':
        self.data_queue = queue.Queue(maxsize=25)
        self.url_queue = queue.Queue()
        self.gui_url_queue = []

    self.url_attempts = {}
    self.init_crawl_headers()
    self.init_session()
    self.active_workers = 0
    self.gf = gf(self.settings, columns=None)

    self.crawl_running.clear()
    self.crawl_completed.clear()
    self.crawl_timed_out.clear()

    self.urls_crawled = 0
    self.urls_total = 0
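
# Usage sketch (illustrative only, not part of the original source). The class
# name used here is hypothetical, and only the settings keys seen above
# (MODE, STARTING_URL, THREADS, URLS_PER_SECOND) are taken from the code;
# any other key would be an assumption.
#
#     settings = {
#         'MODE': 'Spider',
#         'STARTING_URL': 'https://example.com/',
#         'THREADS': 5,
#         'URLS_PER_SECOND': 0,  # 0 disables the speed limit
#     }
#     crawler = Crawler(settings=settings)  # hypothetical class name
#     crawler.start_crawl()                 # begin a fresh crawl
#     ...
#     crawler.resume_crawl()                # later: continue from the saved database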