def __init__(self, search_session, name, autostart=True, dataset_type=GenericDataset,
             default_dataset_dir="/tmp/", publish_dir="/var/www/html/",
             autoclose_search_session_on_exit=False, on_finished=None):
    Service.__init__(self)
    self.search_session = search_session
    self.dataset = dataset_type(name, self.search_session,
                                os.path.join(default_dataset_dir, name))
    self.percent_crawled = 0
    self.percent_fetched = 0
    self.lock = Lock()
    self.autoclose_search_session_on_exit = autoclose_search_session_on_exit
    self.on_finished = on_finished
    self.name = name
    self.publish_dir = publish_dir

    if autostart:
        self.start()
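# Hedged usage sketch (assumption, not part of the original module): instantiating the
# dataset builder defined above. "DatasetBuilder" and "session" are illustrative names;
# only the constructor parameters shown in __init__ are relied upon.
#
#   builder = DatasetBuilder(session, "my-dataset",
#                            publish_dir="/var/www/html/",
#                            autoclose_search_session_on_exit=True,
#                            on_finished=lambda dataset_name: print(dataset_name, "published"))
#   # autostart=True (the default) already called start(), so no explicit start() is needed.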
def __internal_thread__(self):
    Service.__internal_thread__(self)
    do_sleep = 0

    while not self.__get_stop_flag__():
        with self.lock:
            # Keep the processing queue topped up without overfilling it.
            if self.processing_queue.qsize() < QUEUE_BUFFER:
                try:
                    search_request = self.search_session.pop_new_search_request()
                except Exception:
                    search_request = None

                if search_request:
                    self.queue_request(search_request)
                else:
                    do_sleep = 0.5
            else:
                do_sleep = 0.3

        # Anti-freeze heartbeat: echo the externally set ping back as pong.
        with self.ping_lock:
            self.pong = self.ping

        if do_sleep:
            sleep(do_sleep)
            do_sleep = 0

        self.process_queue()

    self.__set_status__(SERVICE_STOPPED)
def __init__(self, autostart=True, publish_dir="/tmp/"):
    Service.__init__(self)
    self.publish_dir = publish_dir

    with self.lock:
        self.datasets_builders_working = {}

    if autostart:
        self.start()
def __init__(self, autostart=True):
    Service.__init__(self)
    self.start_time = time.time()
    self.search_requests = {}  # hash: request
    self.search_history = {}
    self.search_in_progress = {}
    self.finish_time = 0

    if autostart:
        self.start()
def __init__(self, remote_url, crawler_processes=1, wait_time_between_tries=1):
    """
    Initializes the crawling process for the specified URL.
    :param remote_url: URL of a dataset factory.
    :param crawler_processes: number of crawler processes to run against the factory.
    :param wait_time_between_tries: seconds to wait between successive attempts.
    """
    Service.__init__(self)
    self.remote_url = remote_url
    self.crawler_processes = crawler_processes
    self.wait_time_between_tries = wait_time_between_tries
    self.crawler_service = None
    self.remote_dataset_factory = RemoteDatasetFactory(remote_url)
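# Hedged usage sketch (assumption, not part of the original module): constructing and
# stopping the remote-crawler service initialized above. The class name "RemoteCrawler"
# and the factory URL are illustrative; only the parameters shown in __init__ and the
# start()/stop() methods inherited from Service are relied upon.
#
#   crawler = RemoteCrawler("http://localhost/dataset-factory/",
#                           crawler_processes=2, wait_time_between_tries=1)
#   crawler.start()
#   ...
#   crawler.stop(wait_for_finish=True)  # also stops self.crawler_service, see stop() below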
def __internal_thread__(self):
    Service.__internal_thread__(self)
    do_sleep = 0

    while not self.__get_stop_flag__():
        with self.lock:
            if self.processing_queue.qsize() < QUEUE_MIN_BUFFER:
                download_request = self.database.pop_url()

                if download_request:
                    self.queue_download(download_request)
                else:
                    do_sleep = 0.1
            else:
                do_sleep = 0.1

        if do_sleep:
            sleep(do_sleep)
            do_sleep = 0

    self.__set_status__(SERVICE_STOPPED)
def __init__(self, search_session, time_secs_between_requests=0.5, processes=1):
    logging.info("Initializing Crawler Service for {} processes and {} secs between requests.".format(
        processes, time_secs_between_requests))
    Service.__init__(self)
    RequestPool.__init__(self, processes, time_secs_between_requests)
    self.time_secs_between_requests = time_secs_between_requests
    self.processes = processes
    self.search_session = search_session
    self.on_process_finished = None

    # Anti-freeze system: ping is set externally, while pong is echoed back internally.
    self.ping = 0
    self.pong = 0
    self.ping_lock = Lock()

    assert self.search_session
    logging.info("Crawler Service initialized. Listening and waiting for requests.")
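# Hedged sketch (assumption, not original code): how an external watchdog could use the
# ping/pong anti-freeze fields initialized above. The function name `is_crawler_alive`
# and the grace period are illustrative. __internal_thread__ copies `ping` into `pong`
# on every loop iteration, so a `pong` that never catches up indicates a frozen worker.
from time import sleep

def is_crawler_alive(crawler, grace_seconds=2.0):
    with crawler.ping_lock:
        crawler.ping += 1
        expected = crawler.ping

    sleep(grace_seconds)  # give the internal loop a chance to echo the ping

    with crawler.ping_lock:
        return crawler.pong == expected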
def __internal_thread__(self):
    Service.__internal_thread__(self)

    # 1. We wait for the async crawlers to finish the session.
    percent_crawled = 0
    percent_fetched = 0
    previous_status = self.get_status()
    start_time = time.time()

    while not self.__get_stop_flag__() and self.search_session.size() == 0 \
            and self.search_session.get_completion_progress() == 0:
        time.sleep(1)

    while not self.__get_stop_flag__() and (percent_crawled < 100 or percent_fetched < 100):
        if percent_crawled < 100 or time.time() - start_time < DEFAULT_WAIT_TIME_SECONDS:
            if previous_status != SERVICE_CRAWLING_DATA:
                self.__set_status__(SERVICE_CRAWLING_DATA)
                previous_status = SERVICE_CRAWLING_DATA

            percent_crawled = self.search_session.get_completion_progress()
        else:
            if previous_status != SERVICE_FETCHING_DATA:
                self.__set_status__(SERVICE_FETCHING_DATA)
                previous_status = SERVICE_FETCHING_DATA

            self.dataset.fetch_data(False)
            percent_fetched = self.dataset.get_percent_fetched()

        with self.lock:
            self.percent_crawled = percent_crawled
            self.percent_fetched = percent_fetched

        time.sleep(0.05)

    if not self.__get_stop_flag__():
        # 2. Build the metadata and persist the search session alongside the dataset.
        self.dataset.build_metadata()
        self.search_session.save_session(
            os.path.join(self.dataset.get_root_folder(), "search_session.ses"))

        self.__set_status__(SERVICE_FILTERING_DATA)
        # TODO: Invoke a filter for the data at this stage (if wanted).
        # It may be a good idea because the data hasn't been packaged yet; however, it may
        # increase the load on the machine.
        # The dataset's contents are stored in self.dataset.
        # The dataset folder is self.dataset.get_root_folder().
        # The metadata ground truth is located in self.dataset.get_metadata_file().

        # 3. Compress and publish the dataset, then clean up the working folder.
        self.__set_status__(SERVICE_COMPRESSING_DATA)
        self._make_archive()

        filename = "{}.zip".format(self.dataset.get_name())

        self.__set_status__(SERVICE_PUBLISHING_DATA)
        move("./{}".format(filename), os.path.join(self.publish_dir, filename))
        rmtree(self.dataset.get_root_folder())

        self.__set_status__(SERVICE_CREATED_DATASET)
        del self.dataset

    if self.autoclose_search_session_on_exit:
        self.search_session.stop()

    if self.on_finished:
        self.on_finished(self.get_dataset_name())
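# Hedged sketch for the filtering TODO above (assumption, not original code): a minimal
# stand-alone filter that walks the dataset folder and drops zero-byte files before the
# dataset is compressed. The name `filter_dataset_folder` is illustrative; it only relies
# on the folder path returned by self.dataset.get_root_folder().
import os

def filter_dataset_folder(root_folder):
    removed = 0

    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            path = os.path.join(dirpath, filename)

            if os.path.getsize(path) == 0:  # drop obviously broken downloads
                os.remove(path)
                removed += 1

    return removed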
def start(self):
    logging.info("Crawler started digesting requests.")
    Service.start(self)
    self.resume()
def stop(self, wait_for_finish=True):
    print("Stop of crawler service requested")
    logging.info("Crawler stopped from digesting requests.")
    Service.stop(self, wait_for_finish)
def __init__(self, to_folder):
    FetchPool.__init__(self, pool_limit=20)
    Service.__init__(self)
    self.database = MemDatabase(to_folder)
def stop(self, wait_for_finish=True):
    Service.stop(self, wait_for_finish=wait_for_finish)

    if self.crawler_service:
        self.crawler_service.stop(wait_for_finish=wait_for_finish)
def __init__(self, manager):
    Service.__init__(self)
    self.global_lock = Lock()
    self.status_table = manager.dict()
    self.manager = manager