예제 #1
0
    def __init__(self,
                 search_session,
                 name,
                 autostart=True,
                 dataset_type=GenericDataset,
                 default_dataset_dir="/tmp/",
                 publish_dir="/var/www/html/",
                 autoclose_search_session_on_exit=False,
                 on_finished=None):
        """
        Builds a dataset from the given search session and publishes it.

        :param search_session: session object providing crawl/fetch progress
            and pending search requests.
        :param name: dataset name; also used as its working folder name.
        :param autostart: when True, start the service thread immediately.
        :param dataset_type: dataset class to instantiate for *name*.
        :param default_dataset_dir: directory under which the dataset's
            working folder is created.
        :param publish_dir: directory the finished archive is published to.
        :param autoclose_search_session_on_exit: stop the search session when
            this service finishes.
        :param on_finished: optional callback invoked with the dataset name
            once the service has finished.
        """
        Service.__init__(self)
        self.search_session = search_session

        # os.path.join already returns a str; wrapping it in
        # "{}".format(...) was redundant.
        self.dataset = dataset_type(name, self.search_session,
                                    os.path.join(default_dataset_dir, name))

        # Progress values updated by the internal thread; guarded by lock.
        self.percent_crawled = 0
        self.percent_fetched = 0
        self.lock = Lock()
        self.autoclose_search_session_on_exit = autoclose_search_session_on_exit
        self.on_finished = on_finished
        self.name = name
        self.publish_dir = publish_dir

        if autostart:
            self.start()
예제 #2
0
    def __internal_thread__(self):
        """
        Service loop: keeps the processing queue topped up with new search
        requests from the session and processes it until stop is requested.
        """
        Service.__internal_thread__(self)
        do_sleep = 0

        while not self.__get_stop_flag__():

            with self.lock:
                if self.processing_queue.qsize() < QUEUE_BUFFER:

                    # Treat any failure to pop (exhausted/closed session) as
                    # "no request available". Narrowed from a bare except so
                    # SystemExit/KeyboardInterrupt are no longer swallowed.
                    try:
                        search_request = self.search_session.pop_new_search_request(
                        )
                    except Exception:
                        search_request = None

                    if search_request:
                        self.queue_request(search_request)

                    else:
                        # No pending work; back off a little.
                        do_sleep = 0.5

                else:
                    # Buffer is full; give the workers time to drain it.
                    do_sleep = 0.3

            # Anti-freeze heartbeat: echo the externally written ping.
            with self.ping_lock:
                self.pong = self.ping

            if do_sleep:
                sleep(do_sleep)
                do_sleep = 0

            self.process_queue()

        self.__set_status__(SERVICE_STOPPED)
예제 #3
0
    def __init__(self, autostart=True, publish_dir="/tmp/"):
        """
        Service tracking the dataset builders currently at work.

        :param autostart: when True, start the service thread immediately.
        :param publish_dir: directory where finished datasets are published.
        """
        Service.__init__(self)

        self.publish_dir = publish_dir

        # NOTE(review): self.lock is never assigned in this __init__ —
        # presumably provided by Service.__init__; verify.
        with self.lock:
            self.datasets_builders_working = dict()

        if autostart:
            self.start()
예제 #4
0
    def __init__(self, autostart=True):
        """
        Service tracking search requests through their lifecycle.

        :param autostart: when True, start the service thread immediately.
        """
        Service.__init__(self)

        self.start_time = time.time()
        self.finish_time = 0

        # Request bookkeeping, keyed by request hash.
        self.search_requests = dict()
        self.search_history = dict()
        self.search_in_progress = dict()

        if autostart:
            self.start()
예제 #5
0
    def __init__(self, remote_url, crawler_processes=1, wait_time_between_tries=1):
        """
        Initializes the crawling process for the specified URL.

        :param remote_url: URL of a dataset factory.
        :param crawler_processes: number of crawler processes to use.
        :param wait_time_between_tries: seconds to wait between retries.
        """
        Service.__init__(self)

        self.remote_url = remote_url
        self.crawler_processes = crawler_processes
        self.wait_time_between_tries = wait_time_between_tries

        # The crawler service is created lazily; only the factory proxy is
        # built up front.
        self.crawler_service = None
        self.remote_dataset_factory = RemoteDatasetFactory(remote_url)
예제 #6
0
    def __internal_thread__(self):
        """
        Service loop: refills the download queue from the database until stop
        is requested.
        """
        Service.__internal_thread__(self)
        backoff = 0

        while not self.__get_stop_flag__():

            with self.lock:
                if self.processing_queue.qsize() >= QUEUE_MIN_BUFFER:
                    # Queue is full enough; let the workers drain it.
                    backoff = 0.1
                else:
                    download_request = self.database.pop_url()

                    if download_request:
                        self.queue_download(download_request)
                    else:
                        # Nothing pending right now; back off briefly.
                        backoff = 0.1

            if backoff:
                sleep(backoff)
                backoff = 0

        self.__set_status__(SERVICE_STOPPED)
예제 #7
0
    def __init__(self,
                 search_session,
                 time_secs_between_requests=0.5,
                 processes=1):
        """
        Crawler service backed by a pool of request-processing workers.

        :param search_session: session whose search requests are digested.
        :param time_secs_between_requests: delay between consecutive requests.
        :param processes: number of worker processes in the pool.
        """
        logging.info(
            "Initializing Crawler Service for {} processes and {} secs between requests."
            .format(processes, time_secs_between_requests))

        Service.__init__(self)
        RequestPool.__init__(self, processes, time_secs_between_requests)

        self.search_session = search_session
        self.time_secs_between_requests = time_secs_between_requests
        self.processes = processes
        self.on_process_finished = None

        # Anti-freeze heartbeat: `ping` is written externally, while the
        # internal thread echoes it back into `pong`.
        self.ping = 0
        self.pong = 0
        self.ping_lock = Lock()

        assert self.search_session
        logging.info(
            "Crawler Service initialized. Listening and waiting for requests.")
예제 #8
0
    def __internal_thread__(self):
        """
        Drives the dataset build end-to-end: waits for crawling to begin,
        tracks crawl/fetch progress, then builds metadata, archives the
        dataset and publishes it.
        """
        Service.__internal_thread__(self)

        # 1. Wait for the async crawlers to actually start on the session.
        percent_crawled = 0
        percent_fetched = 0
        previous_status = self.get_status()
        start_time = time.time()

        while not self.__get_stop_flag__() and self.search_session.size(
        ) == 0 and self.search_session.get_completion_progress() == 0:
            time.sleep(1)

        # 2. Poll until both crawl and fetch report 100% (or stop requested).
        while not self.__get_stop_flag__() and (percent_crawled < 100
                                                or percent_fetched < 100):

            # Remain in the crawling phase while the crawl is incomplete or
            # the initial grace period has not elapsed; then switch to fetch.
            if percent_crawled < 100 or time.time(
            ) - start_time < DEFAULT_WAIT_TIME_SECONDS:
                if previous_status != SERVICE_CRAWLING_DATA:
                    self.__set_status__(SERVICE_CRAWLING_DATA)
                    previous_status = SERVICE_CRAWLING_DATA

                percent_crawled = self.search_session.get_completion_progress()

            else:
                if previous_status != SERVICE_FETCHING_DATA:
                    self.__set_status__(SERVICE_FETCHING_DATA)
                    previous_status = SERVICE_FETCHING_DATA

                    # Kick off the data fetch once, on the phase transition.
                    self.dataset.fetch_data(False)

                percent_fetched = self.dataset.get_percent_fetched()

            # Publish progress for external readers under the lock.
            with self.lock:
                self.percent_crawled = percent_crawled
                self.percent_fetched = percent_fetched

            time.sleep(0.05)

        # 3. Finalize only if we finished naturally (no stop requested).
        if not self.__get_stop_flag__():
            self.dataset.build_metadata()
            self.search_session.save_session(
                os.path.join(self.dataset.get_root_folder(),
                             "search_session.ses"))

            self.__set_status__(SERVICE_FILTERING_DATA)
            # TODO: Invoke a filter for the data at this stage (if wanted)
            # It may be a good idea because it hasn't been packaged yet, however it may increase the load
            # of the machine.
            # The dataset content's are stored in self.dataset
            # The dataset folder is self.dataset.get_root_folder()
            # The metadata ground truth is located in self.dataset.get_metadata_file()

            self.__set_status__(SERVICE_COMPRESSING_DATA)
            self._make_archive()
            filename = "{}.zip".format(self.dataset.get_name())

            # Move the archive into the publish directory and remove the
            # working folder.
            self.__set_status__(SERVICE_PUBLISHING_DATA)
            move("./{}".format(filename),
                 os.path.join(self.publish_dir, filename))

            rmtree(self.dataset.get_root_folder())
            self.__set_status__(SERVICE_CREATED_DATASET)

        del self.dataset

        if self.autoclose_search_session_on_exit:
            self.search_session.stop()

        if self.on_finished:
            self.on_finished(self.get_dataset_name())
예제 #9
0
 def start(self):
     """Start the service thread and resume digesting crawl requests."""
     logging.info("Crawler started digesting requests.")
     Service.start(self)
     self.resume()
예제 #10
0
 def stop(self, wait_for_finish=True):
     """
     Stop the crawler service.

     :param wait_for_finish: block until the service thread terminates.
     """
     # Leftover debug print removed; the event is already logged below.
     logging.info("Crawler stopped from digesting requests.")
     Service.stop(self, wait_for_finish)
예제 #11
0
 def __init__(self, to_folder):
     """
     Download service writing its results under *to_folder*.

     :param to_folder: folder handed to MemDatabase (presumably where
         downloaded content is stored — verify against MemDatabase).
     """
     # Base initializers first; the fetch pool is capped at 20 concurrent
     # fetches.
     FetchPool.__init__(self, pool_limit=20)
     Service.__init__(self)
     self.database = MemDatabase(to_folder)
예제 #12
0
    def stop(self, wait_for_finish=True):
        """
        Stop this service and, if one exists, its crawler service.

        :param wait_for_finish: block until the threads terminate.
        """
        Service.stop(self, wait_for_finish=wait_for_finish)

        crawler = self.crawler_service
        if crawler:
            crawler.stop(wait_for_finish=wait_for_finish)
예제 #13
0
    def __init__(self, manager):
        """
        Status-tracking service backed by a shared manager dict.

        :param manager: object providing a shared ``dict()`` (presumably a
            multiprocessing manager — verify against callers).
        """
        Service.__init__(self)

        self.manager = manager
        # Shared status table created through the manager; access is meant
        # to be guarded by global_lock.
        self.status_table = manager.dict()
        self.global_lock = Lock()