示例#1
0
 def __init__(self, collection_set_id=None, mq_config=None):
     """
     Set up the base consumer and remember the optional collection set filter.

     :param collection_set_id: stored on the instance; when truthy it is
         logged as the collection set being limited to, otherwise a
         "not limiting" message is logged.
     :param mq_config: passed through to BaseConsumer.__init__.
     """
     BaseConsumer.__init__(self, mq_config=mq_config)
     self.collection_set_id = collection_set_id
     # Log which of the two modes this instance was created in.
     if not self.collection_set_id:
         log.info("Not limiting by collection set.")
     else:
         log.info("Limiting to collection sets %s", self.collection_set_id)
示例#2
0
    def __init__(self, working_path, mq_config=None, stream_restart_interval_secs=30 * 60, debug=False,
                 use_warcprox=True, queue_warc_files_interval_secs=5 * 60, warc_rollover_secs=30 * 60,
                 debug_warcprox=False, tries=3, host=None):
        """
        Initialise harvester state and start the warc processing thread.

        :param working_path: passed through to BaseConsumer.__init__.
        :param mq_config: passed through to BaseConsumer.__init__.
        :param stream_restart_interval_secs: stored on the instance.
        :param debug: stored on the instance.
        :param use_warcprox: stored on the instance.
        :param queue_warc_files_interval_secs: stored on the instance.
        :param warc_rollover_secs: stored on the instance.
        :param debug_warcprox: stored on the instance.
        :param tries: stored on the instance.
        :param host: host name to record; when falsy, the HOSTNAME
            environment variable is used, falling back to "localhost".
        """
        BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config, persist_messages=True)
        self.stream_restart_interval_secs = stream_restart_interval_secs
        self.is_streaming = False
        self.routing_key = ""
        self.warc_temp_dir = None
        self.stop_harvest_seeds_event = None
        self.stop_harvest_loop_event = None
        self.restart_stream_timer = None
        self.state_store = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.use_warcprox = use_warcprox
        self.warc_processing_queue = Queue()
        self.result_filepath = None
        self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
        self.queue_warc_files_timer = None
        self.warc_rollover_secs = warc_rollover_secs
        self.tries = tries
        # Assigned before the worker thread is started so that the thread
        # never runs against an instance that is missing this attribute.
        self.host = host or os.environ.get("HOSTNAME", "localhost")

        # Create and start warc processing thread.
        self.warc_processing_thread = threading.Thread(target=self._process_warc_thread, name="warc_processing_thread")
        self.warc_processing_thread.daemon = True
        self.warc_processing_thread.start()
示例#3
0
 def __init__(self, collection_set_id=None, mq_config=None):
     """
     Initialise the consumer, optionally restricted to a collection set.

     :param collection_set_id: kept on the instance; a truthy value is
         reported in the log as the collection set limit.
     :param mq_config: handed to BaseConsumer.__init__ unchanged.
     """
     BaseConsumer.__init__(self, mq_config=mq_config)
     self.collection_set_id = collection_set_id

     # No filter configured: say so and finish.
     if not self.collection_set_id:
         log.info("Not limiting by collection set.")
         return

     log.info("Limiting to collection sets %s", self.collection_set_id)
示例#4
0
 def __init__(self, working_path, raise_exception=False):
     """
     Initialise the consumer with message persistence enabled and reset the
     attributes that record what on_message received.

     :param working_path: passed through to BaseConsumer.__init__.
     :param raise_exception: stored on the instance.
     """
     BaseConsumer.__init__(self, persist_messages=True, working_path=working_path)
     self.raise_exception = raise_exception
     # Boolean flag; the original immediately overwrote this False with None,
     # leaving the flag in a non-boolean initial state.
     self.on_message_called = False
     self.on_message_message = None
     self.on_message_routing_key = None
     self.on_message_file_message = None
示例#5
0
 def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path, mq_config=None, warc_base_path=None,
              limit_item_types=None, host=None):
     """
     Initialise the consumer with message persistence enabled and build the
     API client.

     :param api_base_url: passed to ApiClient to construct self.api_client.
     :param warc_iter_cls: stored on the instance.
     :param table_cls: stored on the instance.
     :param working_path: passed through to BaseConsumer.__init__.
     :param mq_config: passed through to BaseConsumer.__init__.
     :param warc_base_path: stored on the instance; for unit tests only.
     :param limit_item_types: stored on the instance.
     :param host: host name to record; when falsy, the HOSTNAME environment
         variable is used, falling back to "localhost".
     """
     BaseConsumer.__init__(
         self,
         mq_config=mq_config,
         working_path=working_path,
         persist_messages=True,
     )
     self.api_client = ApiClient(api_base_url)
     self.warc_iter_cls = warc_iter_cls
     self.table_cls = table_cls
     self.limit_item_types = limit_item_types
     # This is for unit tests only.
     self.warc_base_path = warc_base_path
     # An explicit host wins; otherwise use the environment, then a default.
     if host:
         self.host = host
     else:
         self.host = os.environ.get("HOSTNAME", "localhost")
 def __init__(self, working_path, raise_exception=False, cause_persist_exception=False):
     """
     Initialise the consumer with message persistence enabled and reset the
     attributes that record what the callbacks observed.

     :param working_path: passed through to BaseConsumer.__init__.
     :param raise_exception: stored on the instance.
     :param cause_persist_exception: when truthy, self.message_filepath is
         set to None after the base initialiser has run.
     """
     BaseConsumer.__init__(self, persist_messages=True, working_path=working_path)
     self.raise_exception = raise_exception

     # Nothing has been observed yet.
     self.on_message_called = False
     self.on_persist_exception_called = False
     self.on_message_message = None
     self.on_message_routing_key = None
     self.on_message_file_message = None

     if cause_persist_exception:
         # NOTE(review): presumably a missing file path makes persisting
         # fail - confirm against BaseConsumer.
         self.message_filepath = None
    def __init__(self, script, working_path, debug=False, mq_config=None, debug_warcprox=False, tries=3):
        """
        Initialise the consumer, add host-specific stop queues to the message
        queue configuration and create the harvest supervisor.

        :param script: passed to HarvestSupervisor.
        :param working_path: passed to BaseConsumer.__init__ and to
            HarvestSupervisor.
        :param debug: stored on the instance and passed to HarvestSupervisor.
        :param mq_config: passed to BaseConsumer.__init__; its queues mapping
            is extended in place and its host, username and password are
            passed to HarvestSupervisor.
        :param debug_warcprox: stored on the instance.
        :param tries: stored on the instance.
        """
        BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config)
        # Add routing keys for harvest stop messages
        # The queue will be unique to this instance of StreamServer so that it
        # will receive all stop requests
        if mq_config:
            # Iterate over a snapshot: inserting keys into a dict while
            # iterating its live items() view raises RuntimeError on Python 3.
            for queue, routing_keys in list(mq_config.queues.items()):
                stop_routing_keys = [routing_key.replace("start", "stop") for routing_key in routing_keys]
                mq_config.queues["_".join([queue, socket.gethostname()])] = stop_routing_keys
            log.debug("Queues are now %s", mq_config.queues)

        self.message = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.tries = tries
        # NOTE(review): mq_config defaults to None but is dereferenced here
        # unconditionally - confirm callers always supply one.
        self._supervisor = HarvestSupervisor(script, mq_config.host, mq_config.username, mq_config.password,
                                             working_path, debug=debug, process_owner="sfm")
    def __init__(self,
                 script,
                 working_path,
                 debug=False,
                 mq_config=None,
                 debug_warcprox=False,
                 tries=3):
        """
        Initialise the consumer, add host-specific stop queues to the message
        queue configuration, create the harvest supervisor and install
        SIGTERM/SIGINT handlers.

        :param script: passed to HarvestSupervisor.
        :param working_path: passed to BaseConsumer.__init__ and to
            HarvestSupervisor.
        :param debug: stored on the instance and passed to HarvestSupervisor.
        :param mq_config: passed to BaseConsumer.__init__; its queues mapping
            is extended in place and its host, username and password are
            passed to HarvestSupervisor.
        :param debug_warcprox: stored on the instance.
        :param tries: stored on the instance.
        """
        BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config)

        # Add routing keys for harvest stop messages
        # The queue will be unique to this instance of StreamServer so that it
        # will receive all stop requests
        if mq_config:
            # Work from a snapshot because the mapping grows inside the loop.
            existing_queues = list(mq_config.queues.items())
            for queue_name, start_keys in existing_queues:
                stop_keys = [key.replace("start", "stop") for key in start_keys]
                stop_queue_name = "_".join([queue_name, socket.gethostname()])
                mq_config.queues[stop_queue_name] = stop_keys
            log.debug("Queues are now %s", mq_config.queues)

        self.message = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.tries = tries
        # NOTE(review): mq_config defaults to None but is dereferenced here
        # unconditionally - confirm callers always supply one.
        self._supervisor = HarvestSupervisor(
            script,
            mq_config.host,
            mq_config.username,
            mq_config.password,
            working_path,
            debug=debug,
            process_owner="sfm",
        )

        # Shutdown Supervisor.
        def handle_shutdown(_signum, _frame):
            log.debug("Shutdown triggered")
            self._supervisor.pause_all()
            self.should_stop = True

        log.debug("Registering shutdown signal")

        # Same handler for both termination signals.
        for shutdown_signal in (signal.SIGTERM, signal.SIGINT):
            signal.signal(shutdown_signal, handle_shutdown)
示例#9
0
 def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path,
              mq_config=None, warc_base_path=None, limit_item_types=None,
              host=None):
     """
     Set up the base consumer with message persistence enabled, create the
     API client and store the export configuration.

     :param api_base_url: passed to ApiClient to construct self.api_client.
     :param warc_iter_cls: stored on the instance.
     :param table_cls: stored on the instance.
     :param working_path: passed through to BaseConsumer.__init__.
     :param mq_config: passed through to BaseConsumer.__init__.
     :param warc_base_path: stored on the instance; for unit tests only.
     :param limit_item_types: stored on the instance.
     :param host: host name to record; when falsy, the HOSTNAME environment
         variable is used, falling back to "localhost".
     """
     base_kwargs = {
         "mq_config": mq_config,
         "working_path": working_path,
         "persist_messages": True,
     }
     BaseConsumer.__init__(self, **base_kwargs)

     self.api_client = ApiClient(api_base_url)
     self.warc_iter_cls = warc_iter_cls
     self.table_cls = table_cls
     self.limit_item_types = limit_item_types
     # This is for unit tests only.
     self.warc_base_path = warc_base_path
     # Explicit host first, then the environment, then a fixed default.
     self.host = host if host else os.environ.get("HOSTNAME", "localhost")
示例#10
0
    def __init__(self,
                 working_path,
                 mq_config=None,
                 stream_restart_interval_secs=30 * 60,
                 debug=False,
                 use_warcprox=True,
                 queue_warc_files_interval_secs=5 * 60,
                 warc_rollover_secs=30 * 60,
                 debug_warcprox=False,
                 tries=3,
                 host=None):
        """
        Initialise harvester state and start the warc processing thread.

        :param working_path: passed through to BaseConsumer.__init__.
        :param mq_config: passed through to BaseConsumer.__init__.
        :param stream_restart_interval_secs: stored on the instance.
        :param debug: stored on the instance.
        :param use_warcprox: stored on the instance.
        :param queue_warc_files_interval_secs: stored on the instance.
        :param warc_rollover_secs: stored on the instance.
        :param debug_warcprox: stored on the instance.
        :param tries: stored on the instance.
        :param host: host name to record; when falsy, the HOSTNAME
            environment variable is used, falling back to "localhost".
        """
        BaseConsumer.__init__(self,
                              working_path=working_path,
                              mq_config=mq_config,
                              persist_messages=True)
        self.stream_restart_interval_secs = stream_restart_interval_secs
        self.is_streaming = False
        self.routing_key = ""
        self.warc_temp_dir = None
        self.stop_harvest_seeds_event = threading.Event()
        self.stop_harvest_loop_event = threading.Event()
        self.restart_stream_timer = None
        self.state_store = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.use_warcprox = use_warcprox
        self.warc_processing_queue = Queue()
        self.result_filepath = None
        self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
        self.queue_warc_files_timer = None
        self.warc_rollover_secs = warc_rollover_secs
        self.tries = tries
        # The remaining attributes are assigned before the worker thread is
        # started so that the thread never runs against an instance that is
        # missing them.
        self.host = host or os.environ.get("HOSTNAME", "localhost")
        # Indicates that the next shutdown should be treated as a pause of the harvest, rather than a completion.
        self.is_pause = False

        # Create and start warc processing thread.
        self.warc_processing_thread = threading.Thread(
            target=self._process_warc_thread, name="warc_processing_thread")
        self.warc_processing_thread.daemon = True
        self.warc_processing_thread.start()
示例#11
0
 def __init__(self,
              data_filepath,
              wb_collection_name="sfm",
              mq_config=None):
     """
     Initialise the consumer and make sure the data directory and the
     collection exist on disk.

     The data directory is created when missing. When the collection
     directory (<data_filepath>/collections/<wb_collection_name>) is missing,
     "wb-manager init" is run inside the data directory and an empty
     indexes/index.cdxj file is created in the collection directory.

     :param data_filepath: directory used as the working directory for
         wb-manager and as the parent of the collections directory.
     :param wb_collection_name: name of the collection to initialise.
     :param mq_config: passed through to BaseConsumer.__init__.
     :raises subprocess.CalledProcessError: if wb-manager exits non-zero.
     """
     BaseConsumer.__init__(self, mq_config=mq_config)
     self.data_filepath = data_filepath
     self.wb_collection_name = wb_collection_name
     self.collection_filepath = os.path.join(self.data_filepath,
                                             "collections",
                                             wb_collection_name)
     if not os.path.exists(self.data_filepath):
         log.info("Creating %s", self.data_filepath)
         os.makedirs(self.data_filepath)
     if not os.path.exists(self.collection_filepath):
         log.info("Initing %s", self.wb_collection_name)
         # Pass an argument list rather than a formatted shell string so a
         # collection name containing spaces or shell metacharacters is
         # handed to wb-manager verbatim instead of being parsed by a shell.
         check_output(["wb-manager", "init", self.wb_collection_name],
                      cwd=self.data_filepath)
         # Create empty index file
         index_filepath = os.path.join(self.collection_filepath,
                                       "indexes/index.cdxj")
         with open(index_filepath, 'w'):
             pass