def __init__(self, collection_set_id=None, mq_config=None):
    """Consumer that may be scoped to specific collection sets.

    :param collection_set_id: optional collection set id(s) to limit to;
        falsy means no limiting.
    :param mq_config: message queue configuration passed to BaseConsumer.
    """
    BaseConsumer.__init__(self, mq_config=mq_config)
    self.collection_set_id = collection_set_id
    # Record the effective scope in the log so operators can verify it.
    if not self.collection_set_id:
        log.info("Not limiting by collection set.")
    else:
        log.info("Limiting to collection sets %s", self.collection_set_id)
def __init__(self, working_path, mq_config=None, stream_restart_interval_secs=30 * 60, debug=False,
             use_warcprox=True, queue_warc_files_interval_secs=5 * 60, warc_rollover_secs=30 * 60,
             debug_warcprox=False, tries=3, host=None):
    """Streaming harvester consumer.

    Persists messages and starts a daemon thread that drains
    ``warc_processing_queue`` via ``_process_warc_thread``.

    :param working_path: working directory for persisted messages.
    :param mq_config: message queue configuration.
    :param stream_restart_interval_secs: seconds between stream restarts.
    :param use_warcprox: whether to capture traffic through warcprox.
    :param queue_warc_files_interval_secs: seconds between WARC-queueing passes.
    :param warc_rollover_secs: seconds before rolling over to a new WARC.
    :param tries: number of attempts for harvest operations.
    :param host: hostname override; defaults to $HOSTNAME or "localhost".
    """
    BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config, persist_messages=True)
    # Stream control state.
    self.stream_restart_interval_secs = stream_restart_interval_secs
    self.is_streaming = False
    self.routing_key = ""
    self.warc_temp_dir = None
    # NOTE(review): events are initialized to None here (a sibling variant
    # creates threading.Event() up front) — presumably set before use elsewhere.
    self.stop_harvest_seeds_event = None
    self.stop_harvest_loop_event = None
    self.restart_stream_timer = None
    self.state_store = None
    # Debug / warcprox configuration.
    self.debug = debug
    self.debug_warcprox = debug_warcprox
    self.use_warcprox = use_warcprox
    # WARC handling state.
    self.warc_processing_queue = Queue()
    self.result_filepath = None
    self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
    self.queue_warc_files_timer = None
    self.warc_rollover_secs = warc_rollover_secs
    self.tries = tries
    # Create and start warc processing thread.
    worker = threading.Thread(target=self._process_warc_thread, name="warc_processing_thread")
    worker.daemon = True
    self.warc_processing_thread = worker
    self.warc_processing_thread.start()
    self.host = host if host else os.environ.get("HOSTNAME", "localhost")
def __init__(self, working_path, raise_exception=False):
    """Test consumer that records what ``on_message`` received.

    :param working_path: working directory for persisted messages.
    :param raise_exception: if True, on_message should raise (for tests).
    """
    BaseConsumer.__init__(self, persist_messages=True, working_path=working_path)
    self.raise_exception = raise_exception
    # Fix: on_message_called was assigned False and then immediately
    # overwritten with None; keep the single False initializer, matching
    # the sibling test consumer.
    self.on_message_called = False
    # Captured message details; None until a message arrives.
    self.on_message_message = None
    self.on_message_routing_key = None
    self.on_message_file_message = None
def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path, mq_config=None,
             warc_base_path=None, limit_item_types=None, host=None):
    """Exporter-style consumer backed by the SFM REST API.

    :param api_base_url: base URL of the API; an ApiClient is built from it.
    :param warc_iter_cls: class used to iterate items from WARCs.
    :param table_cls: class used to build export tables.
    :param working_path: working directory for persisted messages.
    :param mq_config: message queue configuration.
    :param warc_base_path: WARC base path override (unit tests only).
    :param limit_item_types: optional restriction on item types.
    :param host: hostname override; defaults to $HOSTNAME or "localhost".
    """
    BaseConsumer.__init__(self, mq_config=mq_config, working_path=working_path, persist_messages=True)
    self.api_client = ApiClient(api_base_url)
    self.warc_iter_cls = warc_iter_cls
    self.table_cls = table_cls
    self.limit_item_types = limit_item_types
    # This is for unit tests only.
    self.warc_base_path = warc_base_path
    self.host = host if host else os.environ.get("HOSTNAME", "localhost")
def __init__(self, working_path, raise_exception=False, cause_persist_exception=False):
    """Test consumer that records on_message calls and can force failures.

    :param working_path: working directory for persisted messages.
    :param raise_exception: if True, on_message should raise (for tests).
    :param cause_persist_exception: if True, clobber message_filepath so
        that persisting fails (for tests).
    """
    BaseConsumer.__init__(self, persist_messages=True, working_path=working_path)
    self.raise_exception = raise_exception
    # Captured call details; reset until a message arrives.
    self.on_message_called = False
    self.on_message_message = None
    self.on_message_routing_key = None
    self.on_message_file_message = None
    # Deliberately break the persist path so tests can exercise
    # on_persist_exception handling.
    if cause_persist_exception:
        self.message_filepath = None
    self.on_persist_exception_called = False
def __init__(self, script, working_path, debug=False, mq_config=None, debug_warcprox=False, tries=3):
    """Stream server consumer that supervises harvest processes.

    :param script: harvester script path handed to HarvestSupervisor.
    :param working_path: working directory.
    :param mq_config: message queue configuration; NOTE(review): the
        supervisor below reads mq_config.host/username/password
        unconditionally, so mq_config is effectively required here.
    :param tries: number of attempts for harvest operations.
    """
    BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config)
    # Add routing keys for harvest stop messages
    # The queue will be unique to this instance of StreamServer so that it
    # will receive all stop requests
    if mq_config:
        # Fix: snapshot items() with list() — mutating a dict while
        # iterating its live view raises RuntimeError on Python 3.
        for queue, routing_keys in list(mq_config.queues.items()):
            mq_config.queues["_".join([queue, socket.gethostname()])] = [
                routing_key.replace("start", "stop") for routing_key in routing_keys
            ]
        log.debug("Queues are now %s", mq_config.queues)
    self.message = None
    self.debug = debug
    self.debug_warcprox = debug_warcprox
    self.tries = tries
    self._supervisor = HarvestSupervisor(script, mq_config.host, mq_config.username, mq_config.password,
                                         working_path, debug=debug, process_owner="sfm")
def __init__(self, script, working_path, debug=False, mq_config=None, debug_warcprox=False, tries=3):
    """Stream server consumer that supervises harvests and handles shutdown.

    Registers SIGTERM/SIGINT handlers that pause all supervised processes
    and flag the consumer to stop.

    :param script: harvester script path handed to HarvestSupervisor.
    :param working_path: working directory.
    :param mq_config: message queue configuration (required by the
        supervisor below, which reads host/username/password from it).
    :param tries: number of attempts for harvest operations.
    """
    BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config)
    # Add routing keys for harvest stop messages
    # The queue will be unique to this instance of StreamServer so that it
    # will receive all stop requests
    if mq_config:
        hostname = socket.gethostname()
        stop_queues = {}
        for queue, routing_keys in list(mq_config.queues.items()):
            stop_keys = [key.replace("start", "stop") for key in routing_keys]
            stop_queues["_".join([queue, hostname])] = stop_keys
        mq_config.queues.update(stop_queues)
        log.debug("Queues are now %s", mq_config.queues)
    self.message = None
    self.debug = debug
    self.debug_warcprox = debug_warcprox
    self.tries = tries
    self._supervisor = HarvestSupervisor(script, mq_config.host, mq_config.username, mq_config.password,
                                         working_path, debug=debug, process_owner="sfm")

    # Shutdown Supervisor.
    def shutdown(signum, frame):
        log.debug("Shutdown triggered")
        self._supervisor.pause_all()
        self.should_stop = True

    log.debug("Registering shutdown signal")
    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)
def __init__(self, working_path, mq_config=None, stream_restart_interval_secs=30 * 60, debug=False,
             use_warcprox=True, queue_warc_files_interval_secs=5 * 60, warc_rollover_secs=30 * 60,
             debug_warcprox=False, tries=3, host=None):
    """Streaming harvester consumer (pause-aware variant).

    Persists messages and starts a daemon thread that drains
    ``warc_processing_queue`` via ``_process_warc_thread``.

    :param working_path: working directory for persisted messages.
    :param mq_config: message queue configuration.
    :param stream_restart_interval_secs: seconds between stream restarts.
    :param use_warcprox: whether to capture traffic through warcprox.
    :param queue_warc_files_interval_secs: seconds between WARC-queueing passes.
    :param warc_rollover_secs: seconds before rolling over to a new WARC.
    :param tries: number of attempts for harvest operations.
    :param host: hostname override; defaults to $HOSTNAME or "localhost".
    """
    BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config, persist_messages=True)
    # Stream control state.
    self.stream_restart_interval_secs = stream_restart_interval_secs
    self.is_streaming = False
    self.routing_key = ""
    self.warc_temp_dir = None
    self.stop_harvest_seeds_event = threading.Event()
    self.stop_harvest_loop_event = threading.Event()
    self.restart_stream_timer = None
    self.state_store = None
    # Debug / warcprox configuration.
    self.debug = debug
    self.debug_warcprox = debug_warcprox
    self.use_warcprox = use_warcprox
    # WARC handling state.
    self.warc_processing_queue = Queue()
    self.result_filepath = None
    self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
    self.queue_warc_files_timer = None
    self.warc_rollover_secs = warc_rollover_secs
    self.tries = tries
    # Create and start warc processing thread.
    worker = threading.Thread(target=self._process_warc_thread, name="warc_processing_thread")
    worker.daemon = True
    self.warc_processing_thread = worker
    self.warc_processing_thread.start()
    self.host = host if host else os.environ.get("HOSTNAME", "localhost")
    # Indicates that the next shutdown should be treated as a pause of the harvest, rather than a completion.
    self.is_pause = False
def __init__(self, data_filepath, wb_collection_name="sfm", mq_config=None):
    """Wayback consumer that lazily initializes a pywb collection.

    Creates ``data_filepath`` and, if the collection directory does not yet
    exist, runs ``wb-manager init`` and creates an empty CDXJ index file.

    :param data_filepath: root directory for pywb data.
    :param wb_collection_name: name of the pywb collection.
    :param mq_config: message queue configuration passed to BaseConsumer.
    """
    BaseConsumer.__init__(self, mq_config=mq_config)
    self.data_filepath = data_filepath
    self.wb_collection_name = wb_collection_name
    self.collection_filepath = os.path.join(self.data_filepath, "collections", wb_collection_name)
    if not os.path.exists(self.data_filepath):
        log.info("Creating %s", self.data_filepath)
        os.makedirs(self.data_filepath)
    if not os.path.exists(self.collection_filepath):
        log.info("Initing %s", self.wb_collection_name)
        # Fix: use an argument list instead of shell=True with a formatted
        # string — avoids shell injection / breakage if the collection name
        # contains spaces or metacharacters.
        check_output(["wb-manager", "init", self.wb_collection_name], cwd=self.data_filepath)
        # Create empty index file
        open(os.path.join(self.collection_filepath, "indexes/index.cdxj"), 'w').close()