def __init__(self, manager):
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    store_content = settings.get('STORE_CONTENT')
    self._encoder = encoder_cls(manager.request_model, send_body=store_content)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                  keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                  max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                  keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
    self._logger.info("Consuming from partition id %d", self.partition_id)
def test_name_error(self):
    with pytest.raises(NameError) as info:
        load_object('frontera.tests.mocks.load_objects.non_existent_object')
    assert info.value.message == ("Module 'frontera.tests.mocks.load_objects' doesn't define"
                                  " any object named 'non_existent_object'")
def __init__(self, settings, strategy_class):
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
    self.scoring_log_producer = scoring_log.producer()
    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    self._encoder = encoder_cls(self._manager.request_model)
    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {'consumed_since_start': 0}
    self.job_id = 0
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    self._flush_states_task = LoopingCall(self.flush_states)
    logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.message_bus = messagebus(settings)
    self._manager = WorkerFrontierManager.from_settings(settings, db_worker=True)
    self.backend = self._manager.backend
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._encoder = encoder_cls(self._manager.request_model)
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    slot_kwargs = {
        'no_batches': no_batches,
        'no_incoming': no_incoming,
        'no_scoring': no_scoring
    }
    slot_kwargs.update(**kwargs)
    self.slot = Slot(self, settings, **slot_kwargs)
    self.stats = defaultdict(int)
    self.job_id = 0
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, settings):
    self.topic_todo = settings.get('SPIDER_FEED_TOPIC')
    self.topic_done = settings.get('SPIDER_LOG_TOPIC')
    self.topic_scoring = settings.get('SCORING_LOG_TOPIC')
    self.spiderlog_dbw_group = settings.get('SPIDER_LOG_DBW_GROUP')
    self.spiderlog_sw_group = settings.get('SPIDER_LOG_SW_GROUP')
    self.scoringlog_dbw_group = settings.get('SCORING_LOG_DBW_GROUP')
    self.spider_feed_group = settings.get('SPIDER_FEED_GROUP')
    self.spider_partition_id = settings.get('SPIDER_PARTITION_ID')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.codec = settings.get('KAFKA_CODEC')
    self.kafka_location = settings.get('KAFKA_LOCATION')
    if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
        logger.warning('QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.')
        settings.set('SPIDER_FEED_PARTITIONER',
                     'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
    spider_log_partitions = list(range(settings.get('SPIDER_LOG_PARTITIONS')))
    spider_log_partitioner_cls = load_object(settings.get('SPIDER_LOG_PARTITIONER'))
    self.spider_log_partitioner = spider_log_partitioner_cls(spider_log_partitions)
    spider_feed_partitions = list(range(settings.get('SPIDER_FEED_PARTITIONS')))
    spider_feed_partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
    self.spider_feed_partitioner = spider_feed_partitioner_cls(spider_feed_partitions)
def test_import_error(self):
    with pytest.raises(ImportError) as info:
        load_object('frontera.non_existent_module.object')
    if six.PY2:
        assert str(info.value) == ("Error loading object 'frontera.non_existent_module.object'"
                                   ": No module named non_existent_module")
    else:
        assert str(info.value) == ("Error loading object 'frontera.non_existent_module.object'"
                                   ": No module named 'frontera.non_existent_module'")
def __init__(self, settings, no_batches, no_incoming, no_scoring):
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    spider_log = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db')
    self.spider_feed_producer = self.spider_feed.producer()
    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._encoder = encoder_cls(self._manager.request_model)
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    if isinstance(self._backend, DistributedBackend) and not no_scoring:
        scoring_log = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_log.consumer()
        self.queue = self._backend.queue
        self.strategy_disabled = False
    else:
        self.strategy_disabled = True
    self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
    if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
        self.logger.warning('QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.')
        settings.set('SPIDER_FEED_PARTITIONER',
                     'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
    self.partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                     self.strategy_disabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def _start_logger(self, klass, name, level, enabled, handlers):
    logger = klass(name=name, level=level, enabled=enabled)
    for handler in handlers:
        if isinstance(handler, six.string_types):
            handler = load_object(handler)
        logger.add_handler(handler)
    return logger
def __init__(self, settings, strategy_module):
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()
    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_module.CrawlingStrategy()
    self.states = self._manager.backend.states
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
    self.task = LoopingCall(self.work)
def __init__(self, settings, strategy_class):
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()
    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)
    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {}
    self.job_id = 0
    self.task = LoopingCall(self.work)
def setup_environment():
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
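# Illustrative sketch only: a minimal settings module that setup_environment() above could
# load via --config (the module must be importable). The setting names match what the
# function reads; the concrete values and the strategy path are assumptions for demonstration.
# file: my_frontera_settings.py
SPIDER_LOG_PARTITIONS = 2
SCORING_PARTITION_ID = 0  # may also be supplied with --partition-id
CRAWLING_STRATEGY = 'myproject.strategy.MyCrawlingStrategy'  # hypothetical path, or use --strategy
LOGGING_CONFIG = None  # falls back to logging.basicConfig(level=--log-level)

# Assumed invocation (the exact entry point name may differ):
#   python -m frontera.worker.strategy --config my_frontera_settings --log-level DEBUG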
def __init__(self, manager, pool, partitions, delete_all_keys=False):
    settings = manager.settings
    codec_path = settings.get('REDIS_BACKEND_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._encoder = encoder_cls(manager.request_model)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self._redis = RedisOperation(pool)
    self._redis_pipeline = RedisPipeline(pool)
    self._partitions = [i for i in range(0, partitions)]
    self._partitioner = Crc32NamePartitioner(self._partitions)
    self._logger = logging.getLogger("redis_backend.queue")
    if delete_all_keys:
        self._redis.flushdb()
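# Illustrative sketch only: the general idea behind a CRC32 name partitioner such as the
# one instantiated above -- hash a key (e.g. a hostname) with crc32 and map it onto one of
# the configured partitions. This is not the actual Frontera Crc32NamePartitioner code.
from zlib import crc32


class Crc32PartitionerSketch(object):
    def __init__(self, partitions):
        self.partitions = partitions

    def partition(self, key):
        # None keys fall back to the first partition; other keys map deterministically
        if key is None:
            return self.partitions[0]
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        return self.partitions[crc32(key) % len(self.partitions)]


# partitioner = Crc32PartitionerSketch(list(range(4)))
# partitioner.partition('example.com')  # -> stable partition id in [0, 3]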
def __init__(self, manager):
    self.manager = manager

    # Get settings
    settings = manager.settings
    engine = settings.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
    engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO', DEFAULT_ENGINE_ECHO)
    drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES', DEFAULT_DROP_ALL_TABLES)
    clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT', DEFAULT_CLEAR_CONTENT)
    models = settings.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

    # Create engine
    self.engine = create_engine(engine, echo=engine_echo)

    # Load models
    self.models = dict([(name, load_object(klass)) for name, klass in models.items()])

    # Drop tables if we have to
    if drop_all_tables:
        Base.metadata.drop_all(self.engine)
    Base.metadata.create_all(self.engine)

    # Create session
    self.Session = sessionmaker()
    self.Session.configure(bind=self.engine)
    self.session = self.Session()

    # Clear content if we have to
    if clear_content:
        for name, table in Base.metadata.tables.items():
            self.session.execute(table.delete())
def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
    engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
    drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES')
    clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT')
    models = settings.get('SQLALCHEMYBACKEND_MODELS')
    self.engine = create_engine(engine, echo=engine_echo)
    self.models = dict([(name, load_object(klass)) for name, klass in models.items()])
    if drop_all_tables:
        DeclarativeBase.metadata.drop_all(self.engine)
    DeclarativeBase.metadata.create_all(self.engine)
    self.session_cls = sessionmaker()
    self.session_cls.configure(bind=self.engine)
    if clear_content:
        session = self.session_cls()
        for name, table in DeclarativeBase.metadata.tables.items():
            session.execute(table.delete())
        session.close()
    self._metadata = Metadata(self.session_cls, self.models['MetadataModel'],
                              settings.get('SQLALCHEMYBACKEND_CACHE_SIZE'))
    self._states = States(self.session_cls, self.models['StateModel'],
                          settings.get('STATE_CACHE_SIZE_LIMIT'))
    self._queue = self._create_queue(settings)
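# Illustrative sketch only: the shape of the SQLALCHEMYBACKEND_MODELS setting consumed above --
# a mapping from model name to an importable class path, resolved via load_object(). The
# 'MetadataModel' and 'StateModel' keys are used by the constructor; the paths are assumptions.
SQLALCHEMYBACKEND_MODELS = {
    'MetadataModel': 'frontera.contrib.backends.sqlalchemy.models.MetadataModel',
    'StateModel': 'frontera.contrib.backends.sqlalchemy.models.StateModel',
    'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel',
}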
def __init__(self, settings, no_batches, no_incoming):
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    spider_log = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()
    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    if isinstance(self._backend, DistributedBackend):
        scoring_log = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_log.consumer()
        self.queue = self._backend.queue
        self.strategy_enabled = True
    else:
        self.strategy_enabled = False
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                     self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {}
def _load_object(self, obj_class_name, silent=False):
    obj_class = load_object(obj_class_name)
    try:
        return self._load_frontier_object(obj_class)
    except NotConfigured:
        if not silent:
            raise NotConfigured
def __init__(self, manager): self.manager = manager self.logger = logging.getLogger("hbase.backend") settings = manager.settings port = settings.get('HBASE_THRIFT_PORT') hosts = settings.get('HBASE_THRIFT_HOST') namespace = settings.get('HBASE_NAMESPACE') self._min_requests = settings.get('BC_MIN_REQUESTS') self._min_hosts = settings.get('BC_MIN_HOSTS') self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST') partitions = list(range(settings.get('SPIDER_FEED_PARTITIONS'))) partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER')) self.partitioner = partitioner_cls(partitions) host = choice(hosts) if type(hosts) in [list, tuple] else hosts kwargs = { 'host': host, 'port': int(port), 'table_prefix': namespace, 'table_prefix_separator': ':' } if settings.get('HBASE_USE_FRAMED_COMPACT'): kwargs.update({'protocol': 'compact', 'transport': 'framed'}) self.connection = Connection(**kwargs) self._metadata = None self._queue = None self._states = None
def __init__(self, settings, no_batches, no_incoming):
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    spider_log = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()
    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    if isinstance(self._backend, DistributedBackend):
        scoring_log = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_log.consumer()
        self.queue = self._backend.queue
        self.strategy_enabled = True
    else:
        self.strategy_enabled = False
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                     self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS')  # Format: ['192.168.0.1', '192.168.0.2']
    cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT')
    keyspace = settings.get('CASSANDRABACKEND_KEYSPACE')
    keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS')  # Default: true
    models = settings.get('CASSANDRABACKEND_MODELS')
    self.cluster = Cluster(cluster_ips, cluster_port)
    self.models = dict([(name, load_object(klass)) for name, klass in models.items()])
    self.session = self.cluster.connect()
    self.session.row_factory = dict_factory
    if keyspace_create:
        query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, )
        self.session.execute(query)
    self.session.set_keyspace(keyspace)
    connection.set_session(self.session)
    self._metadata = None
    self._queue = None
    self._states = None
def __init__(self, crawler):
    settings = ScrapySettingsAdapter(crawler.settings)
    self.partition_id = settings.get('SPIDER_PARTITION_ID')
    # XXX this can be improved later by reusing spider's producer
    # (crawler->engine->slot->scheduler->frontier->manager->backend->_producer)
    # but the topic is hard-coded in the current scheme, so it requires some
    # preliminary changes in Frontera itself.
    message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
    stats_log = message_bus.stats_log()
    if not stats_log:
        raise NotConfigured
    self.stats_producer = stats_log.producer()
    self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    self._stats_encoder = encoder_cls(request_model=None)  # no need to encode requests
    self._export_stats_task = None
def __init__(self, crawler):
    settings = ScrapySettingsAdapter(crawler.settings)
    self.partition_id = settings.get('SPIDER_PARTITION_ID')
    # XXX this can be improved later by reusing spider's producer
    # (crawler->engine->slot->scheduler->frontier->manager->backend->_producer)
    # but the topic is hard-coded in the current scheme, so it requires some
    # preliminary changes in Frontera itself.
    message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
    stats_log = message_bus.stats_log()
    if not stats_log:
        raise NotConfigured
    self.stats_producer = stats_log.producer()
    self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    self._stats_encoder = encoder_cls(request_model=None)  # no need to encode requests
    self._export_stats_task = None
def __init__(self, strategy_class, strategy_args, scoring_stream):
    self._scoring_stream = scoring_stream if scoring_stream else LocalUpdateScoreStream(self.backend.queue)
    self._states_context = StatesContext(self.backend.states)
    if isinstance(strategy_class, str):
        strategy_class = load_object(strategy_class)
    self._strategy = strategy_class.from_worker(self, strategy_args, self._scoring_stream, self._states_context)
def __init__(self, settings, is_add_seeds_mode):
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    scoring_log = mb.scoring_log()
    self.add_seeds_mode = is_add_seeds_mode
    if not self.add_seeds_mode:
        spider_log = mb.spider_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.scoring_log_producer = scoring_log.producer()
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    request_model = load_object(settings.get('REQUEST_MODEL'))
    response_model = load_object(settings.get('RESPONSE_MODEL'))
    self._decoder = decoder_cls(request_model, response_model)
    self._encoder = encoder_cls(request_model)
    self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder)
    manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True,
                                                  scoring_stream=self.update_score)
    self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.stats = defaultdict(int)
    self.backend = manager.backend
    self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0)
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    self._flush_states_task = LoopingCall(self.flush_states)
    self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
    logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    self._metadata = MemoryMetadata()
    self._states = MemoryStates(settings.get("STATE_CACHE_SIZE"))
    partitions = list(range(settings.get('SPIDER_FEED_PARTITIONS')))
    partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
    self._partitioner = partitioner_cls(partitions)
    self._queue = self._create_queue(settings)
    self._id = 0
def __init__(self, settings, *args, **kwargs):
    super(StatsExportMixin, self).__init__(settings, *args, **kwargs)
    message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
    stats_log = message_bus.stats_log()
    # FIXME can be removed after implementing stats_log for ZeroMQ bus
    if not stats_log:
        return
    self.stats_producer = stats_log.producer()
    self._stats_tags = self.get_stats_tags(settings, *args, **kwargs)
    self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
    self._export_stats_task = LoopingCall(self.export_stats)
def __init__(self, settings=None):
    self.settings = settings or Settings()
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    dummy_class = type('class', (object,), {})
    downloader = dummy_class()
    downloader.slots = {}
    downloader.domain_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_DOMAIN')
    downloader.ip_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_IP')
    self.engine = dummy_class()
    self.engine.downloader = downloader
    self.engine.downloader.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
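# Illustrative usage only (the class name FakeCrawler and the use of Scrapy's default
# settings are assumptions): shows what the dummy crawler object graph above exposes.
#
#   from scrapy.settings import Settings
#   crawler = FakeCrawler(Settings())
#   crawler.engine.downloader.total_concurrency   # -> CONCURRENT_REQUESTS (16 by default)
#   crawler.engine.downloader.domain_concurrency  # -> CONCURRENT_REQUESTS_PER_DOMAIN
#   crawler.stats.inc_value('custom/counter')     # default STATS_CLASS is a Scrapy StatsCollector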
def __init__(self, request_model, response_model, settings=None):
    # Settings
    self._settings = settings or Settings()

    # Logger
    self._logger = logging.getLogger("manager")

    # Log frontier manager starting
    self._logger.info('-' * 80)
    self._logger.info('Starting Frontier Manager...')

    # Load request model
    self._request_model = load_object(request_model)
    assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
        self._request_model.__name__

    # Load response model
    self._response_model = load_object(response_model)
    assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
        self._response_model.__name__
def __init__(self, request_model, response_model, settings=None):
    # Settings
    self._settings = settings or Settings()

    # Logger
    self._logger = logging.getLogger("manager")

    # Log frontier manager starting
    self._logger.info('-'*80)
    self._logger.info('Starting Frontier Manager...')

    # Load request model
    self._request_model = load_object(request_model)
    assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
        self._request_model.__name__

    # Load response model
    self._response_model = load_object(response_model)
    assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
        self._response_model.__name__
def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
    engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
    models = settings.get('SQLALCHEMYBACKEND_MODELS')
    self.engine = create_engine(engine, echo=engine_echo)
    self.models = dict([(name, load_object(klass)) for name, klass in models.items()])
    self.session_cls = sessionmaker()
    self.session_cls.configure(bind=self.engine)
    self._metadata = None
    self._queue = None
    self._states = None
def __init__(self, settings): self.logger = getLogger("messagebus.zeromq") self.context = Context() self.socket_config = SocketConfig(settings.get('ZMQ_ADDRESS'), settings.get('ZMQ_BASE_PORT')) self.spider_partition = settings.get('SPIDER_PARTITION_ID') if settings.get('QUEUE_HOSTNAME_PARTITIONING'): self.logger.warning( 'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.' ) settings.set( 'SPIDER_FEED_PARTITIONER', 'frontera.contrib.backends.partitioners.Crc32NamePartitioner') self.spider_log_partitions = [ i for i in range(settings.get('SPIDER_LOG_PARTITIONS')) ] spider_log_partitioner_cls = load_object( settings.get('SPIDER_LOG_PARTITIONER')) self.spider_log_partitioner = spider_log_partitioner_cls( self.spider_log_partitions) self.spider_feed_partitions = [ i for i in range(settings.get('SPIDER_FEED_PARTITIONS')) ] spider_feed_partitioner_cls = load_object( settings.get('SPIDER_FEED_PARTITIONER')) self.spider_feed_partitioner = spider_feed_partitioner_cls( self.spider_feed_partitions) self.spider_feed_sndhwm = int( settings.get('MAX_NEXT_REQUESTS') * len(self.spider_feed_partitions) * 1.2) self.spider_feed_rcvhwm = int(settings.get('MAX_NEXT_REQUESTS') * 2.0) self.max_next_requests = int(settings.get('MAX_NEXT_REQUESTS')) if self.socket_config.is_ipv6: self.context.zeromq.setsockopt(zmq.IPV6, True)
def __init__(self, request_model, response_model, logger, settings=None):
    # Settings
    self._settings = settings or Settings()

    # Logger
    self._logger = load_object(logger)(self._settings)
    assert isinstance(self._logger, FrontierLogger), "logger '%s' must subclass FrontierLogger" % \
        self._logger.__class__.__name__

    # Log frontier manager starting
    self.logger.manager.debug('-' * 80)
    self.logger.manager.debug('Starting Frontier Manager...')

    # Load request model
    self._request_model = load_object(request_model)
    assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
        self._request_model.__name__

    # Load response model
    self._response_model = load_object(response_model)
    assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
        self._response_model.__name__
def __init__(self, request_model, response_model, logger, settings=None):
    # Settings
    self._settings = settings or Settings()

    # Logger
    self._logger = load_object(logger)(self._settings)
    assert isinstance(self._logger, FrontierLogger), "logger '%s' must subclass FrontierLogger" % \
        self._logger.__class__.__name__

    # Log frontier manager starting
    self.logger.manager.debug('-'*80)
    self.logger.manager.debug('Starting Frontier Manager...')

    # Load request model
    self._request_model = load_object(request_model)
    assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
        self._request_model.__name__

    # Load response model
    self._response_model = load_object(response_model)
    assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
        self._response_model.__name__
def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.message_bus = messagebus(settings)
    self._manager = WorkerFrontierManager.from_settings(settings, db_worker=True)
    self.backend = self._manager.backend
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path+".Encoder")
    decoder_cls = load_object(codec_path+".Decoder")
    self._encoder = encoder_cls(self._manager.request_model)
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    slot_kwargs = {'no_batches': no_batches, 'no_incoming': no_incoming, 'no_scoring': no_scoring}
    slot_kwargs.update(**kwargs)
    self.slot = Slot(self, settings, **slot_kwargs)
    self.stats = defaultdict(int)
    self.job_id = 0
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS')
    cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT')
    drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES')
    keyspace = settings.get('CASSANDRABACKEND_KEYSPACE')
    keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS')
    models = settings.get('CASSANDRABACKEND_MODELS')
    crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID')
    generate_stats = settings.get('CASSANDRABACKEND_GENERATE_STATS')
    self.models = dict([(name, load_object(klass)) for name, klass in models.items()])

    self.cluster = Cluster(
        contact_points=cluster_ips,
        port=cluster_port,
        compression=True,
        default_retry_policy=RetryPolicy(),
        reconnection_policy=ConstantReconnectionPolicy(10, 100)
    )

    self.session = self.cluster.connect()
    self.session.row_factory = dict_factory
    self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection
    self.crawl_id = crawl_id
    self.generate_stats = generate_stats

    if keyspace_create:
        query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, )
        self.session.execute(query)
    self.session.set_keyspace(keyspace)
    connection.set_session(self.session)

    if drop_all_tables:
        for key, value in self.models.iteritems():
            drop_table(value)

    for key, value in self.models.iteritems():
        if (self.generate_stats is False and key != 'CrawlStatsModel') or self.generate_stats is True:
            sync_table(value)

    self._metadata = Metadata(self.session, self.models['MetadataModel'], self.crawl_id, self.generate_stats)
    self._states = States(self.session, self.models['StateModel'],
                          settings.get('STATE_CACHE_SIZE_LIMIT'), self.crawl_id)
    self._queue = self._create_queue(settings)
def __init__(self, manager):
    self._manager = manager
    settings = self._manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._buffer = OverusedBuffer(self._get_next_requests, manager.logger.manager.debug)
def __init__(self, manager):
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path+".Encoder")
    decoder_cls = load_object(codec_path+".Decoder")
    store_content = settings.get('STORE_CONTENT')
    self._encoder = encoder_cls(manager.request_model, send_body=store_content)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                  keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                  max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                  keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
    self._logger.info("Consuming from partition id %d", self.partition_id)
def __init__(self, settings, is_add_seeds_mode):
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    scoring_log = mb.scoring_log()
    self.add_seeds_mode = is_add_seeds_mode
    if not self.add_seeds_mode:
        spider_log = mb.spider_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.scoring_log_producer = scoring_log.producer()
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    request_model = load_object(settings.get('REQUEST_MODEL'))
    response_model = load_object(settings.get('RESPONSE_MODEL'))
    self._decoder = decoder_cls(request_model, response_model)
    self._encoder = encoder_cls(request_model)
    self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder)
    manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True,
                                                  scoring_stream=self.update_score)
    self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.stats = defaultdict(int)
    self.backend = manager.backend
    self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0)
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    self._flush_states_task = LoopingCall(self.flush_states)
    self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
    logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
def _load_backend(self, backend, db_worker, strategy_worker):
    cls = load_object(backend)
    assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__
    if issubclass(cls, DistributedBackend):
        if db_worker:
            return cls.db_worker(self)
        if strategy_worker:
            return cls.strategy_worker(self)
        raise RuntimeError("Distributed backends are meant to be used in workers.")
    else:
        assert not strategy_worker, "In order to distribute backend only DistributedBackend " \
                                    "subclasses are allowed to use."
        if hasattr(cls, 'from_manager'):
            return cls.from_manager(self)
        else:
            return cls()
def _load_backend(self, backend, db_worker, strategy_worker):
    # FIXME remove obsolete
    cls = load_object(backend)
    assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__
    if issubclass(cls, DistributedBackend):
        if db_worker:
            return cls.db_worker(self)
        if strategy_worker:
            return cls.strategy_worker(self)
        return cls.local(self)
    else:
        assert not strategy_worker, "In order to distribute backend only DistributedBackend " \
                                    "subclasses are allowed to use"
        if hasattr(cls, 'from_manager'):
            return cls.from_manager(self)
        else:
            return cls()
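# Illustrative sketch only: the shape of backend class that _load_backend() above dispatches
# to. The class name, attributes, and the import path are assumptions for demonstration;
# this is not an actual Frontera backend implementation.
from frontera.core.components import DistributedBackend  # assumed import path


class ExampleDistributedBackend(DistributedBackend):
    def __init__(self, manager, role):
        self.manager = manager
        self.role = role  # records which construction hook was used

    @classmethod
    def db_worker(cls, manager):
        # constructed when the DB worker loads the backend
        return cls(manager, role='db')

    @classmethod
    def strategy_worker(cls, manager):
        # constructed when the strategy worker loads the backend
        return cls(manager, role='strategy')

    @classmethod
    def local(cls, manager):
        # single-process fallback when neither worker flag is set
        return cls(manager, role='local')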
def __init__(self, manager):
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)
def __init__(self, manager):
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)
def test_load_variable(self):
    obj = load_object('tests.mocks.load_objects.mock_variable')
    assert obj == 'test'
parser = ArgumentParser(description="Frontera strategy worker.") parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import') parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') parser.add_argument('--partition-id', type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id) settings.set('SCORING_PARTITION_ID', partition_id) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = StrategyWorker(settings, strategy_class)
def test_load_class(self):
    obj = load_object('tests.mocks.load_objects.MockClass')
    assert obj.val == 10
def test_load_instance(self):
    obj = load_object('tests.mocks.load_objects.mock_instance')
    assert obj.val == 5
def test_load_function(self):
    obj = load_object('tests.mocks.load_objects.mock_function')
    assert obj() == 2
def test_value_error(self):
    with pytest.raises(ValueError) as info:
        load_object('frontera')
    assert str(info.value) == "Error loading object 'frontera': not a full path"
def test_name_error(self):
    with pytest.raises(NameError) as info:
        load_object('tests.mocks.load_objects.non_existent_object')
    assert str(info.value) == ("Module 'tests.mocks.load_objects' doesn't define"
                               " any object named 'non_existent_object'")
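# Illustrative sketch only: a minimal loader with the behaviour the tests above expect
# (a full dotted path is required; import and attribute failures are re-raised with
# descriptive messages). It is not the actual frontera load_object implementation.
from importlib import import_module


def load_object_sketch(path):
    try:
        dot = path.rindex('.')
    except ValueError:
        raise ValueError("Error loading object '%s': not a full path" % path)
    module_path, name = path[:dot], path[dot + 1:]
    try:
        module = import_module(module_path)
    except ImportError as e:
        raise ImportError("Error loading object '%s': %s" % (path, e))
    try:
        return getattr(module, name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module_path, name))


# load_object_sketch('tests.mocks.load_objects.mock_function')()  # -> 2, per the tests above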