def __init__(self, settings, no_batches, no_incoming):
    """Set up the DB worker: message-bus streams, frontier manager,
    codecs, batching slot and stat counters.

    :param settings: frontier settings object
    :param no_batches: disable generation of new spider-feed batches
    :param no_incoming: disable consumption of the spider log
    """
    bus_cls = load_object(settings.get('MESSAGE_BUS'))
    self.mb = bus_cls(settings)
    log_stream = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = log_stream.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()

    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

    # The scoring log only exists for distributed backends; otherwise the
    # strategy-worker integration is switched off.
    self.strategy_enabled = False
    if isinstance(self._backend, DistributedBackend):
        scoring_stream = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_stream.consumer()
        self.queue = self._backend.queue
        self.strategy_enabled = True

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = ('hostname' if settings.get('QUEUE_HOSTNAME_PARTITIONING')
                                     else 'fingerprint')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                     no_batches, self.strategy_enabled,
                     settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, settings, strategy_module):
    """Set up the Kafka-based scoring worker.

    :param settings: frontier settings object
    :param strategy_module: module providing a ``CrawlStrategy`` class
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    # Validate the partition id first, so a misconfigured worker fails fast
    # before opening any Kafka connections.  `isinstance` replaces the
    # non-idiomatic `== None` / `type(...) != int` comparisons (PEP 8 E721)
    # and covers the None case as well.
    partition_id = settings.get('SCORING_PARTITION_ID')
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
    self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

    self._manager = FrontierManager.from_settings(settings)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('SCORING_TOPIC')
    self.strategy = strategy_module.CrawlStrategy()
    self.backend = self._manager.backend
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
def test_blocking_middleware(self):
    """A blocking middleware must stop seeds/responses/links/errors from
    propagating to the middlewares below it, the canonical solver and the
    backend.  (Fix: loop variables renamed so they no longer shadow the
    builtin ``list``.)"""
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks'
    ]
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    fm = FrontierManager.from_settings(settings)
    fm.add_seeds([r1, r2, r3])
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(3, 5)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def __init__(self, settings, strategy_class):
    """Set up the strategy worker: spider-log consumer, scoring-log
    producer, frontier manager, codecs and the crawling strategy.

    :param settings: frontier settings object
    :param strategy_class: crawling strategy class (``from_worker`` factory)
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # `isinstance` replaces the non-idiomatic `type(...) != int` comparison
    # (PEP 8 E721) and also covers the None case.
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {}
    self.job_id = 0
    self.task = LoopingCall(self.work)
def __init__(self, settings, strategy_class):
    """Set up the strategy worker and its periodic tasks.

    :param settings: frontier settings object
    :param strategy_class: crawling strategy class (``from_worker`` factory)
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # `isinstance` replaces the non-idiomatic `type(...) != int` comparison
    # (PEP 8 E721) and also covers the None case.
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {'consumed_since_start': 0}
    self.job_id = 0
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    logger.info(
        "Strategy worker is initialized and consuming partition %d",
        partition_id)
def __init__(self, settings, strategy_class):
    """Set up the strategy worker with pluggable message-bus codecs.

    :param settings: frontier settings object
    :param strategy_class: crawling strategy class (``from_worker`` factory)
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # `isinstance` replaces the non-idiomatic `type(...) != int` comparison
    # (PEP 8 E721) and also covers the None case.
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    # Encoder/decoder classes are resolved dynamically from the configured
    # codec module path.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path+".Encoder")
    decoder_cls = load_object(codec_path+".Decoder")
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    self._encoder = encoder_cls(self._manager.request_model)

    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {
        'consumed_since_start': 0
    }
    self.job_id = 0
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    self._flush_states_task = LoopingCall(self.flush_states)
    logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
def __init__(self, settings, no_batches, no_incoming):
    """Set up the DB worker: message-bus streams, frontier manager,
    codecs and the batching slot.

    :param settings: frontier settings object
    :param no_batches: disable generation of new spider-feed batches
    :param no_incoming: disable consumption of the spider log
    """
    # Instantiate the configured message bus and attach to its streams.
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    spider_log = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    # partition_id=None: the DB worker consumes all spider-log partitions.
    self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()
    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    # The scoring log (strategy-worker integration) is only consumed for
    # distributed backends.
    if isinstance(self._backend, DistributedBackend):
        scoring_log = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_log.consumer()
        self.queue = self._backend.queue
        self.strategy_enabled = True
    else:
        self.strategy_enabled = False
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = 'fingerprint' if not settings.get(
        'QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    # The slot drives the consume/new-batch cycle.
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                     no_batches, self.strategy_enabled,
                     settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {}
def __init__(self, settings, strategy_module):
    """Set up the message-bus-based strategy worker.

    :param settings: frontier settings object
    :param strategy_module: module providing a ``CrawlingStrategy`` class
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # `isinstance` replaces the non-idiomatic `type(...) != int` comparison
    # (PEP 8 E721) and also covers the None case.
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_module.CrawlingStrategy()
    self.states = self._manager.backend.states
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
    self.task = LoopingCall(self.work)
def test_blocking_middleware(self):
    """A blocking middleware must stop seeds/responses/links/errors from
    propagating to the middlewares below it, the canonical solver and the
    backend.  (Fix: loop variables renamed so they no longer shadow the
    builtin ``list``.)"""
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                            'tests.mocks.components.FakeMiddlewareModifySeeds',
                            'tests.mocks.components.FakeMiddlewareBlocking',
                            'tests.mocks.components.FakeMiddlewareModifyResponse',
                            'tests.mocks.components.FakeMiddlewareModifyLinks']
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    fm = FrontierManager.from_settings(settings)
    fm.add_seeds([r1, r2, r3])
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0]*4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3]*3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1]*3]*3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(3, 5)] == [[0]*4]*2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0]*4
def __init__(self, settings, no_batches, no_scoring, no_incoming):
    """Set up the Kafka-based DB worker: producers/consumers, frontier
    manager, codecs and the batching slot.

    :param settings: frontier settings object
    :param no_batches: disable generation of new outgoing batches
    :param no_scoring: disable consumption of the scoring topic
    :param no_incoming: disable consumption of the incoming topic
    """
    self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    # Keyed producer partitions outgoing requests by hostname CRC32.
    self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner,
                                   codec=CODEC_SNAPPY)
    self._in_consumer = SimpleConsumer(self._kafka,
                                       settings.get('FRONTIER_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760)
    # Scoring consumer is optional; smaller buffers than the incoming one.
    if not no_scoring:
        self._scoring_consumer = SimpleConsumer(self._kafka,
                                                settings.get('FRONTIER_GROUP'),
                                                settings.get('SCORING_TOPIC'),
                                                buffer_size=262144,
                                                max_buffer_size=1048576)
    # Tracks consumption offsets of the outgoing topic by the spiders.
    self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'),
                                   settings.get('FRONTIER_GROUP'))
    self._manager = FrontierManager.from_settings(settings)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('OUTGOING_TOPIC')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    # The slot drives the consume/new-batch cycle.
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                     no_batches, no_scoring,
                     settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
    self.job_id = 0
    self.stats = {}
def strategy(self):
    """Build a testing crawling strategy wired to an in-memory stream and
    a small in-memory state cache."""
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    score_stream = MessageBusStream()
    state_cache = MemoryStates(10)
    ctx = StatesContext(state_cache)
    mgr = FrontierManager.from_settings(cfg, db_worker=False, strategy_worker=True)
    return TestingCrawlingStrategy.from_worker(mgr, None, score_stream, ctx)
def setup_frontier_manager(self, settings=None):
    """Build a FrontierManager backed entirely by fake components.

    :param settings: optional Settings; a fresh one is created when falsy.
    """
    settings = settings or Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    middlewares = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    settings.MIDDLEWARES = middlewares
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    return FrontierManager.from_settings(settings)
def __init__(self, settings):
    """Validate the converter classes declared on the subclass and build
    the frontier manager.

    Fix: the error message for the response-converter check previously
    said "must subclass RequestConverter" (copy-paste error).

    :param settings: frontier settings object
    :raises AssertionError: if converter classes are missing or of the
        wrong type.  NOTE(review): assert-based validation is stripped
        under ``python -O``; kept as-is so callers catching
        AssertionError keep working.
    """
    assert self.request_converter_class, 'request_converter_class not defined'
    assert self.response_converter_class, 'response_converter_class not defined'
    assert issubclass(self.request_converter_class, BaseRequestConverter), 'request_converter_class ' \
        'must subclass RequestConverter'
    assert issubclass(self.response_converter_class, BaseResponseConverter), 'response_converter_class ' \
        'must subclass ResponseConverter'
    # NOTE(review): the classes themselves (not instances) are stored —
    # presumably the converters are used via classmethods; confirm.
    self.request_converter = self.request_converter_class
    self.response_converter = self.response_converter_class
    self.manager = FrontierManager.from_settings(settings)
def setup_frontier_manager(self, settings=None):
    """Return a FrontierManager configured with fake backend, fake
    middlewares and a fake canonical solver.

    :param settings: optional Settings; a fresh one is created when falsy.
    """
    cfg = settings or Settings()
    cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    cfg.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                       'tests.mocks.components.FakeMiddlewareModifySeeds',
                       'tests.mocks.components.FakeMiddlewareModifyResponse',
                       'tests.mocks.components.FakeMiddlewareModifyLinks']
    cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    return FrontierManager.from_settings(cfg)
def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode):
    """Set up the strategy worker (or its add-seeds variant).

    Fixes: ``isinstance`` replaces the non-idiomatic ``type(...) != int``
    comparison (PEP 8 E721, also covers None); the redundant duplicate
    assignment of ``consumer_batch_size`` inside the non-add-seeds branch
    is removed — it was unconditionally overwritten below anyway.

    :param settings: frontier settings object
    :param strategy_class: crawling strategy class (``from_worker`` factory)
    :param strategy_args: extra arguments forwarded to the strategy
    :param is_add_seeds_mode: when True, skip spider-log consumption
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    scoring_log = mb.scoring_log()
    self.add_seeds_mode = is_add_seeds_mode
    # In add-seeds mode there is nothing to consume from the spider log.
    if not self.add_seeds_mode:
        spider_log = mb.spider_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    # Encoder/decoder classes are resolved dynamically from the configured
    # codec module path.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
    self._encoder = encoder_cls(self._manager.request_model)

    self.update_score = UpdateScoreStream(self.scoring_log_producer, self._encoder)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, strategy_args,
                                               self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {
        'consumed_since_start': 0,
        'consumed_add_seeds': 0,
        'consumed_page_crawled': 0,
        'consumed_links_extracted': 0,
        'consumed_request_error': 0,
        'dropped_links_extracted': 0,
    }
    self.job_id = 0
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    self._flush_states_task = LoopingCall(self.flush_states)
    self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
    logger.info(
        "Strategy worker is initialized and consuming partition %d",
        partition_id)
def __init__(self, settings, no_batches, no_incoming, no_scoring):
    """Set up the DB worker: message-bus streams, frontier manager,
    pluggable codecs, partitioner and the batching slot.

    :param settings: frontier settings object
    :param no_batches: disable generation of new spider-feed batches
    :param no_incoming: disable consumption of the spider log
    :param no_scoring: disable consumption of the scoring log
    """
    # Instantiate the configured message bus and attach to its streams.
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    spider_log = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    # partition_id=None: the DB worker consumes all spider-log partitions.
    self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db')
    self.spider_feed_producer = self.spider_feed.producer()
    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    # Encoder/decoder classes are resolved dynamically from the configured
    # codec module path.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._encoder = encoder_cls(self._manager.request_model)
    self._decoder = decoder_cls(self._manager.request_model,
                                self._manager.response_model)
    # The scoring log is consumed only for distributed backends and when
    # not explicitly disabled.
    if isinstance(self._backend, DistributedBackend) and not no_scoring:
        scoring_log = self.mb.scoring_log()
        self.scoring_log_consumer = scoring_log.consumer()
        self.queue = self._backend.queue
        self.strategy_disabled = False
    else:
        self.strategy_disabled = True
    self.spider_log_consumer_batch_size = settings.get(
        'SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.scoring_log_consumer_batch_size = settings.get(
        'SCORING_LOG_CONSUMER_BATCH_SIZE')
    # Back-compat shim: the old boolean setting maps to the hostname-based
    # partitioner.
    if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
        self.logger.warning(
            'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.'
        )
        settings.set(
            'SPIDER_FEED_PARTITIONER',
            'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
    self.partitioner_cls = load_object(
        settings.get('SPIDER_FEED_PARTITIONER'))
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    # The slot drives the consume/new-batch cycle.
    self.slot = Slot(self.new_batch, self.consume_incoming,
                     self.consume_scoring, no_batches,
                     self.strategy_disabled, settings.get('NEW_BATCH_DELAY'),
                     no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
    """Set up the DB worker: message bus, frontier manager, codecs and
    the Slot object that owns the processing cycle.

    :param settings: frontier settings object
    :param no_batches: disable generation of new batches
    :param no_incoming: disable consumption of the spider log
    :param no_scoring: disable consumption of the scoring log
    :param kwargs: extra keyword arguments forwarded to the Slot
    """
    bus_cls = load_object(settings.get('MESSAGE_BUS'))
    self.message_bus = bus_cls(settings)

    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self.backend = self._manager.backend

    # Resolve encoder/decoder classes from the configured codec module.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    self._encoder = load_object(codec_path + ".Encoder")(self._manager.request_model)
    self._decoder = load_object(codec_path + ".Decoder")(self._manager.request_model,
                                                         self._manager.response_model)

    slot_kwargs = {'no_batches': no_batches,
                   'no_incoming': no_incoming,
                   'no_scoring': no_scoring}
    slot_kwargs.update(kwargs)
    self.slot = Slot(self, settings, **slot_kwargs)

    self.stats = defaultdict(int)
    self.job_id = 0
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, settings, no_batches, no_scoring, no_incoming):
    """Set up the Kafka-based DB worker: producers/consumers, frontier
    manager, codecs and the batching slot.

    :param settings: frontier settings object
    :param no_batches: disable generation of new outgoing batches
    :param no_scoring: disable consumption of the scoring topic
    :param no_incoming: disable consumption of the incoming topic
    """
    group = settings.get('FRONTIER_GROUP')
    outgoing = settings.get('OUTGOING_TOPIC')

    self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    # Outgoing requests are partitioned by hostname CRC32 and compressed.
    self._producer = KeyedProducer(self._kafka,
                                   partitioner=Crc32NamePartitioner,
                                   codec=CODEC_SNAPPY)
    self._in_consumer = SimpleConsumer(self._kafka,
                                       group,
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760)
    # The scoring consumer is optional and uses smaller buffers.
    if not no_scoring:
        self._scoring_consumer = SimpleConsumer(self._kafka,
                                                group,
                                                settings.get('SCORING_TOPIC'),
                                                buffer_size=262144,
                                                max_buffer_size=1048576)
    # Tracks how far the spiders have consumed the outgoing topic.
    self._offset_fetcher = Fetcher(self._kafka, outgoing, group)

    self._manager = FrontierManager.from_settings(settings)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = outgoing
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                     no_batches, no_scoring,
                     settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
    self.job_id = 0
    self.stats = {}
def __init__(self, settings, strategy_module):
    """Set up the Kafka-based scoring worker.

    :param settings: frontier settings object
    :param strategy_module: module providing a ``CrawlingStrategy`` class
    :raises AttributeError: if SCORING_PARTITION_ID is missing or not an int
    """
    # Validate the partition id first, so a misconfigured worker fails fast
    # before opening any Kafka connections.  `isinstance` replaces the
    # non-idiomatic `== None` / `type(...) != int` comparisons (PEP 8 E721)
    # and covers the None case as well.
    partition_id = settings.get('SCORING_PARTITION_ID')
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
    self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

    self._manager = FrontierManager.from_settings(settings)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('SCORING_TOPIC')
    self.strategy = strategy_module.CrawlingStrategy()
    self.backend = self._manager.backend
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
def __init__(self, settings):
    """Build the FrontierManager this component delegates to.

    :param settings: frontier settings object passed straight through to
        :meth:`FrontierManager.from_settings`
    """
    self.manager = FrontierManager.from_settings(settings)
def setup_subject(partitions):
    """Build a RedisQueue test subject over the default frontier settings.

    :param partitions: partition spec handed to the queue
    """
    cfg = Settings(module='frontera.settings.default_settings')
    frontier = FrontierManager.from_settings(cfg)
    pool = get_pool()
    # Final True flag matches the original call's positional argument.
    return RedisQueue(frontier, pool, partitions, True)
def setup_subject(partitions):
    """Build a RedisBackend DB-worker test subject with a clean database.

    :param partitions: number of spider-feed partitions to configure
    """
    cfg = Settings(module='frontera.settings.default_settings')
    cfg.set('SPIDER_FEED_PARTITIONS', partitions)
    # Drop all tables so each test run starts from an empty store.
    cfg.set('REDIS_DROP_ALL_TABLES', True)
    frontier = FrontierManager.from_settings(cfg)
    return RedisBackend.db_worker(frontier)