def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0
def __init__(self, settings, no_batches, no_incoming): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db') self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) if isinstance(self._backend, DistributedBackend): scoring_log = self.mb.scoring_log() self.scoring_log_consumer = scoring_log.consumer() self.queue = self._backend.queue self.strategy_enabled = True else: self.strategy_enabled = False self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.spider_feed_partitioning = 'fingerprint' if not settings.get( 'QUEUE_HOSTNAME_PARTITIONING') else 'hostname' self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming) self.job_id = 0 self.stats = {}
def __init__(self, settings, strategy_module): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_module.CrawlingStrategy() self.states = self._manager.backend.states self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 self.task = LoopingCall(self.work)
def __init__(self, settings, no_batches, no_incoming): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db') self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) if isinstance(self._backend, DistributedBackend): scoring_log = self.mb.scoring_log() self.scoring_log_consumer = scoring_log.consumer() self.queue = self._backend.queue self.strategy_enabled = True else: self.strategy_enabled = False self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming) self.job_id = 0 self.stats = {}
def __init__(self, settings, no_batches, no_scoring, no_incoming): self._kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=CODEC_SNAPPY) self._in_consumer = SimpleConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760) if not no_scoring: self._scoring_consumer = SimpleConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('SCORING_TOPIC'), buffer_size=262144, max_buffer_size=1048576) self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP')) self._manager = FrontierManager.from_settings(settings) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('OUTGOING_TOPIC') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0), no_incoming) self.job_id = 0 self.stats = {}
class ScoringWorker(object): def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlingStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 def work(self): consumed = 0 batch = [] fingerprints = set() try: for m in self._in_consumer.get_messages(count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update(map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue raise TypeError('Unknown message type %s' % type) finally: consumed += 1
class ScoringWorker(object): def __init__(self, settings, strategy_module): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_module.CrawlingStrategy() self.states = self._manager.backend.states self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 self.task = LoopingCall(self.work) def work(self): consumed = 0 batch = [] fingerprints = set() for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update(map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue if type == 'offset': continue raise TypeError('Unknown message type %s' % type) finally:
def __init__(self, settings, no_batches, no_scoring, no_incoming): self._kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = KafkaProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=snappy) self._in_consumer = KafkaConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760) if not no_scoring: self._scoring_consumer = KafkaConsumer( self._kafka, settings.get('FRONTIER_GROUP'), settings.get('SCORING_TOPIC'), buffer_size=262144, max_buffer_size=1048576) self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP')) self._manager = LocalFrontierManager.from_settings(settings) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('OUTGOING_TOPIC') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0), no_incoming) self.job_id = 0 self.stats = {}
def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlingStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0
class FrontierWorker(object): def __init__(self, settings, no_batches, no_scoring, no_incoming): self._kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=CODEC_SNAPPY) self._in_consumer = SimpleConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760) if not no_scoring: self._scoring_consumer = SimpleConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('SCORING_TOPIC'), buffer_size=262144, max_buffer_size=1048576) self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP')) self._manager = FrontierManager.from_settings(settings) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('OUTGOING_TOPIC') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0), no_incoming) self.job_id = 0 self.stats = {} def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) reactor.run() def consume_incoming(self, *args, **kwargs): consumed = 0 try: for m in self._in_consumer.get_messages(count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: ', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.info("Request error %s", request.url) self._backend.request_error(request, error) finally: consumed += 1
class FrontierWorker(object): def __init__(self, settings, no_batches, no_incoming): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db') self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) if isinstance(self._backend, DistributedBackend): scoring_log = self.mb.scoring_log() self.scoring_log_consumer = scoring_log.consumer() self.queue = self._backend.queue self.strategy_enabled = True else: self.strategy_enabled = False self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.spider_feed_partitioning = 'fingerprint' if not settings.get( 'QUEUE_HOSTNAME_PARTITIONING') else 'hostname' self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming) self.job_id = 0 self.stats = {} def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) reactor.addSystemEventTrigger('before', 'shutdown', self.stop) reactor.run() def stop(self): logger.info("Stopping frontier manager.") self._manager.stop() def disable_new_batches(self): self.slot.disable_new_batches = True def enable_new_batches(self): self.slot.disable_new_batches = False def consume_incoming(self, *args, **kwargs): consumed = 0 for m in self.spider_log_consumer.get_messages( timeout=1.0, count=self.consumer_batch_size): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: ', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.info("Request error %s", request.url) self._backend.request_error(request, error) if type == 'offset': _, partition_id, offset = msg try: producer_offset = self.spider_feed_producer.get_offset( partition_id) except KeyError: continue else: lag = producer_offset - offset if lag < 0: # non-sense in general, happens when SW is restarted and not synced yet with Spiders. continue if lag < self.max_next_requests or offset == 0: self.spider_feed.mark_ready(partition_id) else: self.spider_feed.mark_busy(partition_id) finally:
class FrontierWorker(object): def __init__(self, settings, no_batches, no_scoring, no_incoming): self._kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = KafkaProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=snappy) self._in_consumer = KafkaConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760) if not no_scoring: self._scoring_consumer = KafkaConsumer( self._kafka, settings.get('FRONTIER_GROUP'), settings.get('SCORING_TOPIC'), buffer_size=262144, max_buffer_size=1048576) self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP')) self._manager = LocalFrontierManager.from_settings(settings) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('OUTGOING_TOPIC') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0), no_incoming) self.job_id = 0 self.stats = {} def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) reactor.run() def consume_incoming(self, *args, **kwargs): consumed = 0 try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError) as e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: ', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.info("Request error %s", request.url) self._backend.request_error(request, error) finally: consumed += 1 except OffsetOutOfRangeError as e: # https://github.com/mumrah/kafka-python/issues/263 self._in_consumer.seek(0, 2) # moving to the tail of the log logger.info( "Caught OffsetOutOfRangeError, moving to the tail of the log.") logger.info("Consumed %d items.", consumed) self.stats['last_consumed'] = consumed self.stats['last_consumption_run'] = asctime() self.slot.schedule() return consumed def consume_scoring(self, *args, **kwargs): consumed = 0 try: batch = {} for m in self._scoring_consumer.get_messages(count=1024): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError) as e: logger.error("Decoding error: %s", e) continue else: if msg[0] == 'update_score': _, fprint, score, url, schedule = msg batch[fprint] = (score, url, schedule) if msg[0] == 'new_job_id': self.job_id = msg[1] finally: consumed += 1 self._backend.update_score(batch) except OffsetOutOfRangeError as e: # https://github.com/mumrah/kafka-python/issues/263 self._scoring_consumer.seek(0, 2) # moving to the tail of the log logger.info( "Caught OffsetOutOfRangeError, moving to the tail of the log.") logger.info("Consumed %d items during scoring consumption.", consumed) self.stats['last_consumed_scoring'] = consumed self.stats['last_consumption_run_scoring'] = asctime() self.slot.schedule() def new_batch(self, *args, **kwargs): lags = self._offset_fetcher.get() logger.info("Got lags %s" % str(lags)) partitions = [] for partition, lag in lags.iteritems(): if lag < self.max_next_requests: partitions.append(partition) logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions))) if not partitions: return 0 count = 0 for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): try: request.meta['jid'] = self.job_id eo = self._encoder.encode_request(request) except Exception as e: logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, request.meta['fingerprint'], request.url)) continue finally: count += 1 try: netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast( request.url) except Exception as e: logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'], request.url)) encoded_name = name.encode('utf-8', 'ignore') self._producer.send_messages(self.outgoing_topic, encoded_name, eo) logger.info("Pushed new batch of %d items", count) self.stats['last_batch_size'] = count self.stats.setdefault('batches_after_start', 0) self.stats['batches_after_start'] += 1 self.stats['last_batch_generated'] = asctime() return count def disable_new_batches(self): self.slot.disable_new_batches = True def enable_new_batches(self): self.slot.disable_new_batches = False
class FrontierWorker(object): def __init__(self, settings, no_batches, no_incoming): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db') self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) if isinstance(self._backend, DistributedBackend): scoring_log = self.mb.scoring_log() self.scoring_log_consumer = scoring_log.consumer() self.queue = self._backend.queue self.strategy_enabled = True else: self.strategy_enabled = False self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming) self.job_id = 0 self.stats = {} def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) reactor.addSystemEventTrigger('before', 'shutdown', self.stop) reactor.run() def stop(self): logger.info("Stopping frontier manager.") self._manager.stop() def disable_new_batches(self): self.slot.disable_new_batches = True def enable_new_batches(self): self.slot.disable_new_batches = False def consume_incoming(self, *args, **kwargs): consumed = 0 for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.consumer_batch_size): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: ', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.info("Request error %s", request.url) self._backend.request_error(request, error) if type == 'offset': _, partition_id, offset = msg try: producer_offset = self.spider_feed_producer.get_offset(partition_id) except KeyError: continue else: lag = producer_offset - offset if lag < 0: # non-sense in general, happens when SW is restarted and not synced yet with Spiders. continue if lag < self.max_next_requests or offset == 0: self.spider_feed.mark_ready(partition_id) else: self.spider_feed.mark_busy(partition_id) finally:
class ScoringWorker(object): def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 def work(self): consumed = 0 batch = [] fingerprints = set() try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue raise TypeError('Unknown message type %s' % type) finally: consumed += 1
class ScoringWorker(object): def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 def work(self): consumed = 0 batch = [] fingerprints = set() try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError) as e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue raise TypeError('Unknown message type %s' % type) finally: consumed += 1 except OffsetOutOfRangeError as e: # https://github.com/mumrah/kafka-python/issues/263 self._in_consumer.seek(0, 2) # moving to the tail of the log logger.info( "Caught OffsetOutOfRangeError, moving to the tail of the log.") self.backend.fetch_states(list(fingerprints)) fingerprints.clear() results = [] for msg in batch: if len(results) > 1024: self._producer.send_messages(self.outgoing_topic, *results) results = [] type = msg[0] if type == 'add_seeds': _, seeds = msg for seed in seeds: seed.meta['jid'] = self.job_id results.extend(self.on_add_seeds(seeds)) continue if type == 'page_crawled': _, response, links = msg if response.meta['jid'] != self.job_id: continue results.extend(self.on_page_crawled(response, links)) continue if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue results.extend(self.on_request_error(request, error)) continue if len(results): self._producer.send_messages(self.outgoing_topic, *results) if self.cache_flush_counter == 30: logger.info("Flushing states") self.backend.flush_states(is_clear=False) logger.info("Flushing states finished") self.cache_flush_counter = 0 self.cache_flush_counter += 1 if self.strategy.finished(): logger.info("Succesfully reached the crawling goal. Exiting.") exit(0) logger.info("Consumed %d items.", consumed) self.stats['last_consumed'] = consumed self.stats['last_consumption_run'] = asctime() def run(self): while True: self.work() def on_add_seeds(self, seeds): logger.info('Adding %i seeds', len(seeds)) seed_map = dict( map(lambda seed: (seed.meta['fingerprint'], seed), seeds)) self.backend.update_states(seeds, False) scores = self.strategy.add_seeds(seeds) self.backend.update_states(seeds, True) output = [] for fingerprint, score in scores.iteritems(): seed = seed_map[fingerprint] logger.debug('URL: %s', seed.url) if score is not None: encoded = self._encoder.encode_update_score( seed.meta['fingerprint'], score, seed.url, True) output.append(encoded) return output def on_page_crawled(self, response, links): logger.debug("Page crawled %s", response.url) objs_list = [response] objs_list.extend(links) objs = dict(map(lambda obj: (obj.meta['fingerprint'], obj), objs_list)) self.backend.update_states(objs_list, False) scores = self.strategy.page_crawled(response, links) self.backend.update_states(objs_list, True) output = [] for fingerprint, score in scores.iteritems(): obj = objs[fingerprint] if score is not None: encoded = self._encoder.encode_update_score( obj.meta['fingerprint'], score, obj.url, True) output.append(encoded) return output def on_request_error(self, request, error): self.backend.update_states(request, False) scores = self.strategy.page_error(request, error) self.backend.update_states(request, True) assert len(scores) == 1 fingerprint, score = scores.popitem() if score is not None: encoded = self._encoder.encode_update_score( request.meta['fingerprint'], score, request.url, False) return [encoded] return []
class ScoringWorker(object): def __init__(self, settings, strategy_module): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_module.CrawlingStrategy() self.states = self._manager.backend.states self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 self.task = LoopingCall(self.work) def work(self): consumed = 0 batch = [] fingerprints = set() for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue if type == 'offset': continue raise TypeError('Unknown message type %s' % type) finally:
class FrontierWorker(object): def __init__(self, settings, no_batches, no_scoring, no_incoming): self._kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=CODEC_SNAPPY) self._in_consumer = SimpleConsumer(self._kafka, settings.get('FRONTIER_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760) if not no_scoring: self._scoring_consumer = SimpleConsumer( self._kafka, settings.get('FRONTIER_GROUP'), settings.get('SCORING_TOPIC'), buffer_size=262144, max_buffer_size=1048576) self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP')) self._manager = FrontierManager.from_settings(settings) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('OUTGOING_TOPIC') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0), no_incoming) self.job_id = 0 self.stats = {} def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) reactor.run() def consume_incoming(self, *args, **kwargs): consumed = 0 try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: ', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.info("Request error %s", request.url) self._backend.request_error(request, error) finally: consumed += 1