Example No. 1
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
Example No. 2
    def __init__(self, settings, no_batches, no_incoming):
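        # DB worker setup: wire the message bus (spider log consumer, spider feed
        # producer) to the frontier backend, plus the scoring log when the backend
        # is distributed.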
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None,
                                                       type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'fingerprint' if not settings.get(
            'QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring,
                         no_batches, self.strategy_enabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {}
Example No. 3
    def __init__(self, settings, strategy_module):
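        # Strategy worker setup: consume one spider log partition and publish
        # score updates to the scoring log via the message bus.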
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)
Example No. 4
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KafkaProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=snappy)

        self._in_consumer = KafkaConsumer(self._kafka,
                                          settings.get('FRONTIER_GROUP'),
                                          settings.get('INCOMING_TOPIC'),
                                          buffer_size=1048576,
                                          max_buffer_size=10485760)
        if not no_scoring:
            self._scoring_consumer = KafkaConsumer(
                self._kafka,
                settings.get('FRONTIER_GROUP'),
                settings.get('SCORING_TOPIC'),
                buffer_size=262144,
                max_buffer_size=1048576)

        self._offset_fetcher = Fetcher(self._kafka,
                                       settings.get('OUTGOING_TOPIC'),
                                       settings.get('FRONTIER_GROUP'))

        self._manager = LocalFrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring, no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
Example No. 5
class FrontierWorker(object):
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KafkaProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=snappy)

        self._in_consumer = KafkaConsumer(self._kafka,
                                          settings.get('FRONTIER_GROUP'),
                                          settings.get('INCOMING_TOPIC'),
                                          buffer_size=1048576,
                                          max_buffer_size=10485760)
        if not no_scoring:
            self._scoring_consumer = KafkaConsumer(
                self._kafka,
                settings.get('FRONTIER_GROUP'),
                settings.get('SCORING_TOPIC'),
                buffer_size=262144,
                max_buffer_size=1048576)

        self._offset_fetcher = Fetcher(self._kafka,
                                       settings.get('OUTGOING_TOPIC'),
                                       settings.get('FRONTIER_GROUP'))

        self._manager = LocalFrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring, no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}

    def set_process_info(self, process_info):
        self.process_info = process_info

    def run(self):
        self.slot.schedule(on_start=True)
        reactor.run()

    def consume_incoming(self, *args, **kwargs):
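        # Drain up to consumer_batch_size messages from the incoming topic and
        # apply add_seeds / page_crawled / request_error events to the backend.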
        consumed = 0
        try:
            for m in self._in_consumer.get_messages(
                    count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError) as e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    if type == 'add_seeds':
                        _, seeds = msg
                        logger.info('Adding %i seeds', len(seeds))
                        for seed in seeds:
                            logger.debug('URL: %s', seed.url)
                        self._backend.add_seeds(seeds)
                    if type == 'page_crawled':
                        _, response, links = msg
                        logger.debug("Page crawled %s", response.url)
                        if response.meta['jid'] != self.job_id:
                            continue
                        self._backend.page_crawled(response, links)
                    if type == 'request_error':
                        _, request, error = msg
                        if request.meta['jid'] != self.job_id:
                            continue
                        logger.info("Request error %s", request.url)
                        self._backend.request_error(request, error)
                finally:
                    consumed += 1
        except OffsetOutOfRangeError as e:
            # https://github.com/mumrah/kafka-python/issues/263
            self._in_consumer.seek(0, 2)  # moving to the tail of the log
            logger.info(
                "Caught OffsetOutOfRangeError, moving to the tail of the log.")

        logger.info("Consumed %d items.", consumed)
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.slot.schedule()
        return consumed

    def consume_scoring(self, *args, **kwargs):
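        # Drain the scoring topic: collect update_score messages into one batch
        # for the backend and pick up new_job_id announcements along the way.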
        consumed = 0
        try:
            batch = {}
            for m in self._scoring_consumer.get_messages(count=1024):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError) as e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    if msg[0] == 'update_score':
                        _, fprint, score, url, schedule = msg
                        batch[fprint] = (score, url, schedule)
                    if msg[0] == 'new_job_id':
                        self.job_id = msg[1]
                finally:
                    consumed += 1
            self._backend.update_score(batch)
        except OffsetOutOfRangeError as e:
            # https://github.com/mumrah/kafka-python/issues/263
            self._scoring_consumer.seek(0, 2)  # moving to the tail of the log
            logger.info(
                "Caught OffsetOutOfRangeError, moving to the tail of the log.")

        logger.info("Consumed %d items during scoring consumption.", consumed)
        self.stats['last_consumed_scoring'] = consumed
        self.stats['last_consumption_run_scoring'] = asctime()
        self.slot.schedule()

    def new_batch(self, *args, **kwargs):
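        # Find partitions whose consumer lag is below max_next_requests and push
        # a fresh batch of encoded requests to the outgoing topic.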
        lags = self._offset_fetcher.get()
        logger.info("Got lags %s" % str(lags))

        partitions = []
        for partition, lag in lags.iteritems():
            if lag < self.max_next_requests:
                partitions.append(partition)

        logger.info("Getting new batches for partitions %s" %
                    str(",").join(map(str, partitions)))
        if not partitions:
            return 0

        count = 0
        for request in self._backend.get_next_requests(self.max_next_requests,
                                                       partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" %
                             (e, request.meta['fingerprint'], request.url))
                continue
            finally:
                count += 1

            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(
                    request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" %
                             (e, request.meta['fingerprint'], request.url))
                continue  # name is undefined when parsing fails; skip this request
            encoded_name = name.encode('utf-8', 'ignore')
            self._producer.send_messages(self.outgoing_topic, encoded_name, eo)
        logger.info("Pushed new batch of %d items", count)
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count

    def disable_new_batches(self):
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False
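
A minimal sketch of starting this worker, assuming a populated `settings` object (exposing .get() for the keys referenced in __init__) is built elsewhere in the project; the flag values below are illustrative only:

# Hypothetical wiring; `settings` is assumed to be supplied by the surrounding project.
worker = FrontierWorker(settings, no_batches=False, no_scoring=False, no_incoming=False)
worker.run()  # schedules the first slot run and starts the Twisted reactor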
Example No. 6
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0

    def work(self):
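        # One work cycle: consume a batch of spider events, prefetch states for
        # every fingerprint touched, run the crawl strategy over the batch and
        # publish the resulting score updates to the outgoing topic.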
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(
                    count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError) as e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], seeds))
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], links))
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue

                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
        except OffsetOutOfRangeError as e:
            # https://github.com/mumrah/kafka-python/issues/263
            self._in_consumer.seek(0, 2)  # moving to the tail of the log
            logger.info(
                "Caught OffsetOutOfRangeError, moving to the tail of the log.")

        self.backend.fetch_states(list(fingerprints))
        fingerprints.clear()
        results = []
        for msg in batch:
            if len(results) > 1024:
                self._producer.send_messages(self.outgoing_topic, *results)
                results = []

            type = msg[0]
            if type == 'add_seeds':
                _, seeds = msg
                for seed in seeds:
                    seed.meta['jid'] = self.job_id
                results.extend(self.on_add_seeds(seeds))
                continue

            if type == 'page_crawled':
                _, response, links = msg
                if response.meta['jid'] != self.job_id:
                    continue
                results.extend(self.on_page_crawled(response, links))
                continue

            if type == 'request_error':
                _, request, error = msg
                if request.meta['jid'] != self.job_id:
                    continue
                results.extend(self.on_request_error(request, error))
                continue
        if len(results):
            self._producer.send_messages(self.outgoing_topic, *results)

        if self.cache_flush_counter == 30:
            logger.info("Flushing states")
            self.backend.flush_states(is_clear=False)
            logger.info("Flushing states finished")
            self.cache_flush_counter = 0

        self.cache_flush_counter += 1

        if self.strategy.finished():
            logger.info("Succesfully reached the crawling goal. Exiting.")
            exit(0)

        logger.info("Consumed %d items.", consumed)
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()

    def run(self):
        while True:
            self.work()

    def on_add_seeds(self, seeds):
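        # Run the strategy on new seeds; non-None scores become update_score
        # messages with the schedule flag set to True.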
        logger.info('Adding %i seeds', len(seeds))
        seed_map = dict(
            map(lambda seed: (seed.meta['fingerprint'], seed), seeds))
        self.backend.update_states(seeds, False)
        scores = self.strategy.add_seeds(seeds)
        self.backend.update_states(seeds, True)

        output = []
        for fingerprint, score in scores.iteritems():
            seed = seed_map[fingerprint]
            logger.debug('URL: %s', seed.url)
            if score is not None:
                encoded = self._encoder.encode_update_score(
                    seed.meta['fingerprint'], score, seed.url, True)
                output.append(encoded)
        return output

    def on_page_crawled(self, response, links):
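        # Score the crawled page and its extracted links; non-None scores become
        # update_score messages.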
        logger.debug("Page crawled %s", response.url)
        objs_list = [response]
        objs_list.extend(links)
        objs = dict(map(lambda obj: (obj.meta['fingerprint'], obj), objs_list))
        self.backend.update_states(objs_list, False)
        scores = self.strategy.page_crawled(response, links)
        self.backend.update_states(objs_list, True)

        output = []
        for fingerprint, score in scores.iteritems():
            obj = objs[fingerprint]
            if score is not None:
                encoded = self._encoder.encode_update_score(
                    obj.meta['fingerprint'], score, obj.url, True)
                output.append(encoded)
        return output

    def on_request_error(self, request, error):
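        # Score a failed request; at most one update_score message is emitted,
        # with the schedule flag set to False.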
        self.backend.update_states(request, False)
        scores = self.strategy.page_error(request, error)
        self.backend.update_states(request, True)
        assert len(scores) == 1
        fingerprint, score = scores.popitem()
        if score is not None:
            encoded = self._encoder.encode_update_score(
                request.meta['fingerprint'], score, request.url, False)
            return [encoded]
        return []
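
A similar sketch for running the scoring worker, again assuming `settings` and a strategy module exposing a `CrawlStrategy` class are provided by the caller:

# Hypothetical wiring; both `settings` and `strategy_module` are assumed to exist.
worker = ScoringWorker(settings, strategy_module)
worker.run()  # loops forever, calling work() until the strategy reports it has finished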