Example #1
    def __init__(self,
                 connection,
                 partitions,
                 table_name,
                 drop=False,
                 use_snappy=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            schema = {'f': {'max_versions': 1}}
            if use_snappy:
                schema['f']['compression'] = 'SNAPPY'
            self.connection.create_table(self.table_name, schema)

        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
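
A minimal usage sketch for the constructor above, assuming an HBase Thrift server reachable through happybase; the host, port, table name, and partition count are illustrative assumptions, not project defaults:

import happybase

connection = happybase.Connection(host='localhost', port=9090)
queue = HBaseQueue(connection, partitions=4, table_name='queue',
                   drop=False, use_snappy=True)
# With use_snappy=True the 'f' column family is created with SNAPPY
# compression; the HBase cluster must have the Snappy codec installed.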
Example #2
    def __init__(self, connection, partitions, table_name, drop=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(
                self.table_name,
                {'f': {
                    'max_versions': 1,
                    'block_cache_enabled': 1
                }})

        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Example #3
    def __init__(self, settings, strategy_module):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)
Example #4
    def __init__(self, settings, strategy_class):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.update_score = UpdateScoreStream(self._encoder,
                                              self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager,
                                                   self.update_score,
                                                   self.states_context)
        self.states = self._manager.backend.states
        self.stats = {'consumed_since_start': 0}
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        logger.info(
            "Strategy worker is initialized and consuming partition %d",
            partition_id)
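
A hedged sketch of the settings this constructor reads; the message bus path and the values below are illustrative assumptions, not project defaults:

from frontera.settings import Settings

settings = Settings()
# Must be an int, otherwise the constructor raises AttributeError.
settings.set('SCORING_PARTITION_ID', 0)
# Dotted path resolved via load_object(); any compatible message bus works here.
settings.set('MESSAGE_BUS', 'frontera.contrib.messagebus.zeromq.MessageBus')
settings.set('CONSUMER_BATCH_SIZE', 512)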
Example #5
File: db.py Project: thodison/frontera
    def __init__(self, settings, no_batches, no_incoming):
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)
Example #6
File: db.py Project: lopuhin/frontera
    def __init__(self, settings, no_batches, no_incoming):
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)
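
For reference, a hedged sketch of the settings keys this constructor consults; all values are illustrative assumptions:

MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus'
SPIDER_LOG_CONSUMER_BATCH_SIZE = 512
SCORING_LOG_CONSUMER_BATCH_SIZE = 512
QUEUE_HOSTNAME_PARTITIONING = False   # False -> 'fingerprint' partitioning, True -> 'hostname'
MAX_NEXT_REQUESTS = 256               # read as an attribute: settings.MAX_NEXT_REQUESTS
NEW_BATCH_DELAY = 30.0                # delay passed to the Slot (assumed to be seconds)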
Example #7
    def __init__(self, manager):
        self._manager = manager
        settings = manager.settings

        # Kafka connection parameters
        self._server = settings.get('KAFKA_LOCATION')
        self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
        self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
        self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._partition_id = settings.get('SPIDER_PARTITION_ID')

        # Kafka setup
        self._conn = KafkaClient(self._server)
        self._prod = None
        self._cons = None

        logger = getLogger("kafka")
        handler = StreamHandler()
        logger.addHandler(handler)

        self._connect_consumer()
        self._connect_producer()

        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
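
The defaults in the constructor above imply a settings layout like the following sketch; KAFKA_LOCATION and SPIDER_PARTITION_ID have no defaults and must be set, and all values are illustrative:

KAFKA_LOCATION = 'localhost:9092'   # Kafka broker address for KafkaClient
OUTGOING_TOPIC = 'frontier-todo'    # topic the spiders read new batches from
INCOMING_TOPIC = 'frontier-done'    # topic the spiders write crawl results to
FRONTIER_GROUP = 'scrapy-crawler'   # Kafka consumer group
KAFKA_GET_TIMEOUT = 5.0             # seconds to wait when polling for messages
SPIDER_PARTITION_ID = 0             # partition this spider consumes
STORE_CONTENT = False               # when True, the encoder also sends response bodies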
Example #8
    def __init__(self, connection, partitions, table_name, drop=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

        class DumbResponse:
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Example #9
class HBaseQueue(Queue):

    GET_RETRIES = 3

    def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        schema = {'f': {'max_versions': 1}}
        if use_snappy:
            schema['f']['compression'] = 'SNAPPY'
        if self.table_name not in tables:
            self.connection.create_table(self.table_name, schema)

        class DumbResponse:
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        pass

    def schedule(self, batch):
        to_schedule = dict()
        now = int(time())
        for fprint, score, request, schedule in batch:
            if schedule:
                if b'domain' not in request.meta:    # TODO: this has to be done always by DomainMiddleware,
                    # so I propose to require DomainMiddleware in HBaseBackend and remove this code
                    _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                    if not hostname:
                        self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
                    request.meta[b'domain'] = {b'name': hostname}  # bytes key, matching the b'name' lookup in _schedule()
                timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now
                to_schedule.setdefault(timestamp, []).append((request, score))
        for timestamp, batch in six.iteritems(to_schedule):
            self._schedule(batch, timestamp)

    def _schedule(self, batch, timestamp):
        """
        Row - a portion of the queue for one partition id, created at some point in time
        Row Key - partition id + score interval + random_str
        Column Qualifier - discrete score (first three digits after the dot, e.g. 0.001_0.002, 0.002_0.003, ...)
        Value - QueueCell msgpack blob

        Scores are mapped from 0.0 to 1.0; the score intervals are
          [0.01-0.02)
          [0.02-0.03)
          [0.03-0.04)
          ...
          [0.99-1.00]
        random_str - the time when links were scheduled for retrieval, in microseconds

        :param batch: iterable of Request objects
        :return:
        """
        def get_interval(score, resolution):
            if score < 0.0 or score > 1.0:
                raise OverflowError

            i = int(score / resolution)
            if i % 10 == 0 and i > 0:
                i = i - 1  # last interval is inclusive from right
            return (i * resolution, (i + 1) * resolution)

        random_str = int(time() * 1E+6)
        data = dict()
        for request, score in batch:
            domain = request.meta[b'domain']
            fingerprint = request.meta[b'fingerprint']
            if type(domain) == dict:
                partition_id = self.partitioner.partition(domain[b'name'], self.partitions)
                host_crc32 = get_crc32(domain[b'name'])
            elif type(domain) == int:
                partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            for rk, tuples in six.iteritems(data):
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)

                final = dict()
                packer = Packer()
                for column, items in six.iteritems(obj):
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                final[b'f:t'] = str(timestamp)
                b.put(rk, final)

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Tries to get a new batch from the priority queue. It makes up to self.GET_RETRIES attempts, trying to satisfy
        all the given constraints; each retry scans deeper into the queue. Rows read into a batch are removed from the queue.

        :param max_n_requests: maximum number of requests
        :param partition_id: partition id to get batch from
        :param min_requests: minimum number of requests
        :param min_hosts: minimum number of hosts
        :param max_requests_per_host: maximum number of requests per host
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        min_requests = kwargs.pop('min_requests')
        min_hosts = kwargs.pop('min_hosts', None)
        max_requests_per_host = kwargs.pop('max_requests_per_host', None)
        assert(max_n_requests > min_requests)
        table = self.connection.table(self.table_name)

        meta_map = {}
        queue = {}
        limit = min_requests
        tries = 0
        count = 0
        prefix = to_bytes('%d_' % partition_id)
        # now_ts = int(time())
        # TODO: figure out how to use filter here, Thrift filter above causes full scan
        # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts)
        while tries < self.GET_RETRIES:
            tries += 1
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                              tries, limit, count, len(queue.keys()))
            meta_map.clear()
            queue.clear()
            count = 0
            for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix):  # filter=filter
                for cq, buf in six.iteritems(data):
                    if cq == b'f:t':
                        continue
                    stream = BytesIO(buf)
                    unpacker = Unpacker(stream)
                    for item in unpacker:
                        fprint, host_crc32, _, _ = item
                        if host_crc32 not in queue:
                            queue[host_crc32] = []
                        if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
                            continue
                        queue[host_crc32].append(fprint)
                        count += 1

                        if fprint not in meta_map:
                            meta_map[fprint] = []
                        meta_map[fprint].append((rk, item))
                if count > max_n_requests:
                    break

            if min_hosts is not None and len(queue.keys()) < min_hosts:
                continue

            if count < min_requests:
                continue
            break

        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count)

        # For every fingerprint, collect its row keys, then return all fingerprints from those rows
        fprint_map = {}
        for fprint, meta_list in six.iteritems(meta_map):
            for rk, _ in meta_list:
                fprint_map.setdefault(rk, []).append(fprint)

        results = []
        trash_can = set()

        for _, fprints in six.iteritems(queue):
            for fprint in fprints:
                for rk, _ in meta_map[fprint]:
                    if rk in trash_can:
                        continue
                    for rk_fprint in fprint_map[rk]:
                        _, item = meta_map[rk_fprint][0]
                        _, _, encoded, score = item
                        request = self.decoder.decode_request(encoded)
                        request.meta[b'score'] = score
                        results.append(request)
                    trash_can.add(rk)

        with table.batch(transaction=True) as b:
            for rk in trash_can:
                b.delete(rk)
        self.logger.debug("%d row keys removed", len(trash_can))
        return results

    def count(self):
        raise NotImplementedError
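
A small worked sketch (not from the source) of the row-key scheme described in the _schedule() docstring: a request score is flipped for HBase's lexicographic sort, then the coarse 0.01-wide interval goes into the row key and the fine 0.001-wide interval into the column qualifier. Partition id and score are illustrative:

from time import time

def get_interval(score, resolution):
    # same helper as inside _schedule() above
    if score < 0.0 or score > 1.0:
        raise OverflowError
    i = int(score / resolution)
    if i % 10 == 0 and i > 0:
        i = i - 1  # last interval is inclusive from the right
    return (i * resolution, (i + 1) * resolution)

partition_id = 0
score = 1 - 0.75                 # lexicographic flip, as in _schedule()
random_str = int(time() * 1E+6)  # scheduling time in microseconds
rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
print(rk)      # e.g. "0_0.25_0.26_1500000000000000"
print(column)  # "f:0.249_0.250" - the i % 10 boundary rule shifts exact boundaries down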
Example #10
class HBaseQueue(Queue):

    GET_RETRIES = 3

    def __init__(self, connection, partitions, table_name, drop=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

        class DumbResponse:
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        pass

    def schedule(self, batch):
        to_schedule = dict()
        now = int(time())
        for fprint, score, request, schedule in batch:
            if schedule:
                if b'domain' not in request.meta:    # TODO: this has to be done always by DomainMiddleware,
                    # so I propose to require DomainMiddleware in HBaseBackend and remove this code
                    _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                    if not hostname:
                        self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
                    request.meta[b'domain'] = {b'name': hostname}  # bytes key, matching the b'name' lookup in _schedule()
                timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now
                to_schedule.setdefault(timestamp, []).append((request, score))
        for timestamp, batch in six.iteritems(to_schedule):
            self._schedule(batch, timestamp)

    def _schedule(self, batch, timestamp):
        """
        Row - a portion of the queue for one partition id, created at some point in time
        Row Key - partition id + score interval + random_str
        Column Qualifier - discrete score (first three digits after the dot, e.g. 0.001_0.002, 0.002_0.003, ...)
        Value - QueueCell msgpack blob

        Scores are mapped from 0.0 to 1.0; the score intervals are
          [0.01-0.02)
          [0.02-0.03)
          [0.03-0.04)
          ...
          [0.99-1.00]
        random_str - the time when links were scheduled for retrieval, in microseconds

        :param batch: iterable of Request objects
        :return:
        """
        def get_interval(score, resolution):
            if score < 0.0 or score > 1.0:
                raise OverflowError

            i = int(score / resolution)
            if i % 10 == 0 and i > 0:
                i = i - 1  # last interval is inclusive from right
            return (i * resolution, (i + 1) * resolution)

        random_str = int(time() * 1E+6)
        data = dict()
        for request, score in batch:
            domain = request.meta[b'domain']
            fingerprint = request.meta[b'fingerprint']
            if type(domain) == dict:
                partition_id = self.partitioner.partition(domain[b'name'], self.partitions)
                host_crc32 = get_crc32(domain[b'name'])
            elif type(domain) == int:
                partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            for rk, tuples in six.iteritems(data):
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)

                final = dict()
                packer = Packer()
                for column, items in six.iteritems(obj):
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                final[b'f:t'] = str(timestamp)
                b.put(rk, final)

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Tries to get a new batch from the priority queue. It makes up to self.GET_RETRIES attempts, trying to satisfy
        all the given constraints; each retry scans deeper into the queue. Rows read into a batch are removed from the queue.

        :param max_n_requests: maximum number of requests
        :param partition_id: partition id to get batch from
        :param min_requests: minimum number of requests
        :param min_hosts: minimum number of hosts
        :param max_requests_per_host: maximum number of requests per host
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        min_requests = kwargs.pop('min_requests')
        min_hosts = kwargs.pop('min_hosts')
        max_requests_per_host = kwargs.pop('max_requests_per_host')
        assert(max_n_requests > min_requests)
        table = self.connection.table(self.table_name)

        meta_map = {}
        queue = {}
        limit = min_requests
        tries = 0
        count = 0
        prefix = '%d_' % partition_id
        now_ts = int(time())
        filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts)
        while tries < self.GET_RETRIES:
            tries += 1
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                              tries, limit, count, len(queue.keys()))
            meta_map.clear()
            queue.clear()
            count = 0
            for rk, data in table.scan(limit=int(limit), batch_size=256, filter=filter):
                for cq, buf in six.iteritems(data):
                    if cq == b'f:t':
                        continue
                    stream = BytesIO(buf)
                    unpacker = Unpacker(stream)
                    for item in unpacker:
                        fprint, host_crc32, _, _ = item
                        if host_crc32 not in queue:
                            queue[host_crc32] = []
                        if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
                            continue
                        queue[host_crc32].append(fprint)
                        count += 1

                        if fprint not in meta_map:
                            meta_map[fprint] = []
                        meta_map[fprint].append((rk, item))
                if count > max_n_requests:
                    break

            if min_hosts is not None and len(queue.keys()) < min_hosts:
                continue

            if count < min_requests:
                continue
            break

        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count)

        # For every fingerprint, collect its row keys, then return all fingerprints from those rows
        fprint_map = {}
        for fprint, meta_list in six.iteritems(meta_map):
            for rk, _ in meta_list:
                fprint_map.setdefault(rk, []).append(fprint)

        results = []
        trash_can = set()

        for _, fprints in six.iteritems(queue):
            for fprint in fprints:
                for rk, _ in meta_map[fprint]:
                    if rk in trash_can:
                        continue
                    for rk_fprint in fprint_map[rk]:
                        _, item = meta_map[rk_fprint][0]
                        _, _, encoded, score = item
                        request = self.decoder.decode_request(encoded)
                        request.meta[b'score'] = score
                        results.append(request)
                    trash_can.add(rk)

        with table.batch(transaction=True) as b:
            for rk in trash_can:
                b.delete(rk)
        self.logger.debug("%d row keys removed", len(trash_can))
        return results

    def count(self):
        raise NotImplementedError
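
A minimal usage sketch for get_next_requests() above, reusing the queue object from the constructor sketch after Example #1; all numbers are illustrative. Note that rows returned in a batch are deleted from the table as a side effect:

requests = queue.get_next_requests(
    max_n_requests=256,          # must be greater than min_requests (asserted)
    partition_id=0,
    min_requests=64,             # retry with a deeper scan until this many requests
    min_hosts=8,                 # ... and this many distinct hosts are collected
    max_requests_per_host=32)    # per-host cap within the batch
for request in requests:
    print(request.url, request.meta[b'score'])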
Example #11
File: db.py Project: lopuhin/frontera
class DBWorker(object):
    def __init__(self, settings, no_batches, no_incoming):
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)

    def set_process_info(self, process_info):
        self.process_info = process_info

    def run(self):
        def debug(sig, frame):
            logger.critical("Signal received: printing stack trace")
            logger.critical(str("").join(format_stack(frame)))

        self.slot.schedule(on_start=True)
        self._logging_task.start(30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def stop(self):
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def log_status(self):
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def disable_new_batches(self):
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False

    def consume_incoming(self, *args, **kwargs):
        consumed = 0
        for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                if type == 'add_seeds':
                    _, seeds = msg
                    logger.info('Adding %i seeds', len(seeds))
                    for seed in seeds:
                        logger.debug('URL: %s', seed.url)
                    self._backend.add_seeds(seeds)
                if type == 'page_crawled':
                    _, response, links = msg
                    logger.debug("Page crawled %s", response.url)
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self._backend.page_crawled(response, links)
                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    logger.debug("Request error %s", request.url)
                    self._backend.request_error(request, error)
                if type == 'offset':
                    _, partition_id, offset = msg
                    try:
                        producer_offset = self.spider_feed_producer.get_offset(partition_id)
                    except KeyError:
                        continue
                    else:
                        lag = producer_offset - offset
                        if lag < 0:
                            # nonsense in general; happens when the SW is restarted and not yet synced with the spiders
                            continue
                        if lag < self.max_next_requests or offset == 0:
                            self.spider_feed.mark_ready(partition_id)
                        else:
                            self.spider_feed.mark_busy(partition_id)
            finally:
                consumed += 1
        """
        # TODO: Think how it should be implemented in DB-worker only mode.
        if not self.strategy_enabled and self._backend.finished():
            logger.info("Crawling is finished.")
            reactor.stop()
        """
        self.stats['consumed_since_start'] += consumed
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.slot.schedule()
        return consumed

    def consume_scoring(self, *args, **kwargs):
        consumed = 0
        seen = set()
        batch = []
        for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                if msg[0] == 'update_score':
                    _, fprint, score, url, schedule = msg
                    if fprint not in seen:
                        batch.append((fprint, score, Request(url), schedule))
                    seen.add(fprint)
                if msg[0] == 'new_job_id':
                    self.job_id = msg[1]
            finally:
                consumed += 1
        self.queue.schedule(batch)

        self.stats['consumed_scoring_since_start'] += consumed
        self.stats['last_consumed_scoring'] = consumed
        self.stats['last_consumption_run_scoring'] = asctime()
        self.slot.schedule()

    def new_batch(self, *args, **kwargs):
        def get_hostname(request):
            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'],
                                                                               request.url))
                return None
            else:
                return name.encode('utf-8', 'ignore')

        def get_fingerprint(request):
            return request.meta['fingerprint']

        partitions = self.spider_feed.available_partitions()
        logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions)))
        if not partitions:
            return 0

        count = 0
        if self.spider_feed_partitioning == 'hostname':
            get_key = get_hostname
        elif self.spider_feed_partitioning == 'fingerprint':
            get_key = get_fingerprint
        else:
            raise Exception("Unexpected value in self.spider_feed_partitioning")

        for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e,
                                                                               request.meta['fingerprint'],
                                                                               request.url))
                continue
            finally:
                count += 1
            self.spider_feed_producer.send(get_key(request), eo)

        self.stats['pushed_since_start'] += count
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count
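
For orientation, a hedged sketch (inferred from the dispatch branches in consume_incoming() and consume_scoring() above) of the decoded message tuples the worker expects; the model constructors and all field values are assumptions for illustration:

from frontera.core.models import Request, Response

seed = Request('http://example.com/')
resp = Response('http://example.com/page', request=Request('http://example.com/page'))

spider_log_messages = [
    ('add_seeds', [seed]),                 # passed to backend.add_seeds()
    ('page_crawled', resp, [seed]),        # response plus the links extracted from it
    ('request_error', seed, 'DNS_ERROR'),  # failed request and an error string
    ('offset', 0, 1024),                   # partition id and spider consumer offset
]
scoring_log_messages = [
    ('update_score', b'abcdef0123456789', 0.5, 'http://example.com/', True),  # fprint, score, url, schedule flag
    ('new_job_id', 1),                     # switches self.job_id
]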