import datetime

from happybase import Connection


def put_data_into_hbase(rdd):
    """
    Store the results of an RDD of (label, count) pairs in an HBase table,
    keyed by the current timestamp.
    """
    # collecting the results on the driver
    results = rdd.collect()
    # computing the current time (to the second): this will serve as the row key
    date = str(datetime.datetime.now())[:19]
    # making a connection to the local HBase Thrift server
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    table = connection.table(name='base_tweets')

    # one column per label: 0 is the negative-tweet count, anything else positive
    for data in results:
        if data[0] == 0:
            table.put(row=date, data={'tweet_count:neg': str(data[1])})
        else:
            table.put(row=date, data={'tweet_count:pos': str(data[1])})

    connection.close()
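
For context, a minimal sketch of how such a helper is typically wired into Spark Streaming. The pipeline below and its input source are illustrative assumptions, not part of the original snippet:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Hypothetical pipeline: build a DStream of (label, count) pairs and hand each
# micro-batch to put_data_into_hbase via foreachRDD.
sc = SparkContext(appName='tweet-counts')
ssc = StreamingContext(sc, batchDuration=10)
labels = ssc.socketTextStream('localhost', 9999)  # assumed input source of "0"/"1" lines
counts = labels.map(lambda x: (int(x), 1)).reduceByKey(lambda a, b: a + b)
counts.foreachRDD(put_data_into_hbase)
ssc.start()
ssc.awaitTermination()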
Example #2
File: hbase.py  Project: CN-hanyi/frontera
import logging
from random import choice

from frontera.core.components import DistributedBackend
from happybase import Connection

# HBaseState, HBaseQueue and HBaseMetadata are defined elsewhere in hbase.py.


class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_STATES_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), settings.get('HBASE_DROP_ALL_TABLES'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables,
                              use_snappy=settings.get('HBASE_USE_SNAPPY'))
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(max_next_requests, partition_id,
                                                   min_requests=self._min_requests,
                                                   min_hosts=self._min_hosts,
                                                   max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d", len(results), partition_id)
        return next_pages
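
The backend reads all of its configuration from the manager's settings. A sketch of the keys it consumes, with purely illustrative values (assumptions for illustration, not defaults taken from the project):

# Illustrative values only; the key names come from the class above.
HBASE_SETTINGS = {
    'HBASE_THRIFT_HOST': ['hbase-1', 'hbase-2'],  # list/tuple: one host is chosen at random
    'HBASE_THRIFT_PORT': 9090,
    'HBASE_NAMESPACE': 'crawler',
    'HBASE_USE_FRAMED_COMPACT': False,
    'HBASE_DROP_ALL_TABLES': False,
    'HBASE_USE_SNAPPY': False,
    'HBASE_BATCH_SIZE': 9216,
    'HBASE_QUEUE_TABLE': 'queue',
    'HBASE_METADATA_TABLE': 'metadata',
    'HBASE_STATES_TABLE': 'states',
    'HBASE_STATE_CACHE_SIZE_LIMIT': 3000000,
    'SPIDER_FEED_PARTITIONS': 2,
    'BC_MIN_REQUESTS': 64,
    'BC_MIN_HOSTS': 24,
    'BC_MAX_REQUESTS_PER_HOST': 128,
    'STORE_CONTENT': False,
}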
Example #3
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':'
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables)
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(max_next_requests, partition_id,
                                                   min_requests=self._min_requests,
                                                   min_hosts=self._min_hosts,
                                                   max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d", len(results), partition_id)
        return next_pages
Example #4
class HBaseBackend(Backend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts

        self.connection = Connection(host=host, port=int(port), table_prefix=namespace, table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection, self.queue_partitions, self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)

        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            self.connection.create_table(self._table_name, {'m': {'max_versions': 5}, # 'compression': 'SNAPPY'
                                                            's': {'max_versions': 1, 'block_cache_enabled': 1,
                                                            'bloom_filter_type': 'ROW', 'in_memory': True, },
                                                            'c': {'max_versions': 1}
                                                            })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # flush pending mutations before closing the connection
        self.flush()
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(url=url,
                                       depth=0,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=domain['fingerprint'])
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = prepare_hbase_object(status_code=response.status_code, content=response.body)

        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)

        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(url=link_url,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=link_domain['fingerprint'])
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(url=request.url,
                                   created_at=utcnow_timestamp(),
                                   error=error,
                                   domain_fingerprint=domain['fingerprint'])
        rk = unhexlify(request.meta['fingerprint'])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests,
                                     min_hosts=24, max_requests_per_host=128)

            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta['fingerprint'] = fingerprint
                r.meta['score'] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError('batch should be dict with fingerprint as key, and float score as value')

        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
Example #5
class HBaseBackend(Backend):
    component_name = "HBase Backend"

    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get("HBASE_THRIFT_PORT")
        hosts = settings.get("HBASE_THRIFT_HOST")
        namespace = settings.get("HBASE_NAMESPACE")
        drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
        self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
        self._table_name = settings.get("HBASE_METADATA_TABLE")
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"}
        if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
            kwargs.update({"protocol": "compact", "transport": "framed"})
        self.connection = Connection(**kwargs)
        self.queue = HBaseQueue(
            self.connection,
            self.queue_partitions,
            self.manager.logger.backend,
            settings.get("HBASE_QUEUE_TABLE"),
            drop=drop_all_tables,
        )
        self.state_checker = HBaseState(
            self.connection, self._table_name, self.manager.logger.backend, settings.get("HBASE_STATE_CACHE_SIZE_LIMIT")
        )
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {
                "m": {"max_versions": 1},
                "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True},
                "c": {"max_versions": 1},
            }
            if settings.get("HBASE_USE_SNAPPY"):
                schema["m"]["compression"] = "SNAPPY"
                schema["c"]["compression"] = "SNAPPY"
            self.connection.create_table(self._table_name, schema)
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
        self.store_content = settings.get("HBASE_STORE_CONTENT")

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # flush pending mutations before closing the connection
        self.flush()
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(
                url=url, depth=0, created_at=utcnow_timestamp(), domain_fingerprint=domain["fingerprint"]
            )
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = (
            prepare_hbase_object(status_code=response.status_code, content=response.body)
            if self.store_content
            else prepare_hbase_object(status_code=response.status_code)
        )
        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(
                url=link_url, created_at=utcnow_timestamp(), domain_fingerprint=link_domain["fingerprint"]
            )
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(
            url=request.url, created_at=utcnow_timestamp(), error=error, domain_fingerprint=domain["fingerprint"]
        )
        rk = unhexlify(request.meta["fingerprint"])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop("partitions", []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests, min_hosts=24, max_requests_per_host=128)
            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta["fingerprint"] = fingerprint
                r.meta["score"] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError("batch should be dict with fingerprint as key, and float score as value")
        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {"name": hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
Example #6
class HBaseBackend(Backend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts

        self.connection = Connection(host=host,
                                     port=int(port),
                                     table_prefix=namespace,
                                     table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection,
                                self.queue_partitions,
                                self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)

        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            self.connection.create_table(
                self._table_name,
                {
                    'm': {
                        'max_versions': 5
                    },  # 'compression': 'SNAPPY'
                    's': {
                        'max_versions': 1,
                        'block_cache_enabled': 1,
                        'bloom_filter_type': 'ROW',
                        'in_memory': True,
                    },
                    'c': {
                        'max_versions': 1
                    }
                })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # flush pending mutations before closing the connection
        self.flush()
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(
                seed)
            obj = prepare_hbase_object(
                url=url,
                depth=0,
                created_at=utcnow_timestamp(),
                domain_fingerprint=domain['fingerprint'])
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(
            response)
        obj = prepare_hbase_object(status_code=response.status_code,
                                   content=response.body)

        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(
                link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url,
                                                       link_domain)

        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url,
                               link_domain) in links_dict.items():
            obj = prepare_hbase_object(
                url=link_url,
                created_at=utcnow_timestamp(),
                domain_fingerprint=link_domain['fingerprint'])
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(
            request)
        obj = prepare_hbase_object(url=request.url,
                                   created_at=utcnow_timestamp(),
                                   error=error,
                                   domain_fingerprint=domain['fingerprint'])
        rk = unhexlify(request.meta['fingerprint'])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id,
                                     max_next_requests,
                                     min_hosts=24,
                                     max_requests_per_host=128)

            log.debug("Got %d items for partition id %d" %
                      (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta['fingerprint'] = fingerprint
                r.meta['score'] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError(
                'batch should be dict with fingerprint as key, and float score as value'
            )

        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error(
                        "Can't get hostname for URL %s, fingerprint %s" %
                        (url, fprint))
                    continue
                to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
Example #7
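    # NOTE (added for context): this fragment assumes a happybase connection and
    # table created earlier in the enclosing function, e.g. (names hypothetical):
    #   hbase_connection = Connection(host='localhost', port=9090)
    #   table = hbase_connection.table('animals')
    # It also assumes `import pprint` at module level.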
    # flipper
    data_for_flipper = {
        'id:name': 'flipper',
        'features:race': 'dolphin',
        'features:gender': 'male',
        'features:apnea': '10'
    }

    # lassie
    data_for_lassie = {
        'id:chip_number': '314',
        'id:name': 'lassie',
        'features:race': 'colley',
        'features:gender': 'female'
    }

    # gary
    data_for_gary = {'id:name': 'gary', 'features:race': 'snail'}

    # putting data into the table
    table.put(row='1', data=data_for_lassie)
    table.put(row='2', data=data_for_flipper)
    table.put(row='3', data=data_for_gary)

    # printing out the content of the table
    for data in table.scan():
        pprint.pprint(data)
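
    # Illustrative read-back (added; not in the original fragment): fetch one
    # row by key -- happybase returns a dict keyed by b'family:qualifier'.
    pprint.pprint(table.row('2'))  # flipper's cells

    # scan only the 'features' column family across all rows
    for key, data in table.scan(columns=['features']):
        pprint.pprint((key, data))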

    # closing hbase connection
    hbase_connection.close()
Example #8
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({'protocol': 'compact', 'transport': 'framed'})
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None
        self._domain_metadata = None

    def _init_states(self, settings):
        self._states = HBaseState(
            connection=self.connection,
            table_name=settings.get('HBASE_STATES_TABLE'),
            cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'),
            write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'),
            drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES'))

    def _init_queue(self, settings):
        self._queue = HBaseQueue(self.connection,
                                 self.queue_partitions,
                                 settings.get('HBASE_QUEUE_TABLE'),
                                 drop=settings.get('HBASE_DROP_ALL_TABLES'),
                                 use_snappy=settings.get('HBASE_USE_SNAPPY'))

    def _init_metadata(self, settings):
        self._metadata = HBaseMetadata(self.connection,
                                       settings.get('HBASE_METADATA_TABLE'),
                                       settings.get('HBASE_DROP_ALL_TABLES'),
                                       settings.get('HBASE_USE_SNAPPY'),
                                       settings.get('HBASE_BATCH_SIZE'),
                                       settings.get('STORE_CONTENT'))

    def _init_domain_metadata(self, settings):
        self._domain_metadata = DomainCache(
            settings.get('HBASE_DOMAIN_METADATA_CACHE_SIZE'),
            self.connection,
            settings.get('HBASE_DOMAIN_METADATA_TABLE'),
            batch_size=settings.get('HBASE_DOMAIN_METADATA_BATCH_SIZE'))

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        o._init_states(manager.settings)
        o._init_domain_metadata(manager.settings)
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        o._init_queue(manager.settings)
        o._init_metadata(manager.settings)
        return o

    @classmethod
    def local(cls, manager):
        o = cls(manager)
        o._init_queue(manager.settings)
        o._init_states(manager.settings)
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    @property
    def domain_metadata(self):
        return self._domain_metadata

    def frontier_start(self):
        for component in [
                self.metadata, self.queue, self.states, self.domain_metadata
        ]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [
                self.metadata, self.queue, self.states, self.domain_metadata
        ]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        self.logger.debug("Querying queue table.")
        results = []
        for partition_id in set(
                kwargs.pop('partitions', range(self.queue_partitions))):
            requests = self.queue.get_next_requests(
                max_next_requests,
                partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            results.extend(requests)
            self.logger.debug("Got %d requests for partition id %d",
                              len(requests), partition_id)
        return results

    def get_stats(self):
        """Helper to get a stats dictionary for the backend.

        Currently provides HBase client stats, plus states stats when available.
        """
        stats = {}
        with time_elapsed('Call HBase backend get_stats()'):
            stats.update(self.connection.client.get_stats())
        if self._states:
            stats.update(self._states.get_stats())
        return stats
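
To make the role split concrete, a hypothetical instantiation sketch; the `manager` object and its settings are supplied by the Frontera framework and are assumed here:

# Hypothetical usage; in practice Frontera's workers construct these for you.
db_backend = HBaseBackend.db_worker(manager)        # initializes queue + metadata
sw_backend = HBaseBackend.strategy_worker(manager)  # initializes states + domain metadata
local_backend = HBaseBackend.local(manager)         # initializes queue + states
assert db_backend.states is None                    # components outside the role stay None
assert sw_backend.queue is None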