def __init__(self, manager):
    self.manager = manager
    self.logger = logging.getLogger("hbase.backend")
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self._min_requests = settings.get('BC_MIN_REQUESTS')
    self._min_hosts = settings.get('BC_MIN_HOSTS')
    self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
    # HBASE_THRIFT_HOST may be a single host or a list; pick one at random.
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
    kwargs = {
        'host': host,
        'port': int(port),
        'table_prefix': namespace,
        'table_prefix_separator': ':',
        'timeout': 60000,
    }
    if settings.get('HBASE_USE_FRAMED_COMPACT'):
        kwargs.update({
            'protocol': 'compact',
            'transport': 'framed',
        })
    self.logger.info("Connecting to %s:%d thrift server.", host, port)
    self.connection = Connection(**kwargs)
    self._metadata = None
    self._queue = None
    self._states = None

def test_drop_all_tables_when_table_name_is_str(self):
    connection = Connection(host='hbase-docker', port=9090)
    for table in connection.tables():
        connection.delete_table(table, True)
    hbase_queue_table = 'queue'
    hbase_metadata_table = 'metadata'
    connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}})
    connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}})
    tables = connection.tables()
    assert set(tables) == set([b'metadata', b'queue'])  # Failure of test itself
    try:
        HBaseQueue(connection=connection, partitions=1,
                   table_name=hbase_queue_table, drop=True)
        HBaseMetadata(connection=connection, table_name=hbase_metadata_table,
                      drop_all_tables=True, use_snappy=False,
                      batch_size=300000, store_content=True)
    except AlreadyExists:
        assert False, "failed to drop hbase tables"

def __init__(self, name):
    from happybase import Connection
    from thrift.transport import TTransport
    try:
        self._conn = Connection('localhost')
        self._table = self._conn.table(name)
    except TTransport.TTransportException as e:  # fixed Python 2-only "except ..., e" syntax
        raise UserWarning(e)

def test_metadata(self):
    connection = Connection(host='hbase-docker', port=9090)
    metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
    metadata.add_seeds([r1, r2, r3])
    resp = Response('https://www.example.com', request=r1)
    metadata.page_crawled(resp)
    metadata.links_extracted(resp.request, [r2, r3])
    metadata.request_error(r4, 'error')
    metadata.frontier_stop()
    table = connection.table('metadata')
    assert set([to_native_str(data[b'm:url'], 'utf-8')
                for _, data in table.scan()]) == set([r1.url, r2.url, r3.url])
    self.delete_rows(table, [b'10', b'11', b'12'])

def setup_module():
    global connection, table
    connection = Connection(**connection_kwargs)
    assert_is_not_none(connection)
    cfs = {
        'cf1': {},
        'cf2': None,
        'cf3': {'max_versions': 1},
    }
    connection.create_table(TEST_TABLE_NAME, families=cfs)
    table = connection.table(TEST_TABLE_NAME)
    assert_is_not_none(table)

def test_prefix():
    assert_equal(TABLE_PREFIX + '_', connection._table_name(''))
    assert_equal(TABLE_PREFIX + '_foo', connection._table_name('foo'))
    assert_equal(connection.table('foobar').name, TABLE_PREFIX + '_foobar')
    assert_equal(connection.table('foobar', use_prefix=False).name, 'foobar')
    c = Connection(autoconnect=False)
    assert_equal('foo', c._table_name('foo'))
    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix=123)
    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix_separator=2.1)

def setUp(self):
    logging.basicConfig(level=logging.DEBUG)
    self.conn = Connection(host="hbase-docker")
    if b'domain_metadata' not in self.conn.tables():
        self.conn.create_table('domain_metadata', {
            'm': {'max_versions': 1, 'block_cache_enabled': 1},
        })
    t = self.conn.table('domain_metadata')
    t.delete('d1')
    t.delete('d2')
    t.delete('d3')
    t.delete('d4')

def test_prefix():
    assert TABLE_PREFIX + b'_' == connection._table_name('')
    assert TABLE_PREFIX + b'_foo' == connection._table_name('foo')
    assert connection.table('foobar').name == TABLE_PREFIX + b'_foobar'
    assert connection.table('foobar', use_prefix=False).name == b'foobar'
    c = Connection(autoconnect=False)
    assert b'foo' == c._table_name('foo')
    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix=123)
    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix_separator=2.1)

def __init__(self, manager):
    self.manager = manager
    self.logger = logging.getLogger("hbase.backend")
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self._min_requests = settings.get('BC_MIN_REQUESTS')
    self._min_hosts = settings.get('BC_MIN_HOSTS')
    self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
    kwargs = {
        'host': host,
        'port': int(port),
        'table_prefix': namespace,
        'table_prefix_separator': ':',
    }
    if settings.get('HBASE_USE_FRAMED_COMPACT'):
        kwargs.update({'protocol': 'compact', 'transport': 'framed'})
    self.connection = Connection(**kwargs)
    self._metadata = None
    self._queue = None
    self._states = None

def test_queue_with_delay(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True)
    r5 = r3.copy()
    crawl_at = int(time()) + 1000
    r5.meta[b'crawl_at'] = crawl_at
    batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)]
    queue.schedule(batch)
    with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
        mocked_time.return_value = time()
        assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                       max_requests_per_host=10) == []
        mocked_time.return_value = crawl_at + 1
        assert set([r.url for r in queue.get_next_requests(
            10, 0, min_requests=3, min_hosts=1,
            max_requests_per_host=10)]) == set([r5.url])

def test_state(self):
    connection = Connection(host='hbase-docker', port=9090)
    state = HBaseState(connection, b'states', cache_size_limit=300000,
                       write_log_size=5000, drop_all_tables=True)
    state.set_states([r1, r2, r3])
    assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED] * 3
    state.update_cache([r1, r2, r3])
    assert dict(state._state_cache) == {
        b'10': States.NOT_CRAWLED,
        b'11': States.NOT_CRAWLED,
        b'12': States.NOT_CRAWLED,
    }
    assert state._state_batch._mutation_count == 3
    r1.meta[b'state'] = States.CRAWLED
    r2.meta[b'state'] = States.CRAWLED
    r3.meta[b'state'] = States.CRAWLED
    state.update_cache([r1, r2, r3])
    assert state._state_batch._mutation_count == 6
    state.flush()
    assert state._state_batch._mutation_count == 0
    state.fetch([b'10', b'11', b'12'])
    assert dict(state._state_cache) == {
        b'10': States.CRAWLED,
        b'11': States.CRAWLED,
        b'12': States.CRAWLED,
    }
    r4.meta[b'state'] = States.ERROR
    state.set_states([r1, r2, r4])
    assert r4.meta[b'state'] == States.CRAWLED
    state.flush()
    assert state._state_batch._mutation_count == 0

def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT', 9090)
    hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
    namespace = settings.get('HBASE_NAMESPACE', 'crawler')
    drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
    self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
    self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
    self.connection = Connection(host=host, port=int(port),
                                 table_prefix=namespace,
                                 table_prefix_separator=':')
    # protocol='compact', transport='framed'
    self.queue = HBaseQueue(self.connection, self.queue_partitions,
                            self.manager.logger.backend, drop=drop_all_tables)
    self.state_checker = HBaseState(self.connection, self._table_name)
    tables = set(self.connection.tables())
    if drop_all_tables and self._table_name in tables:
        self.connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)
    if self._table_name not in tables:
        self.connection.create_table(self._table_name, {
            'm': {'max_versions': 5},  # 'compression': 'SNAPPY'
            's': {
                'max_versions': 1,
                'block_cache_enabled': 1,
                'bloom_filter_type': 'ROW',
                'in_memory': True,
            },
            'c': {'max_versions': 1},
        })
    table = self.connection.table(self._table_name)
    self.batch = table.batch(batch_size=9216)

def test_state(self):
    connection = Connection(host='hbase-docker', port=9090)
    state = HBaseState(connection, b'metadata', 300000)
    state.set_states([r1, r2, r3])
    assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED] * 3
    state.update_cache([r1, r2, r3])
    assert state._state_cache == {
        b'10': States.NOT_CRAWLED,
        b'11': States.NOT_CRAWLED,
        b'12': States.NOT_CRAWLED,
    }
    r1.meta[b'state'] = States.CRAWLED
    r2.meta[b'state'] = States.CRAWLED
    r3.meta[b'state'] = States.CRAWLED
    state.update_cache([r1, r2, r3])
    state.flush(True)
    assert state._state_cache == {}
    state.fetch([b'10', b'11', b'12'])
    assert state._state_cache == {
        b'10': States.CRAWLED,
        b'11': States.CRAWLED,
        b'12': States.CRAWLED,
    }
    r4.meta[b'state'] = States.ERROR
    state.set_states([r1, r2, r4])
    assert r4.meta[b'state'] == States.CRAWLED
    state.flush(True)
    assert state._state_cache == {}

def setup_module():
    global connection, table
    connection = Connection(**connection_kwargs)
    assert connection is not None
    maybe_delete_table()
    cfs = {
        'cf1': {},
        'cf2': None,
        'cf3': {'max_versions': 1},
    }
    connection.create_table(TEST_TABLE_NAME, families=cfs)
    table = connection.table(TEST_TABLE_NAME)
    assert table is not None

def kpi2(conn: happybase.Connection):
    """Correlation of rent price and family income per neighborhood."""
    table = conn.table('housing')
    table2 = conn.table('opendatabcn')
    table3 = conn.table('idealista-to-open')
    for year in range(2014, 2017, 1):
        rfdByZone = dict()     # key = district-neighborhood, value = RFD
        pricesByZone = dict()  # key = district-neighborhood, value = [price]
        for _k, v in table.scan():
            # Union by hand
            district = v[b'cf1:district'].decode('utf-8')
            neighborhood = v[b'cf1:neighborhood'].decode('utf-8')
            k = getKey(district, neighborhood)
            row = table3.row(k, columns=['cf1:district', 'cf1:neighborhood'])
            k = getKeyOpen(row[b'cf1:district'].decode('utf-8'),
                           row[b'cf1:neighborhood'].decode('utf-8'),
                           year=year)
            row = table2.row(k, columns=['cf1:rfd'])
            rfd = float(row[b'cf1:rfd'].decode('utf-8'))
            # Update data
            k = getKey(district.replace('-', ' '), neighborhood.replace('-', ' '))
            rfdByZone[k] = rfd
            price = float(v[b'cf1:price'].decode('utf-8'))
            if k in pricesByZone:
                pricesByZone[k].append(price)
            else:
                pricesByZone[k] = [price]
        print('')
        print(f'Year {year}:')
        for k, rfd in rfdByZone.items():
            (district, neighborhood) = k.split('-')
            price = mean(pricesByZone[k])
            # Not the actual correlation formula but to simplify things
            correlation = price / rfd
            print(f'\t{neighborhood} has a correlation price/rfd = {correlation}')

def test_queue(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 2, b'queue', True)
    batch = [('10', 0.5, r1, True),
             ('11', 0.6, r2, True),
             ('12', 0.7, r3, True)]
    queue.schedule(batch)
    assert set([r.url for r in queue.get_next_requests(
        10, 0, min_requests=3, min_hosts=1,
        max_requests_per_host=10)]) == set([r3.url])
    assert set([r.url for r in queue.get_next_requests(
        10, 1, min_requests=3, min_hosts=1,
        max_requests_per_host=10)]) == set([r1.url, r2.url])

def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None,
             default_timeout=300, **kwargs):
    super(HBaseCache, self).__init__(default_timeout)
    if not table_name:
        raise TypeError('table_name is a required argument')
    self.table_name = table_name
    # Bug fix: the original passed the undefined name `table_prefix`;
    # the constructor argument is `prefix`.
    self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
    self._table = self._c.table(table_name)
    self.clear()

def put_data_into_hbase(rdd):
    """Store sentiment counts from an RDD into an HBase table."""
    # Collect the results on the driver.
    results = rdd.collect()
    # The current time, to the second, serves as the row key.
    date = str(datetime.datetime.now())[:19]
    # Connect to the local Thrift server.
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    table = connection.table(name='base_tweets')
    for data in results:
        if data[0] == 0:
            table.put(row=date, data={'tweet_count:neg': str(data[1])})
        else:
            table.put(row=date, data={'tweet_count:pos': str(data[1])})
    connection.close()

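# Hedged usage sketch: exercise put_data_into_hbase with a static RDD of
# (label, count) pairs, where label 0 means negative and 1 means positive.
# The SparkContext setup and the sample counts are assumptions made for
# illustration; in the original job the RDD would come from a DStream.
from pyspark import SparkContext

sc = SparkContext(appName='tweet-sentiment-demo')
counts = sc.parallelize([(0, 12), (1, 30)])  # (sentiment_label, tweet_count)
put_data_into_hbase(counts)  # writes one row keyed by the current timestamp
sc.stop()
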
def __init__(self, host="127.0.0.1", port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs): # Potential bug: table_prefix instead of prefix BaseCache.__init__(self, default_timeout) if not table_name: raise TypeError("table_name is a required argument") self.table_name = table_name self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs) self._table = self._c.table(table_name) # Note: initialisation overwrites the existing rows of the Hbase table self.clear()
def __init__(self, manager):
    self.manager = manager
    self.logger = manager.logger.backend
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
    kwargs = {
        'host': host,
        'port': int(port),
        'table_prefix': namespace,
        'table_prefix_separator': ':',
    }
    if settings.get('HBASE_USE_COMPACT_PROTOCOL'):
        kwargs.update({'protocol': 'compact', 'transport': 'framed'})
    self.connection = Connection(**kwargs)
    self._metadata = None
    self._queue = None
    self._states = None

def get_client_addons(self, client_id):
    """Retrieve the list of addons for the given client.

    Only the last known version of the list of addons is retrieved.
    """
    with contextlib.closing(Connection(self._hostname)) as connection:
        table = connection.table(self.tablename)
        # Row keys are "<client_id>:<date>", so a reverse scan starting at
        # the highest possible date returns the most recent row first.
        row_start = "{}:{}".format(client_id, "99999999")
        for key, data in table.scan(row_start=row_start, limit=1,
                                    columns=[self.column_family], reverse=True):
            return json.loads(data[self.column].decode("utf-8"))
    return None

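# Hedged sketch of the writer side implied by get_client_addons: rows are
# assumed to be keyed "<client_id>:<YYYYMMDD>", so the reverse scan above
# starting at "<client_id>:99999999" finds the most recent version first.
# The hostname, table name, and column ('cf:payload') are illustrative
# assumptions, not part of the original snippet.
import json
from contextlib import closing
from happybase import Connection

with closing(Connection('localhost')) as connection:
    table = connection.table('client_addons')
    row_key = '{}:{}'.format('client-42', '20240115')
    addons = [{'id': 'addon-1', 'version': '1.0'}]
    table.put(row_key, {'cf:payload': json.dumps(addons).encode('utf-8')})
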
def kpi1(conn: happybase.Connection):
    """Average number of new listings per day."""
    table = conn.table('housing')

    def getDate(x):
        k, data = x
        return datetime.strptime(data[b'cf2:date'].decode('utf-8'), '%Y-%m-%d')

    dates = list(map(getDate, table.scan(columns=[b'cf2:date'])))
    nDays = (max(dates) - min(dates)).days
    nListings = len(dates)
    print('Average number of new listings per day {}'.format(nListings / nDays))

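# Hedged usage sketch for kpi1: open a Thrift connection to the cluster that
# holds the 'housing' table and run the KPI; the hostname is an assumption.
import happybase

conn = happybase.Connection('localhost')
kpi1(conn)
conn.close()
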
def test_queue_with_post_request(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', drop=True, use_snappy=False)
    batch = [('10', 0.5, r1, True)]
    queue.schedule(batch)
    requests = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                       max_requests_per_host=10)
    self.assertEqual(b'POST', requests[0].method)
    self.assertEqual(data, requests[0].body)

class HBaseStorage(MachineBaseStorage):
    _VAL = b'values:value'
    _EXP = b'values:expires_at'
    _COLS = [_VAL, _EXP]

    def __init__(self, settings):
        super().__init__(settings)
        hbase_host = settings['HBASE_HOST']
        hbase_table = settings['HBASE_TABLE']
        self._connection = Connection(hbase_host)
        self._table = self._connection.table(hbase_table)

    def _get_value(self, key):
        row = self._table.row(key, self._COLS)
        val = row.get(self._VAL)
        if val:
            exp = row.get(self._EXP)
            if not exp:
                return val
            elif datetime.fromtimestamp(bytes_to_float(exp)) > datetime.utcnow():
                return val
            else:
                # The value has expired; remove it lazily on read.
                self.delete(key)
                return None
        return None

    def has(self, key):
        val = self._get_value(key)
        return bool(val)

    def get(self, key):
        return self._get_value(key)

    def set(self, key, value, expires=None):
        data = {self._VAL: value}
        if expires:
            expires_at = datetime.utcnow() + timedelta(seconds=expires)
            data[self._EXP] = float_to_bytes(expires_at.timestamp())
        self._table.put(key, data)

    def delete(self, key):
        self._table.delete(key)

    def size(self):
        return 0

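# Minimal usage sketch for HBaseStorage, assuming a Thrift server on
# localhost, a pre-created HBase table with a 'values' column family, and
# that MachineBaseStorage accepts the settings dict as-is; the table name
# and keys below are illustrative.
storage = HBaseStorage({'HBASE_HOST': 'localhost', 'HBASE_TABLE': 'machine_state'})
storage.set(b'session:1', b'payload', expires=3600)  # readable for one hour
assert storage.has(b'session:1')
storage.delete(b'session:1')
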
def test_queue_with_delay(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', True)
    r5 = r3.copy()
    r5.meta[b'crawl_at'] = int(time()) + 1
    batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)]
    queue.schedule(batch)
    assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                   max_requests_per_host=10) == []
    sleep(1.5)
    assert set([r.url for r in queue.get_next_requests(
        10, 0, min_requests=3, min_hosts=1,
        max_requests_per_host=10)]) == set([r5.url])

def get_client_profile(self, client_id):
    """Retrieve the latest row for the given client in HBase.

    Only the last known version of the info is retrieved.
    """
    try:
        with contextlib.closing(Connection(self.hbase_hostname)) as connection:
            table = connection.table(self.tablename)
            client_row = table.row(client_id, columns=[self.column_family])
            if client_row:
                return json.loads(client_row[self.column].decode("utf-8"))
    except Exception:
        logger.exception("Connection to HBase failed",
                         extra={"client_id": client_id})
    logger.info("Client information not found", extra={"client_id": client_id})
    return None

def __init__(self, manager):
    self.manager = manager
    settings = manager.settings
    port = settings.get("HBASE_THRIFT_PORT")
    hosts = settings.get("HBASE_THRIFT_HOST")
    namespace = settings.get("HBASE_NAMESPACE")
    drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
    self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
    self._table_name = settings.get("HBASE_METADATA_TABLE")
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
    kwargs = {"host": host, "port": int(port),
              "table_prefix": namespace, "table_prefix_separator": ":"}
    if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
        kwargs.update({"protocol": "compact", "transport": "framed"})
    self.connection = Connection(**kwargs)
    self.queue = HBaseQueue(
        self.connection,
        self.queue_partitions,
        self.manager.logger.backend,
        settings.get("HBASE_QUEUE_TABLE"),
        drop=drop_all_tables,
    )
    self.state_checker = HBaseState(
        self.connection,
        self._table_name,
        self.manager.logger.backend,
        settings.get("HBASE_STATE_CACHE_SIZE_LIMIT"),
    )
    tables = set(self.connection.tables())
    if drop_all_tables and self._table_name in tables:
        self.connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)
    if self._table_name not in tables:
        schema = {
            "m": {"max_versions": 1},
            "s": {"max_versions": 1, "block_cache_enabled": 1,
                  "bloom_filter_type": "ROW", "in_memory": True},
            "c": {"max_versions": 1},
        }
        if settings.get("HBASE_USE_SNAPPY"):
            schema["m"]["compression"] = "SNAPPY"
            schema["c"]["compression"] = "SNAPPY"
        self.connection.create_table(self._table_name, schema)
    table = self.connection.table(self._table_name)
    self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
    self.store_content = settings.get("HBASE_STORE_CONTENT")

class TestDomainCache(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.conn = Connection(host="hbase-docker")
        if b'domain_metadata' not in self.conn.tables():
            self.conn.create_table('domain_metadata', {
                'm': {'max_versions': 1, 'block_cache_enabled': 1},
            })
        t = self.conn.table('domain_metadata')
        t.delete('d1')
        t.delete('d2')
        t.delete('d3')
        t.delete('d4')

    def test_domain_cache_both_generations(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        # eviction should happen
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        assert dc['d1'] == {'domain': 1}
        assert dc['d2'] == {'domain': 2}
        assert dc['d3'] == {'domain': [3, 2, 1]}
        assert dc['d4'] == {'domain': 4}

    def test_domain_cache_get_with_default(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        assert dc.get('d1', {}) == {'domain': 1}
        assert dc.get('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        assert dc.setdefault('d1', {}) == {'domain': 1}
        assert dc.setdefault('d5', {'domain': 6}) == {'domain': 6}
        dc.flush()
        assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault_with_second_gen_flush(self):
        dc = DomainCache(2, self.conn, 'domain_metadata', batch_size=3)
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        dc.setdefault('d1', {})['domain'] += 1
        assert dc.setdefault('d1', {}) == {'domain': 2}

    def test_empty_key(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            dc[''] = {'test': 1}

    def test_deletion(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            del dc['d1']
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        del dc['d1']  # second gen
        del dc['d3']  # first gen
        dc.flush()
        del dc['d4']  # hbase

    def test_contains(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        assert 'd1' in dc  # second gen
        assert 'd3' in dc  # first gen
        dc.flush()
        assert 'd4' in dc

    def test_pop(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}
        assert dc.pop('d1') == {'domain': 1}
        assert 'd1' not in dc
        assert dc.pop('d3') == {'domain': [3, 2, 1]}
        assert 'd3' not in dc
        dc.flush()
        assert dc.pop('d4') == {'domain': 4}
        assert 'd4' not in dc

class HBaseCache(BaseCache):
    def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None,
                 default_timeout=300, **kwargs):
        super(HBaseCache, self).__init__(default_timeout)
        if not table_name:
            raise TypeError('table_name is a required argument')
        self.table_name = table_name
        # Bug fix: was `table_prefix=table_prefix`, which is undefined;
        # the constructor argument is `prefix`.
        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)
        self.clear()

    def _put(self, key, value):
        return key, {'cf:value': value}

    def _extract(self, value):
        if value:
            return value.get('cf:value')
        return value

    def add(self, key, value, timeout=None):
        table = self._table
        try:
            # table.row() returns an empty dict for non-existing keys.
            if not table.row(key):
                table.put(*self._put(key, value))
            else:
                return False
        except Exception:
            return False
        return True

    def clear(self):
        self._c.delete_table(self.table_name, disable=True)
        self._c.create_table(self.table_name, {'cf': dict()})
        return super(HBaseCache, self).clear()

    def dec(self, key, delta=1):
        # TO-DO: in principle this should be guarded by exception handling.
        return self.inc(key, -delta)

    def delete(self, key):
        try:
            self._table.delete(key)
        except Exception:
            return False
        return True

    def delete_many(self, *keys):
        batch = self._table.batch()
        try:
            for k in keys:
                batch.delete(k)
            batch.send()
        except Exception:
            return False
        return True

    def get(self, key):
        value = self._table.row(key)
        return self._extract(value)

    def get_dict(self, *keys):
        # Bug fix: rows() returns (key, data) pairs and omits missing keys,
        # so default those to None instead of mis-zipping keys and values.
        rows = dict(self._table.rows(keys))
        return {k: self._extract(rows.get(k)) for k in keys}

    def get_many(self, *keys):
        d = self.get_dict(*keys)
        return [d[k] for k in keys]

    def has(self, key):
        return super(HBaseCache, self).has(key)

    def inc(self, key, delta=1):
        return self._table.counter_inc(key, 'cf:value', delta)

    def set(self, key, value, timeout=None):
        table = self._table
        try:
            # TO-DO: does delete() raise if the row doesn't exist? Otherwise
            # we need a table.row() check before it.
            table.delete(key)
            table.put(*self._put(key, value))
        except Exception:
            return False
        return True

    def set_many(self, mapping, timeout=None):
        batch = self._table.batch()
        for key, value in _items(mapping):
            batch.put(*self._put(key, value))
        try:
            batch.send()
        except Exception:
            return False
        return True

def test_timeout_arg():
    Connection(timeout=5000, autoconnect=False)

class HBaseBackend(Backend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        self.connection = Connection(host=host, port=int(port),
                                     table_prefix=namespace,
                                     table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection, self.queue_partitions,
                                self.manager.logger.backend, drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)
        if self._table_name not in tables:
            self.connection.create_table(self._table_name, {
                'm': {'max_versions': 5},  # 'compression': 'SNAPPY'
                's': {'max_versions': 1, 'block_cache_enabled': 1,
                      'bloom_filter_type': 'ROW', 'in_memory': True},
                'c': {'max_versions': 1},
            })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.connection.close()
        self.flush()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(url=url, depth=0,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=domain['fingerprint'])
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = prepare_hbase_object(status_code=response.status_code,
                                   content=response.body)
        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = \
                self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
        self.batch.put(unhexlify(fingerprint), obj)
        # .items() instead of the Python 2-only .iteritems()
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(url=link_url,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=link_domain['fingerprint'])
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(url=request.url,
                                   created_at=utcnow_timestamp(),
                                   error=error,
                                   domain_fingerprint=domain['fingerprint'])
        rk = unhexlify(request.meta['fingerprint'])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests,
                                     min_hosts=24, max_requests_per_host=128)
            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta['fingerprint'] = fingerprint
                r.meta['score'] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError('batch should be dict with fingerprint as key, and float score as value')
        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error(
                        "Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)

class HBaseBackend(Backend): component_name = "HBase Backend" def __init__(self, manager): self.manager = manager settings = manager.settings port = settings.get("HBASE_THRIFT_PORT") hosts = settings.get("HBASE_THRIFT_HOST") namespace = settings.get("HBASE_NAMESPACE") drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES") self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS") self._table_name = settings.get("HBASE_METADATA_TABLE") host = choice(hosts) if type(hosts) in [list, tuple] else hosts kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"} if settings.get("HBASE_USE_COMPACT_PROTOCOL"): kwargs.update({"protocol": "compact", "transport": "framed"}) self.connection = Connection(**kwargs) self.queue = HBaseQueue( self.connection, self.queue_partitions, self.manager.logger.backend, settings.get("HBASE_QUEUE_TABLE"), drop=drop_all_tables, ) self.state_checker = HBaseState( self.connection, self._table_name, self.manager.logger.backend, settings.get("HBASE_STATE_CACHE_SIZE_LIMIT") ) tables = set(self.connection.tables()) if drop_all_tables and self._table_name in tables: self.connection.delete_table(self._table_name, disable=True) tables.remove(self._table_name) if self._table_name not in tables: schema = { "m": {"max_versions": 1}, "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True}, "c": {"max_versions": 1}, } if settings.get("HBASE_USE_SNAPPY"): schema["m"]["compression"] = "SNAPPY" schema["c"]["compression"] = "SNAPPY" self.connection.create_table(self._table_name, schema) table = self.connection.table(self._table_name) self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE")) self.store_content = settings.get("HBASE_STORE_CONTENT") @classmethod def from_manager(cls, manager): return cls(manager) def frontier_start(self): pass def frontier_stop(self): self.connection.close() self.flush() def add_seeds(self, seeds): for seed in seeds: url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed) obj = prepare_hbase_object( url=url, depth=0, created_at=utcnow_timestamp(), domain_fingerprint=domain["fingerprint"] ) self.batch.put(unhexlify(fingerprint), obj) def page_crawled(self, response, links): url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response) obj = ( prepare_hbase_object(status_code=response.status_code, content=response.body) if self.store_content else prepare_hbase_object(status_code=response.status_code) ) links_dict = dict() for link in links: link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link) links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain) self.batch.put(unhexlify(fingerprint), obj) for link_fingerprint, (link, link_url, link_domain) in links_dict.iteritems(): obj = prepare_hbase_object( url=link_url, created_at=utcnow_timestamp(), domain_fingerprint=link_domain["fingerprint"] ) self.batch.put(link_fingerprint, obj) def request_error(self, request, error): url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request) obj = prepare_hbase_object( url=request.url, created_at=utcnow_timestamp(), error=error, domain_fingerprint=domain["fingerprint"] ) rk = unhexlify(request.meta["fingerprint"]) self.batch.put(rk, obj) def get_next_requests(self, max_next_requests, **kwargs): next_pages = [] log = self.manager.logger.backend log.debug("Querying queue table.") partitions = set(kwargs.pop("partitions", [])) for partition_id in range(0, 
self.queue_partitions): if partition_id not in partitions: continue results = self.queue.get(partition_id, max_next_requests, min_hosts=24, max_requests_per_host=128) log.debug("Got %d items for partition id %d" % (len(results), partition_id)) for fingerprint, url, score in results: r = self.manager.request_model(url=url) r.meta["fingerprint"] = fingerprint r.meta["score"] = score next_pages.append(r) return next_pages def update_score(self, batch): if not isinstance(batch, dict): raise TypeError("batch should be dict with fingerprint as key, and float score as value") to_schedule = [] for fprint, (score, url, schedule) in batch.iteritems(): obj = prepare_hbase_object(score=score) rk = unhexlify(fprint) self.batch.put(rk, obj) if schedule: _, hostname, _, _, _, _ = parse_domain_from_url_fast(url) if not hostname: self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint)) continue to_schedule.append((score, fprint, {"name": hostname}, url)) self.queue.schedule(to_schedule) def flush(self): self.batch.send() def update_states(self, objs, persist): self.state_checker.update(objs, persist) def flush_states(self, is_clear=True): self.state_checker.flush(is_clear) def fetch_states(self, fingerprints): self.state_checker.fetch(fingerprints)
def describe_proximity(window_seconds=5):
    """
    Poll the DB to get the minimum proximity, average proximity and the
    variance in the mean (to get a feel for the range sampled).

    Args:
        window_seconds (int): Window range for averaging (in seconds)
    """
    dt = timedelta(seconds=window_seconds)
    now_ = datetime.now()
    start = (now_ - dt).strftime(dtfmt)
    stop = now_.strftime(dtfmt)
    conn = Connection(config['hbase'], port=int(config['thrift']))
    tab = conn.table(str.encode(config['prox_table']))
    avg_ = []
    min_ = 0
    for pk in pks:
        dct = {k: v for k, v in tab.scan(row_start=pk + start, row_stop=pk + stop)}
        if len(dct) > 0:
            df = pd.DataFrame.from_dict(dct, orient="index").reset_index()
            df[b'spatial:dr'] = df[b'spatial:dr'].astype(float)
            avg_.append(df[b'spatial:dr'].mean())
            min_ += df[df[b'spatial:dr'] < 10].shape[0]
    time.append(str(now_))
    miny.append(min_)
    try:
        avgy.append(sum(avg_) / len(avg_))
    except Exception:
        avgy.append(np.nan)
    avgline = Scatter(x=list(time), y=list(avgy), type='scatter',
                      mode='lines', name='Mean')
    minline = Scatter(x=list(time), y=list(miny), type='scatter',
                      mode='lines', name='< 10', yaxis="y2")
    layout = {
        'height': 620,
        'yaxis': {'title': "Average Proximity (m)", 'side': "left"},
        'yaxis2': {'title': 'Within 10 Meters (count)', 'side': "right",
                   'overlaying': "y"},
    }
    return Figure(data=[avgline, minline], layout=layout)

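# Hedged usage sketch: render the Figure produced by describe_proximity as a
# standalone HTML page via plotly's offline mode; the 10-second window and
# output filename are arbitrary choices for illustration.
from plotly.offline import plot

fig = describe_proximity(window_seconds=10)
plot(fig, filename='proximity.html', auto_open=False)
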
class HBaseCache(BaseCache): def __init__(self, host="127.0.0.1", port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs): # Potential bug: table_prefix instead of prefix BaseCache.__init__(self, default_timeout) if not table_name: raise TypeError("table_name is a required argument") self.table_name = table_name self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs) self._table = self._c.table(table_name) # Note: initialisation overwrites the existing rows of the Hbase table self.clear() def _put(self, key, value): return key, {"cf:value": value} def _extract(self, value): if value: return value.get("cf:value") else: return value def add(self, key, value, timeout=None): # Note: timeout is not used in this method, but should be print "Adding stuff" table = self._table print table try: if not table.row( key ): # TO-DO: what does table.row returns for non existing keys? # Returns empty dict >> check for it and return None table.put(*self._put(key, value)) else: return False except: return False return True def clear(self): print "Clearing stuff" try: self._c.delete_table(self.table_name, disable=True) except: pass self._c.create_table(self.table_name, {"cf": dict()}) return super(HBaseCache, self).clear() def dec(self, key, delta=1): return self.inc(key, -delta) # table = self._table # new_value = table.counter_inc(key, 'cf:value', -delta) # value = table.row(key) # new_value = (self._extract(value) or 0) - delta # table.put(*self._put(key, new_value)) # TO-DO the above should in principle be guarded by some exception handling # return new_value def delete(self, key): try: self._table.delete(key) except: return False return True def delete_many(self, *keys): batch = self._table.batch() try: for k in keys: batch.delete(k) batch.send() except: return False return True def get(self, key): value = self._table.row(key) return self._extract(value) or None def get_dict(self, *keys): table = self._table rows = table.rows(keys) if not rows: return {k: None for k in keys} return {k: self._extract(v) for k, v in rows} def get_many(self, *keys): table = self._table rows = table.rows(keys) if not rows: return [None for _ in keys] return map(self._extract, map(itemgetter(1), rows)) def has(self, key): return super(HBaseCache, self).has(key) def inc(self, key, delta=1): table = self._table new_value = table.counter_inc(key, "cf:value", delta) return new_value def set(self, key, value, timeout=None): table = self._table print "Setting stuff" print table try: table.delete( key ) # TO-DO Does this return an exception if it doesn't exist? Otherwise we need to put a table.row before that table.put(*self._put(key, value)) except: return False return True def set_many(self, mapping, timeout=None): print "Set many" batch = self._table.batch() for key, value in _items(mapping): batch.put(*self._put(key, value)) try: batch.send() except: return False return True
class HBaseCache(BaseCache):
    def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None,
                 default_timeout=300, **kwargs):
        super(HBaseCache, self).__init__(default_timeout)
        if not table_name:
            raise TypeError('table_name is a required argument')
        self.table_name = table_name
        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)
        # Note: initialisation overwrites the existing rows of the Hbase table
        self.clear()

    def _put(self, key, value, timeout):
        # Store the expiry time alongside the value so reads can filter out
        # stale entries.
        timestamp = (datetime.now()
                     + timedelta(0, timeout or self.default_timeout)).isoformat()
        return key, {'cf:value': value, 'cf:timestamp': timestamp}

    def _extract(self, value):
        if value:
            v = value.get('cf:value')
            ts = from_iso(value.get('cf:timestamp'))
            if ts > datetime.now():
                return v
        return None

    def add(self, key, value, timeout=None):
        table = self._table
        try:
            if not table.row(key):
                table.put(*self._put(key, value, timeout))
            else:
                return False
        except Exception:
            return False
        return True

    def clear(self):
        try:
            self._c.delete_table(self.table_name, disable=True)
        except Exception:
            pass
        self._c.create_table(self.table_name, {'cf': dict()})
        return super(HBaseCache, self).clear()

    def dec(self, key, delta=1):
        return self.inc(key, -delta)

    def delete(self, key):
        # delete in happybase just uses batch(); bug fix: pass the key
        # through as a positional argument, not wrapped in a list.
        return self.delete_many(key)

    def delete_many(self, *keys):
        with self._table.batch() as batch:  # TO-DO: exceptions here?
            for k in keys:
                batch.delete(k)  # bug fix: the key was not being passed
        return True

    def get(self, key):
        value = self._table.row(key)
        return self._extract(value) or None

    def get_dict(self, *keys):
        keys = keys[0]
        table = self._table
        results = dict(table.rows(keys))
        # Non-existing keys are not returned by table.rows()
        return {k: self._extract(results.get(k, None)) for k in keys}

    def get_many(self, *keys):
        result = self.get_dict(*keys)
        return [result[k] for k in keys[0]]

    def has(self, key):
        return super(HBaseCache, self).has(key)

    def inc(self, key, delta=1):
        return self._table.counter_inc(key, 'cf:value', delta)

    # TO-DO: rewrite this to use set_many. Check if delete is necessary, etc.
    def set(self, key, value, timeout=None):
        # set in happybase just uses batch
        table = self._table
        try:
            table.delete(key)
            table.put(*self._put(key, value, timeout))
        except Exception:
            return False
        return True

    def set_many(self, mapping, timeout=None):
        batch = self._table.batch()
        for key, value in _items(mapping):
            batch.put(*self._put(key, value, timeout))
        try:
            batch.send()
        except Exception:
            return False
        return True

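# Hedged usage sketch for the timestamp-aware HBaseCache above; the host,
# port and table name are assumptions. Note that constructing the cache
# calls clear(), which drops and recreates the backing table.
cache = HBaseCache(host='localhost', port=9090, table_name='flask_cache')
cache.set('greeting', 'hello', timeout=60)  # readable for the next 60 seconds
print(cache.get('greeting'))  # 'hello' until the timestamp passes, then None
cache.delete('greeting')
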
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000,
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed',
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection,
                               settings.get('HBASE_STATES_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'),
                               settings.get('HBASE_DROP_ALL_TABLES'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'),
                              drop=drop_all_tables,
                              use_snappy=settings.get('HBASE_USE_SNAPPY'))
        o._metadata = HBaseMetadata(o.connection,
                                    settings.get('HBASE_METADATA_TABLE'),
                                    drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'),
                                    settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(
                max_next_requests, partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d",
                              len(results), partition_id)
        return next_pages

def test_connection_compat():
    with assert_raises(ValueError):
        Connection(compat='0.1.invalid.version')

def __init__(self):
    self.conn = Connection()

# Module-level singleton so the Thrift connection is opened only once.
hbase_connection = None


def get_hbase_connection():
    global hbase_connection
    if hbase_connection is None:
        hbase_connection = Connection(host='hbase-docker', port=9090)
    return hbase_connection

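# Usage sketch for the module-level connection cache above: repeated calls
# return the same Connection object, so the Thrift socket is opened once
# per process.
conn_a = get_hbase_connection()
conn_b = get_hbase_connection()
assert conn_a is conn_b
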
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed',
            })
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection,
                               settings.get('HBASE_METADATA_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'),
                              drop=drop_all_tables)
        o._metadata = HBaseMetadata(o.connection,
                                    settings.get('HBASE_METADATA_TABLE'),
                                    drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'),
                                    settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(
                max_next_requests, partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d",
                              len(results), partition_id)
        return next_pages