def test_queue(self):
    """Schedule three requests across two partitions and read them back.

    Partition 0 should yield only r3's URL; partition 1 should yield
    r1's and r2's URLs (partitioning is by host fingerprint).
    """
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 2, b'queue', True)
    queue.schedule([
        ('10', 0.5, r1, True),
        ('11', 0.6, r2, True),
        ('12', 0.7, r3, True),
    ])
    partition0 = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                         max_requests_per_host=10)
    assert {req.url for req in partition0} == {r3.url}
    partition1 = queue.get_next_requests(10, 1, min_requests=3, min_hosts=1,
                                         max_requests_per_host=10)
    assert {req.url for req in partition1} == {r1.url, r2.url}
def test_queue_with_delay(self):
    """A request whose ``crawl_at`` lies in the future is withheld until due.

    Uses a real ``sleep`` to cross the scheduled time.
    """
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', True)
    delayed = r3.copy()
    delayed.meta[b'crawl_at'] = int(time()) + 1
    queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])
    # Not yet due — the queue must return nothing.
    not_due = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                      max_requests_per_host=10)
    assert not_due == []
    # NOTE(review): real sleep makes this test timing-sensitive on slow hosts.
    sleep(1.5)
    due = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                  max_requests_per_host=10)
    assert {req.url for req in due} == {delayed.url}
def test_queue_with_delay(self):
    """A request whose ``crawl_at`` lies in the future is withheld until due.

    Patches the backend's clock instead of sleeping, so the test is
    deterministic: first poll happens "now", second poll happens just
    after the scheduled time.
    """
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True)
    delayed = r3.copy()
    crawl_at = int(time()) + 1000
    delayed.meta[b'crawl_at'] = crawl_at
    queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])
    with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
        # Before the scheduled time: nothing is released.
        mocked_time.return_value = time()
        early = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                        max_requests_per_host=10)
        assert early == []
        # Past the scheduled time: the delayed request becomes available.
        mocked_time.return_value = crawl_at + 1
        ready = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                        max_requests_per_host=10)
        assert {req.url for req in ready} == {delayed.url}
def test_drop_all_tables_when_table_name_is_str(self):
    """Backends must accept ``str`` table names and drop existing tables.

    Pre-creates the three tables, then constructs each backend with
    drop enabled; if any backend fails to drop its table, HBase raises
    ``AlreadyExists`` on its internal re-create and the test fails.
    """
    connection = Connection(host='hbase-docker', port=9090)
    # Start from a clean HBase instance.
    for table in connection.tables():
        connection.delete_table(table, True)
    hbase_queue_table = 'queue'
    hbase_metadata_table = 'metadata'
    hbase_states_table = 'states'
    connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}})
    connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}})
    connection.create_table(hbase_states_table, {'f': {'max_versions': 1}})
    # Sanity check of the fixture itself, not of the code under test.
    assert set(connection.tables()) == {b'metadata', b'queue', b'states'}
    try:
        HBaseQueue(connection=connection, partitions=1,
                   table_name=hbase_queue_table, use_snappy=False, drop=True)
        HBaseMetadata(connection=connection, table_name=hbase_metadata_table,
                      drop_all_tables=True, use_snappy=False,
                      batch_size=300000, store_content=True)
        HBaseState(connection, hbase_states_table, cache_size_limit=100,
                   write_log_size=10, drop_all_tables=True)
    except AlreadyExists:
        # Use self.fail instead of ``assert False``: plain asserts are
        # stripped under ``python -O``, which would silently pass this test.
        self.fail("failed to drop hbase tables")
def test_queue_with_post_request(self):
    """POST method and body survive a schedule/get_next_requests round trip."""
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', drop=True, use_snappy=False)
    queue.schedule([('10', 0.5, r1, True)])
    fetched = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                      max_requests_per_host=10)
    first = fetched[0]
    self.assertEqual(b'POST', first.method)
    self.assertEqual(data, first.body)
def test_queue_with_delay(self):
    """Delayed requests stay hidden until the (mocked) clock passes crawl_at."""
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True)
    future_request = r3.copy()
    due_time = int(time()) + 1000
    future_request.meta[b'crawl_at'] = due_time
    queue.schedule([(future_request.meta[b'fingerprint'], 0.5,
                     future_request, True)])
    with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
        mocked_time.return_value = time()
        # Scheduled time not reached yet — queue appears empty.
        assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                       max_requests_per_host=10) == []
        mocked_time.return_value = due_time + 1
        # One tick past crawl_at — the request is released.
        released = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                           max_requests_per_host=10)
        assert {req.url for req in released} == {future_request.url}
def queue(request):
    """Parametrized fixture yielding a queue backend, with teardown after use.

    Supported params: ``memory``, ``sqlalchemy`` (in-memory SQLite), ``hbase``.
    Any other value raises ``KeyError``.
    """
    backend = request.param
    if backend == "memory":
        yield MemoryQueue(2)
    elif backend == "sqlalchemy":
        engine = create_engine('sqlite:///:memory:', echo=False)
        session_cls = sessionmaker()
        session_cls.configure(bind=engine)
        QueueModel.__table__.create(bind=engine)
        sqla_queue = SQLAlchemyQueue(session_cls, QueueModel, 2)
        yield sqla_queue
        # Teardown: stop the queue and release the engine.
        sqla_queue.frontier_stop()
        engine.dispose()
    elif backend == "hbase":
        conn = get_hbase_connection()
        hq = HBaseQueue(conn, 2, b'queue')
        yield hq
        hq.frontier_stop()
    else:
        raise KeyError("Unknown backend param")
def test_queue_with_delay(self):
    """A request scheduled one second in the future appears only after sleep."""
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', True)
    pending = r3.copy()
    pending.meta[b'crawl_at'] = int(time()) + 1
    queue.schedule([(pending.meta[b'fingerprint'], 0.5, pending, True)])
    # Before crawl_at the queue must hand out nothing.
    assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                   max_requests_per_host=10) == []
    # NOTE(review): wall-clock sleep — timing-sensitive on loaded CI hosts.
    sleep(1.5)
    after = queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                    max_requests_per_host=10)
    assert {req.url for req in after} == {pending.url}