def test_shards_insufficient_data(): """THIS IS SUPER IMPORTANT FOR TESTING SHARDS IF THIS FAILED, DONT IGNORE IT, DEBUG IT """ index_docs = 3 parallel = 4 def validate(req): assert len(req.docs) == 1 assert len(req.docs[0].matches) == index_docs for d in req.docs[0].matches: assert hasattr(d, 'weight') assert d.weight assert d.meta_info == b'hello world' f = Flow().add(name='doc_pb', uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'), parallel=parallel, separated_workspace=True) with f: f.index(input_fn=random_docs(index_docs), override_doc_id=False) time.sleep(2) with f: pass time.sleep(2) f = Flow().add(name='doc_pb', uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'), parallel=parallel, separated_workspace=True, polling='all', uses_after='_merge_all') with f: f.search(input_fn=random_queries(1, index_docs), override_doc_id=False, callback_on='body') time.sleep(2) rm_files(['test-docshard-tmp'])
def test_load_flow_from_yaml():
    with open(os.path.join(cur_dir, '../yaml/test-flow.yml')) as fp:
        a = Flow.load_config(fp)
        with open(os.path.join(cur_dir, '../yaml/swarm-out.yml'), 'w') as fp, a:
            a.to_swarm_yaml(fp)
    rm_files([os.path.join(cur_dir, '../yaml/swarm-out.yml')])

def test_flow_yaml_dump():
    f = Flow(optimize_level=FlowOptimizeLevel.IGNORE_GATEWAY,
             no_gateway=True)
    f.save_config('test1.yml')

    fl = Flow.load_config('test1.yml')
    assert f.args.optimize_level == fl.args.optimize_level
    rm_files(['test1.yml'])

def test_shards():
    f = Flow().add(name='doc_pb',
                   uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
                   parallel=3)
    with f:
        f.index(input_fn=random_docs(1000), random_doc_id=False)
    with f:
        pass
    rm_files(['test-docshard-tmp'])

def validate(ids, expect):
    for j in ids:
        fname = f'tmp{j}.txt'
        assert os.path.exists(fname) == expect
        if expect:
            with open(fname) as fp:
                assert fp.read() != ''
            rm_files([fname])

def test_load_flow_from_yaml():
    with open(cur_dir.parent / 'yaml' / 'test-flow.yml') as fp:
        a = Flow.load_config(fp)

    with a:
        with open(str(cur_dir.parent / 'yaml' / 'swarm-out.yml'), 'w') as fp:
            a.to_swarm_yaml(fp)

    rm_files([str(cur_dir.parent / 'yaml' / 'swarm-out.yml')])

def test_compound_from_yaml():
    a = BaseExecutor.load_config(str(cur_dir / 'yaml/npvec.yml'))
    assert isinstance(a, CompoundExecutor)
    assert callable(getattr(a, 'add'))
    assert callable(getattr(a, 'query'))
    assert callable(getattr(a, 'meta_add'))
    assert callable(getattr(a, 'meta_query'))
    rm_files([c.index_abspath for c in a.components])
    rm_files(['test-workspace'])

def test_flow_with_jump():
    def _validate(f):
        node = f._pod_nodes['gateway']
        assert node.head_args.socket_in == SocketType.PULL_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r1']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUB_BIND

        node = f._pod_nodes['r2']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r3']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r4']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r5']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r6']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r8']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r9']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['r10']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_BIND

        for name, node in f._pod_nodes.items():
            assert node.peas_args['peas'][0] == node.head_args
            assert node.peas_args['peas'][0] == node.tail_args

    f = (Flow().add(name='r1')
         .add(name='r2')
         .add(name='r3', needs='r1')
         .add(name='r4', needs='r2')
         .add(name='r5', needs='r3')
         .add(name='r6', needs='r4')
         .add(name='r8', needs='r6')
         .add(name='r9', needs='r5')
         .add(name='r10', needs=['r9', 'r8']))

    with f:
        _validate(f)

    f.save_config('tmp.yml')
    Flow.load_config('tmp.yml')

    with Flow.load_config('tmp.yml') as f:
        _validate(f)

    rm_files(['tmp.yml'])

def test_compositional_dump():
    a = CompoundExecutor()
    a.components = lambda: [BaseExecutor(), BaseExecutor()]
    assert a.name
    a.touch()
    a.save()
    a.save_config()
    assert Path(a.save_abspath).exists()
    assert Path(a.config_abspath).exists()
    rm_files([a.save_abspath, a.config_abspath])

def test_compositional_dump(test_metas):
    a = CompoundExecutor(metas=test_metas)
    a.components = lambda: [BaseExecutor(), BaseExecutor()]
    assert a.name
    a.touch()
    a.save()
    a.save_config()
    assert os.path.exists(a.save_abspath)
    assert os.path.exists(a.config_abspath)
    rm_files([a.save_abspath, a.config_abspath])

def test_flow_yaml_dump():
    f = Flow(logserver_config=os.path.join(cur_dir, '../yaml/test-server-config.yml'),
             optimize_level=FlowOptimizeLevel.IGNORE_GATEWAY,
             no_gateway=True)
    f.save_config('test1.yml')

    fl = Flow.load_config('test1.yml')
    assert f.args.logserver_config == fl.args.logserver_config
    assert f.args.optimize_level == fl.args.optimize_level
    rm_files(['test1.yml'])

def test_shards():
    f = Flow().add(name='doc_pb',
                   uses=str(cur_dir.parent / 'yaml' / 'test-docpb.yml'),
                   parallel=3,
                   separated_workspace=True)
    with f:
        f.index(input_fn=random_docs(1000), random_doc_id=False)
    with f:
        pass
    rm_files(['test-docshard-tmp'])

def test_index_text_files():
    def validate(req):
        for d in req.docs:
            assert d.text

    f = (Flow(read_only=True)
         .add(uses=os.path.join(cur_dir, '../yaml/datauriindex.yml'),
              timeout_ready=-1))
    with f:
        f.index_files('*.py', output_fn=validate, callback_on='body')

    rm_files(['doc.gzip'])

def test_shards(restful):
    f = (Flow(restful=restful)
         .add(name='doc_pb',
              uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
              parallel=3,
              separated_workspace=True))
    with f:
        f.index(input_fn=random_docs(1000), random_doc_id=False)
    with f:
        pass
    rm_files(['test-docshard-tmp'])

def test_transform_encoder_train_and_encode():
    train_data = np.random.rand(2000, input_dim)
    encoder = TransformEncoder(output_dim=target_output_dim)
    from sklearn.random_projection import GaussianRandomProjection
    encoder.model = GaussianRandomProjection(n_components=target_output_dim)
    encoder.train(train_data)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    rm_files([encoder.save_abspath, encoder.config_abspath])

def test_transform_encoder_load_from_pickle():
    train_data = np.random.rand(2000, input_dim)
    filename = 'transformer_model.model'
    from sklearn.random_projection import GaussianRandomProjection
    model = GaussianRandomProjection(n_components=target_output_dim)
    with open(filename, 'wb') as fp:
        pickle.dump(model.fit(train_data), fp)

    encoder = TransformEncoder(model_path=filename)
    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(encoded_data, transformed_data)
    rm_files([encoder.config_abspath, filename, encoder.save_abspath])

def test_flow_identical():
    with open(os.path.join(cur_dir, '../yaml/test-flow.yml')) as fp:
        a = Flow.load_config(fp)

    b = (Flow()
         .add(name='chunk_seg', parallel=3)
         .add(name='wqncode1', parallel=2)
         .add(name='encode2', parallel=2, needs='chunk_seg')
         .join(['wqncode1', 'encode2']))

    a.save_config('test2.yml')

    c = Flow.load_config('test2.yml')

    assert a == b
    assert a == c

    with a as f:
        node = f._pod_nodes['gateway']
        assert node.head_args.socket_in == SocketType.PULL_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['chunk_seg']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUB_BIND

        node = f._pod_nodes['wqncode1']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['encode2']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

    rm_files(['test2.yml'])

def test_index_text_files(mocker, restful):
    def validate(req):
        assert len(req.docs) > 0
        for d in req.docs:
            assert d.text

    response_mock = mocker.Mock(wraps=validate)  # `wraps` so the validator runs on each response

    f = (Flow(restful=restful, read_only=True)
         .add(uses=os.path.join(cur_dir, '../yaml/datauriindex.yml'),
              timeout_ready=-1))
    with f:
        f.index_files('*.py', on_done=response_mock, callback_on='body')

    rm_files(['doc.gzip'])
    response_mock.assert_called()

def test_shards_insufficient_data(mocker, restful):
    """THIS IS SUPER IMPORTANT FOR TESTING SHARDS

    IF THIS FAILS, DON'T IGNORE IT, DEBUG IT
    """
    index_docs = 3
    parallel = 4

    mock = mocker.Mock()

    def validate(req):
        mock()
        assert len(req.docs) == 1
        assert len(req.docs[0].matches) == index_docs
        for d in req.docs[0].matches:
            assert hasattr(d, 'weight')
            assert d.weight

    f = (Flow(restful=restful)
         .add(name='doc_pb',
              uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
              parallel=parallel,
              separated_workspace=True))
    with f:
        f.index(input_fn=random_docs(index_docs))
    time.sleep(2)
    with f:
        pass
    time.sleep(2)

    f = (Flow(restful=restful)
         .add(name='doc_pb',
              uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
              parallel=parallel,
              separated_workspace=True,
              polling='all',
              uses_after='_merge_chunks'))
    with f:
        f.search(input_fn=random_queries(1, index_docs),
                 callback_on='body',
                 on_done=validate)
    time.sleep(2)
    rm_files(['test-docshard-tmp'])
    mock.assert_called_once()

def test_cache_driver_twice():
    docs = list(random_docs(10))
    driver = MockCacheDriver()
    with DocIDCache(filename) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, pea=None)
        driver._traverse_apply(docs)

        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._traverse_apply(docs)

        # new docs
        docs = list(random_docs(10))
        driver._traverse_apply(docs)

    # check persistence
    assert os.path.exists(filename)
    rm_files([filename])

def test_compositional_route(monkeypatch):
    monkeypatch.setattr(BaseExecutor, 'exec_methods', ['say'])
    da = DummyA()
    db = DummyB()
    a = CompoundExecutor()

    a.components = lambda: [da, db]
    assert a.say_all() == ['a', 'b']
    with pytest.raises(AttributeError):
        a.say()

    b = CompoundExecutor({'say': {da.name: 'say'}})
    b.components = lambda: [da, db]
    assert b.say_all() == ['a', 'b']
    assert b.say() == 'a'

    b.add_route('say', db.name, 'say')
    assert b.say() == 'b'

    b.save_config()
    assert Path(b.config_abspath).exists()

    c = BaseExecutor.load_config(b.config_abspath)
    assert c.say_all() == ['a', 'b']
    assert c.say() == 'a'

    b.add_route('say', db.name, 'say', is_stored=True)
    b.save_config()
    c = BaseExecutor.load_config(b.config_abspath)
    assert c.say_all() == ['a', 'b']
    assert c.say() == 'b'

    b.touch()
    b.save()
    assert Path(b.save_abspath).exists()

    d = BaseExecutor.load(b.save_abspath)
    assert d.say_all() == ['a', 'b']
    assert d.say() == 'b'
    rm_files([b.save_abspath, b.config_abspath])

def test_cache_driver_from_file():
    docs = list(random_docs(10))
    with open(filename, 'wb') as fp:
        fp.write(np.array([uid.id2hash(d.id) for d in docs], dtype=np.int64).tobytes())

    driver = MockCacheDriver()
    with DocIDCache(filename) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, pea=None)

        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._traverse_apply(docs)

        # new docs
        docs = list(random_docs(10))
        driver._traverse_apply(docs)

    # check persistence
    assert os.path.exists(filename)
    rm_files([filename])

def test_standard_query():
    mem1 = used_memory(1)
    print(used_memory_readable())
    with NumpyIndexer.load('a.bin') as ni:
        ni.batch_size = 256
        print(used_memory_readable())
        print(ni.raw_ndarray.shape)
        print(used_memory_readable())
        with TimeContext('query topk') as ti:
            result = ni.query(queries, top_k=10)
        mem2 = used_memory(1)
        print(used_memory_readable())
        print(result[0].shape)

        with open(summary_file, 'a') as fp:
            json.dump({'name': 'naive',
                       'memory': mem2 - mem1,
                       'readable': get_readable_size(mem2 - mem1),
                       'time': ti.duration}, fp)
            fp.write('\n')

    rm_files([ni.index_abspath, ni.save_abspath, 'a.bin', 'a.gz'])

@pytest.fixture(autouse=True)
def run_around_tests():
    yield
    # teardown: remove index artifacts produced by the tests above
    rm_files(['vec1.gz', 'vec2.gz', 'chunk1.gz', 'chunk2.gz',
              'vecidx1.bin', 'vecidx2.bin', 'kvidx1.bin', 'kvidx2.bin'])

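# Every test above cleans up with `rm_files(...)` from the shared test helpers.
# A minimal sketch under the assumption that it simply removes files and
# directories, ignoring paths that do not exist (hypothetical; the real helper
# may differ):
import os
import shutil


def _rm_files_sketch(paths):
    for path in paths:
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)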