def test_cache_content_driver_same_content(tmpdir):
    """Cache keyed on content hash: a second doc with identical text is
    rejected, and the cache supports update and delete of the stored hash.

    Fix: removed the dead ``filename = None`` store — the ``with`` block
    below always assigns ``filename`` before any use.
    """
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentSet([doc1])

    doc2 = Document(id=2)
    doc2.text = 'blabla'
    doc2.update_content_hash()
    docs2 = DocumentSet([doc2])

    # identical text must yield identical content hashes
    assert doc1.content_hash == doc2.content_hash

    driver = MockBaseCacheDriver()
    with DocIDCache(tmpdir, field=CONTENT_HASH_KEY) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._traverse_apply(docs1)
        # same hash already cached -> mock driver signals the hit
        with pytest.raises(NotImplementedError):
            driver._traverse_apply(docs2)
        assert executor.size == 1
        filename = executor.save_abspath

    # update: change doc1's text so its hash changes, then push the new hash
    old_doc = Document(id=9999)
    old_doc.text = 'blabla'
    old_doc.update_content_hash()
    new_string = 'blabla-new'
    doc1.text = new_string
    doc1.update_content_hash()
    with BaseExecutor.load(filename) as executor:
        executor.update([UniqueId(1)], [doc1.content_hash])
    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is True
        assert executor.query(old_doc.content_hash) is None

    # delete
    with BaseExecutor.load(filename) as executor:
        executor.delete([UniqueId(doc1.id)])
    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is None
def test_save_and_load_config(self):
    """Round-trip the encoder's config through save_config/load_config."""
    encoder = self.get_encoder()
    if encoder is None:
        return
    encoder.save_config()
    self.assertTrue(os.path.exists(encoder.config_abspath))
    loaded = BaseExecutor.load_config(encoder.config_abspath)
    # the reloaded executor must carry the same channel_axis
    self.assertEqual(loaded.channel_axis, encoder.channel_axis)
def test_share_workspace(tmpdir, replica_id):
    """Saving under a replica id produces a replica-specific binary dump."""
    with BaseExecutor.load_config('yaml/test-workspace.yml', True, replica_id) as executor:
        executor.touch()
        dump_path = tmpdir.join(f'{executor.name}-{replica_id}-{executor.name}.bin')
        executor.save(dump_path)
        assert os.path.exists(dump_path)
def get_indexers(tmpdir):
    """Build the same LevelDB indexer two ways: direct constructor vs YAML.

    Returns a ``(from_constructor, from_config)`` pair for comparison tests.
    """
    built_directly = LevelDBIndexer(level='doc',
                                    index_filename=Path(tmpdir) / 'leveldb.db')
    from jina.executors import BaseExecutor
    built_from_yaml = BaseExecutor.load_config(
        str(cur_dir / 'yaml/test-leveldb.yml'))
    return built_directly, built_from_yaml
def test_cache_content_driver_same_content(tmpdir, test_metas):
    """DocCache on CONTENT_HASH_KEY: duplicate content is rejected, and the
    cached hash can be updated and deleted across reloads."""
    doc1 = Document(id='1')
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentSet([doc1])

    doc2 = Document(id='2')
    doc2.text = 'blabla'
    doc2.update_content_hash()
    docs2 = DocumentSet([doc2])

    # same text -> same content hash
    assert doc1.content_hash == doc2.content_hash

    driver = MockBaseCacheDriver()
    with DocCache(tmpdir, metas=test_metas, fields=(CONTENT_HASH_KEY, )) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._apply_all(docs1)
        # the duplicate hash makes the mock driver raise
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs2)
        assert executor.size == 1
        filename = executor.save_abspath

    # update: give doc1 new text (new hash) and write it into the cache
    old_doc = Document(id=9999)
    old_doc.text = 'blabla'
    old_doc.update_content_hash()
    new_string = 'blabla-new'
    doc1.text = new_string
    doc1.update_content_hash()
    with BaseExecutor.load(filename) as executor:
        executor.update(['1'], [doc1.content_hash])
    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is True
        assert executor.query(old_doc.content_hash) is False

    # delete
    with BaseExecutor.load(filename) as executor:
        executor.delete([doc1.id])
    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is False
def test_save_and_load_config():
    """Config written by save_config can be reloaded with the same `dim`."""
    encoder = OneHotTextEncoder(workspace=os.environ['TEST_WORKDIR'])
    encoder.save_config()
    assert os.path.exists(encoder.config_abspath)
    loaded = BaseExecutor.load_config(encoder.config_abspath)
    assert loaded.dim == encoder.dim
    add_tmpfile(loaded.config_abspath, loaded.save_abspath)
def test_pod_new_api_from_kwargs(self):
    """A YAML-configured executor exposes the custom driver, and the Pod starts."""
    a = BaseExecutor.load_config('mwu-encoder/mwu_encoder_driver.yml')
    first_driver = a._drivers['ControlRequest'][0]
    assert type(first_driver).__name__ == 'MyAwesomeDriver'
    with Pod(uses=os.path.join(cur_dir, 'mwu-encoder/mwu_encoder_driver.yml')):
        # will print a cust msg from the driver when terminate
        pass
def test_save_and_load_config(self):
    """Config round-trip for OneHotTextEncoder preserves `dim`."""
    encoder = OneHotTextEncoder(workspace=os.environ['TEST_WORKDIR'])
    encoder.save_config()
    self.assertTrue(os.path.exists(encoder.config_abspath))
    loaded = BaseExecutor.load_config(encoder.config_abspath)
    self.assertEqual(loaded.dim, encoder.dim)
    self.add_tmpfile(loaded.config_abspath, loaded.save_abspath)
def test_pod_new_api_from_kwargs():
    """The YAML config wires in the custom driver; the Pod context works."""
    yaml_path = str(cur_dir / 'mwu-encoder/mwu_encoder_driver.yml')
    a = BaseExecutor.load_config(yaml_path)
    assert type(a._drivers['ControlRequest'][0]).__name__ == 'MyAwesomeDriver'
    with Pod(uses=yaml_path):
        # will print a cust task_name from the driver when terminate
        pass
def test_textpaddlehubencoder_save_and_load(mocker):
    """Binary dump of the encoder reloads with the same model name."""
    encoder = TextPaddlehubEncoder()
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    loaded = BaseExecutor.load(encoder.save_abspath)
    assert loaded.model_name == encoder.model_name
    add_tmpfile(encoder.save_abspath, encoder.config_abspath)
    teardown()
def test_share_workspace(self):
    """Each replica index gets its own workspace directory and binary dump.

    Fix: replaced dated ``%``-formatting with f-strings, matching the
    sibling version of this test that already uses f-strings for the
    identical paths.
    """
    for j in range(3):
        a = BaseExecutor.load_config('yaml/test-workspace.yml', True, j)
        a.touch()
        a.save()
        # workspace layout: <name>-<replica>/<name>.bin
        self.assertTrue(os.path.exists(f'{a.name}-{j}/{a.name}.bin'))
        self.add_tmpfile(f'{a.name}-{j}/{a.name}.bin')
        self.add_tmpfile(f'{a.name}-{j}')
def test_incremental_indexing_sequential_indexers(random_workspace):
    """Re-indexing overlapping batches keeps only unique docs in both indexers."""
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'))
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml')))
    with f:
        # first half, then the full (overlapping) set
        f.index(duplicate_docs[:10])
        f.index(duplicate_docs)
    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs
    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs
def test_joint_indexer(self):
    """Drivers of a joint (compound) executor attach to the right components."""
    compound = BaseExecutor.load_config('yaml/test-joint.yml')
    print(compound[0].name)
    print(type(compound[0]))
    print(compound._drivers['SearchRequest'][0]._executor_name)
    print(compound._drivers['SearchRequest'])
    compound.attach(pea=None)
    # first driver binds to the first component, last driver to the second
    self.assertEqual(compound._drivers['SearchRequest'][0]._exec, compound[0])
    self.assertEqual(compound._drivers['SearchRequest'][-1]._exec, compound[1])
def test_share_workspace(self):
    """Each replica index saves into its own <name>-<replica> directory."""
    for j in range(3):
        a = BaseExecutor.load_config(
            os.path.join(cur_dir, 'yaml/test-workspace.yml'), True, j)
        a.touch()
        a.save()
        dump = f'{a.name}-{j}/{a.name}.bin'
        self.assertTrue(os.path.exists(dump))
        self.add_tmpfile(dump)
        self.add_tmpfile(f'{a.name}-{j}')
def test_compound_from_yaml(self):
    """A compound executor loaded from YAML exposes all routed methods."""
    a = BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/npvec.yml'))
    for c in a.components:
        self.add_tmpfile(c.index_abspath)
    self.assertTrue(isinstance(a, CompoundExecutor))
    # every routed method must be present and callable
    for method_name in ('add', 'query', 'meta_add', 'meta_query'):
        self.assertTrue(callable(getattr(a, method_name)))
def test_joint_indexer():
    """Attaching a joint executor binds its drivers to the right components."""
    compound = BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-joint.yml'))
    print(compound[0].name)
    print(type(compound[0]))
    print(compound._drivers['SearchRequest'][0]._executor_name)
    print(compound._drivers['SearchRequest'])
    compound.attach(pea=None)
    # first driver -> first component; last driver -> second component
    assert compound._drivers['SearchRequest'][0]._exec == compound[0]
    assert compound._drivers['SearchRequest'][-1]._exec == compound[1]
def test_load_external(self):
    """A bad external-executor YAML fails to load; the fixed one succeeds."""
    from jina.executors import BaseExecutor
    with self.assertRaises(ruamel.yaml.constructor.ConstructorError):
        BaseExecutor.load_config(
            os.path.join(cur_dir, 'yaml/dummy_ext_exec.yml'))
    b = BaseExecutor.load_config(
        os.path.join(cur_dir, 'yaml/dummy_ext_exec_sucess.yml'))
    self.assertEqual(b.__class__.__name__, 'DummyExternalIndexer')
def test_shard_workspace(test_workspace, pea_id):
    """A sharded executor (pea_id > 0) dumps into a per-pea subdirectory;
    an unsharded one dumps straight into the workspace. State persists."""
    tmpdir = os.environ['JINA_TEST_WORKSPACE']
    with BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-workspace.yml'),
                                  pea_id=pea_id) as executor:
        executor.index_filename = 'index_filename'
        executor.touch()
    # NOTE(review): dump location check assumes the save happens on context
    # exit — placement reconstructed from the original collapsed line
    if pea_id > 0:
        expected = os.path.join(tmpdir,
                                f'{executor.name}-{executor.pea_id}',
                                f'{executor.name}.bin')
    else:
        expected = os.path.join(tmpdir, f'{executor.name}.bin')
    assert os.path.exists(expected)
    with BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-workspace.yml'),
                                  pea_id=pea_id) as executor:
        assert executor.index_filename == 'index_filename'
def test_share_workspace(tmpdir, pea_id):
    """Explicit save under a pea id writes the expected binary file."""
    with BaseExecutor.load_config('yaml/test-workspace.yml',
                                  separated_workspace=True,
                                  pea_id=pea_id) as executor:
        executor.touch()
        dump_path = Path(tmpdir) / f'{executor.name}-{pea_id}-{executor.name}.bin'
        executor.save(str(dump_path))
        assert dump_path.exists()
def test_compound_from_yaml():
    """Compound executor from YAML exposes all routed methods; clean up after."""
    a = BaseExecutor.load_config(str(cur_dir / 'yaml/npvec.yml'))
    assert isinstance(a, CompoundExecutor)
    for method_name in ('add', 'query', 'meta_add', 'meta_query'):
        assert callable(getattr(a, method_name))
    rm_files([c.index_abspath for c in a.components])
    rm_files(['test-workspace'])
def test_save_and_load(*args, **kwargs):
    """FlairTextEncoder binary dump reloads with the same embeddings config."""
    encoder = FlairTextEncoder(embeddings=('word:glove', ),
                               pooling_strategy='mean')
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    loaded = BaseExecutor.load(encoder.save_abspath)
    assert loaded.embeddings == encoder.embeddings
    rm_files([encoder.config_abspath, encoder.save_abspath])
def test_videopaddlehubencoder_save_and_load(*args, **kwargs):
    """Binary dump of the video encoder reloads with the same model name."""
    encoder = get_encoder()
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    loaded = BaseExecutor.load(encoder.save_abspath)
    assert loaded.model_name == encoder.model_name
    add_tmpfile(encoder.save_abspath, encoder.config_abspath)
    teardown()
def test_load_cust_with_driver(self):
    """Custom driver from YAML is wired in, and the Pod built from parsed
    CLI args starts and stops cleanly."""
    a = BaseExecutor.load_config('mwu-encoder/mwu_encoder_driver.yml')
    self.assertEqual(type(a._drivers['ControlRequest'][0]).__name__,
                     'MyAwesomeDriver')
    p = set_pod_parser().parse_args(
        ['--yaml-path', 'mwu-encoder/mwu_encoder_driver.yml'])
    with Pod(p):
        # will print a cust msg from the driver when terminate
        pass
def test_incremental_indexing_parallel_indexers_with_shards(
        random_workspace, restful):
    """Across two Flow sessions with overlapping batches, the sharded
    indexers together hold exactly the unique docs."""
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    num_shards = 4
    # can't use plain _unique in uses_before because workspace will conflict with other
    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_vec.yml'),
              shards=num_shards,
              name='inc_vec',
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_doc.yml'),
              shards=num_shards,
              name='inc_doc',
              needs=['gateway'],
              separated_workspace=True)
         .add(needs=['inc_vec', 'inc_doc']))
    with f:
        f.index(duplicate_docs[:500])
    with f:
        f.index(duplicate_docs)

    # sum sizes across vector shards
    vect_idx_size = 0
    for shard in range(num_shards):
        dump = random_workspace / f'vec_idx-{shard + 1}' / 'vec_idx.bin'
        with BaseExecutor.load(dump) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    # sum sizes across doc shards
    doc_idx_size = 0
    for shard in range(num_shards):
        dump = random_workspace / f'doc_idx-{shard + 1}' / 'doc_idx.bin'
        with BaseExecutor.load(dump) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs
def test_save_and_load(encoder):
    """A reloaded encoder keeps its model path and encodes identically."""
    test_data = np.random.rand(num_samples, 3, input_dim, input_dim)
    expected = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    loaded = BaseExecutor.load(encoder.save_abspath)
    actual = loaded.encode(test_data)
    assert loaded.raw_model_path == encoder.raw_model_path
    np.testing.assert_array_equal(expected, actual)
def test_incremental_indexing_parallel_indexers_with_shards(tmpdir):
    """Overlapping index batches through sharded _unique indexers end up
    with exactly the unique docs, summed over all shards."""
    os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE'] = str(tmpdir)
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    num_shards = 4
    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              name='inc_vec',
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              name='inc_doc',
              needs=['gateway'],
              separated_workspace=True)
         .add(uses='_merge', needs=['inc_vec', 'inc_doc']))
    with f:
        f.index(duplicate_docs[:500])
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard in range(num_shards):
        dump = os.path.join(tmpdir, f'vec_idx-{shard + 1}', 'vec_idx.bin')
        with BaseExecutor.load(dump) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard in range(num_shards):
        dump = os.path.join(tmpdir, f'doc_idx-{shard + 1}', 'doc_idx.bin')
        with BaseExecutor.load(dump) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs

    del os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE']
def test_indexer_ref_indexer(test_workspace, pea_id):
    """A ref-indexer picks up attributes (num_dim) persisted by the base
    indexer it references, with the dump in the pea-specific location."""
    tmpdir = os.environ['JINA_TEST_WORKSPACE']
    with BaseExecutor.load_config(
            os.path.join(cur_dir, 'yaml/test-indexer-workspace.yml'),
            pea_id=pea_id) as ref_indexer:
        ref_indexer.num_dim = 512
        ref_indexer.touch()
    # NOTE(review): dump location check assumes the save happens on context
    # exit — placement reconstructed from the original collapsed line
    if pea_id > 0:
        expected = os.path.join(tmpdir,
                                f'{ref_indexer.name}-{ref_indexer.pea_id}',
                                f'{ref_indexer.name}.bin')
    else:
        expected = os.path.join(tmpdir, f'{ref_indexer.name}.bin')
    assert os.path.exists(expected)
    with BaseExecutor.load_config(
            os.path.join(cur_dir, 'yaml/test-refindexer-workspace.yml'),
            pea_id=pea_id) as indexer:
        assert indexer.num_dim == 512
def test_save_and_load(metas, train_data, test_data, target_output_dim):
    """Encoding is unchanged after a save/load round trip."""
    encoder = get_encoder(metas, train_data, target_output_dim)
    expected = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    loaded = BaseExecutor.load(encoder.save_abspath)
    actual = loaded.encode(test_data)
    np.testing.assert_array_equal(actual, expected)
def test_save_load_config(tmp_path):
    """The transforms specification survives loading from both a checked-in
    config file and the config written by save_config."""
    from jina.executors import BaseExecutor
    from jina.executors.metas import get_default_metas

    transforms = [{'RandomVerticalFlip': dict(p=1.0)}]
    metas = get_default_metas()
    metas['workspace'] = str(tmp_path)
    crafter = ImageTorchTransformation(transforms, metas=metas)
    crafter.save_config()
    expected_spec = crafter.transforms_specification

    from_repo_config = BaseExecutor.load_config(
        os.path.join(cur_dir, '../tests/config.yaml'))
    from_saved_config = BaseExecutor.load_config(crafter.config_abspath)
    assert expected_spec == from_repo_config.transforms_specification
    assert expected_spec == from_saved_config.transforms_specification
def test_save_and_load(self):
    """Encoding output is identical before and after a save/load round trip."""
    encoder = self.get_encoder()
    data = np.random.rand(1, 784)
    expected = encoder.encode(data)
    encoder.touch()
    encoder.save()
    self.assertTrue(os.path.exists(encoder.save_abspath))
    loaded = BaseExecutor.load(encoder.save_abspath)
    actual = loaded.encode(data)
    np.testing.assert_array_equal(expected, actual)