def test_multiple_queries():
    """Searching with a list of queries returns the best (value, query) pairing."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        cheese = "Ted can eat an entire block of cheese."
        water = "Joe can drink an entire glass of water."
        _ = index.insert_many([cheese, water])
        index.embed().wait()

        queries = ["Who can eat the most cheese", "Who can run the fastest?"]
        hits = index.search(queries).data.items
        assert len(hits) == 1
        assert hits[0].value.value == cheese
        assert hits[0].value.query == queries[0]

        queries = ["Who can tie a shoe?", "Who can drink the most water?"]
        hits = index.search(queries).data.items
        assert len(hits) == 1
        assert hits[0].value.value == water
        assert hits[0].value.query == queries[1]

        queries = ["What can Ted do?", "What can Sam do?", "What can Jerry do?"]
        hits = index.search(queries).data.items
        assert len(hits) == 1
        assert hits[0].value.value == cheese
        assert hits[0].value.query == queries[0]

        # Same phrases, shuffled: the reported query follows its new position.
        queries = ["What can Sam do?", "What can Ted do?", "What can Jerry do?"]
        hits = index.search(queries).data.items
        assert len(hits) == 1
        assert hits[0].value.value == cheese
        assert hits[0].value.query == queries[1]

        index.create_snapshot().wait()

        runner = "Susan can run very fast."
        fighter = "Brenda can fight alligators."
        _ = index.insert_many([runner, fighter])
        index.embed().wait()

        queries = ["What can Brenda do?", "What can Ronaldo do?", "What can Jerry do?"]
        hits = index.search(queries).data.items
        assert len(hits) == 1
        assert hits[0].value.value == fighter
        assert hits[0].value.query == queries[0]

        # With k=2 we get the two best matches, in score order.
        queries = [
            "What can Brenda do?",
            "Who should run a marathon?",
            "What can Jerry do?",
        ]
        hits = index.search(queries, k=2).data.items
        assert len(hits) == 2
        assert hits[0].value.value == fighter
        assert hits[0].value.query == queries[0]
        assert hits[1].value.value == runner
        assert hits[1].value.query == queries[1]
def test_index_usage():
    """Insert/embed/search flow, including metadata round-tripping on search hits."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        answer = "Ted can eat an entire block of cheese."
        question = "Who can eat the most cheese"
        _ = index.insert(answer)
        _ = index.search(question)

        # Embedding is asynchronous: wait for the task, then confirm success.
        task = index.embed()
        task.wait()
        task.refresh()
        assert task.task.state == TaskState.succeeded

        hits = index.search(question).data.items
        assert len(hits) == 1
        assert hits[0].value.value == answer

        # Insert a second item carrying an external id/type and metadata.
        fact = "Armadillo shells are bulletproof."
        fact_query = "What is something interesting about Armadillos?"
        ext_id = "A2id"
        ext_type = "A2type"
        metadata = dict(
            id=ext_id,
            idid=f"{ext_id}{ext_id}",
            boolVal=True,
            intVal=123,
            floatVal=1.2,
        )
        _ = index.insert(fact, external_id=ext_id, external_type=ext_type, metadata=metadata)

        # By default, metadata is NOT included in search results.
        hits = index.search(fact_query).data.items
        assert len(hits) == 1
        assert hits[0].value.value == fact
        assert hits[0].value.external_id is None
        assert hits[0].value.external_type is None
        assert hits[0].value.metadata is None

        # With include_metadata=True the stored metadata comes back intact.
        hits = index.search(fact_query, include_metadata=True).data.items
        assert len(hits) == 1
        assert hits[0].value.value == fact
        assert hits[0].value.external_id == ext_id
        assert hits[0].value.external_type == ext_type
        assert hits[0].value.metadata == metadata
        # Spot-check individual keys in addition to the whole-dict comparison.
        assert hits[0].value.metadata["id"] == ext_id
        assert hits[0].value.metadata["idid"] == "{}{}".format(ext_id, ext_id)

        hits = index.search(fact_query, k=10).data.items
        assert len(hits) == 2
        assert hits[0].value.value == fact
        assert hits[1].value.value == answer
def test_duplicate_inserts():
    """Insert followed by an immediate search (before embedding) must not error."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        # Test for suppressed re-indexing
        sentence = "Ted can eat an entire block of cheese."
        query = "Who can eat the most cheese"
        _ = index.insert(sentence)
        _ = index.search(query)
def test_e2e_corpus_export_with_query(client):
    """Export only files matching a tag query and verify the exported JSONL payload."""
    exporter_response = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_response.data is not None
    exporter = exporter_response.data
    assert exporter.handle is not None

    # One file with only a block-level tag ...
    block_tagged_file = File.create(
        client=client,
        blocks=[
            Block.CreateRequest(text="A", tags=[Tag.CreateRequest(name="BlockTag")]),
            Block.CreateRequest(text="B"),
        ],
    ).data
    assert block_tagged_file.id is not None

    # ... and one with a file-level tag; only this one should match the query.
    file_tagged_file = File.create(
        client=client,
        blocks=[Block.CreateRequest(text="A"), Block.CreateRequest(text="B")],
        tags=[Tag.CreateRequest(name="FileTag")],
    ).data
    assert file_tagged_file.id is not None

    # Now export the corpus
    export_input = ExportPluginInput(query='filetag and name "FileTag"', type="file")
    export_task = exporter.export(export_input)
    assert export_task is not None
    # The results of a corpus exporter are MD5 encoded!
    export_task.wait()
    raw_data = export_task.data.data

    # decode base64 to get URL at url json property
    payload = json.loads(base64.b64decode(raw_data))
    url = payload["url"]
    # fetch the URL via requests.get
    content = requests.get(url).text
    # Look at lines of jsonl file
    exported_files = [File.parse_obj(json.loads(line)) for line in content.splitlines()]
    assert len(exported_files) == 1
    assert len(exported_files[0].tags) == 1

    block_tagged_file.delete()
    file_tagged_file.delete()
def deploy_plugin(
    client: Steamship,
    py_path: Path,
    plugin_type: str,
    training_platform: Optional[HostingType] = None,
    version_config_template: Optional[Dict[str, Any]] = None,
    instance_config: Optional[Dict[str, Any]] = None,
    space_id: Optional[str] = None,
):
    """Deploy a plugin from a local python file and yield ``(plugin, version, instance)``.

    Creates the Plugin, uploads a zipped deployable as a PluginVersion, creates a
    PluginInstance, waits for version and instance to become ready, yields all three
    to the caller, and tears everything down afterwards (generator-based context
    manager protocol).

    Args:
        client: Authenticated Steamship client.
        py_path: Path to the plugin's python source file to zip and upload.
        plugin_type: Steamship plugin type, e.g. ``"blockifier"`` or ``"tagger"``.
        training_platform: Hosting type for trainable plugins, if any.
        version_config_template: Config template to register with the version.
        instance_config: Configuration values for the created instance.
        space_id: Space in which to create the instance, if not the default.

    Yields:
        Tuple of (plugin, version, instance), all fully created and verified.
    """
    # NOTE: the two dict parameters previously used implicit-Optional annotations
    # (``Dict[str, Any] = None``); they are now explicitly Optional per PEP 484.
    plugin = Plugin.create(
        client,
        training_platform=training_platform,
        type_=plugin_type,
        transport="jsonOverHttp",
        description="A Plugin (python client tests)",
        is_public=False,
    )
    assert plugin.error is None
    assert plugin.data is not None
    plugin = plugin.data

    zip_bytes = zip_deployable(py_path)
    version = PluginVersion.create(
        client,
        "test-version",
        plugin_id=plugin.id,
        filebytes=zip_bytes,
        config_template=version_config_template,
    )
    # TODO: This is due to having to wait for the lambda to finish deploying.
    # TODO: We should update the task system to allow its .wait() to depend on this.
    version = _wait_for_version(version)

    instance = PluginInstance.create(
        client,
        space_id=space_id,
        plugin_id=plugin.id,
        plugin_version_id=version.id,
        config=instance_config,
    )
    instance = _wait_for_instance(instance)
    assert instance.plugin_id == plugin.id
    assert instance.plugin_version_id == version.id

    _check_user(client, instance)

    yield plugin, version, instance

    # Teardown: remove instance, version, and plugin after the caller is done.
    _delete_deployable(instance, version, plugin)
def test_embed_task():
    """index.embed() returns a task that moves from `waiting` to `succeeded`."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        _ = index.insert("test", reindex=False)
        result = index.embed()

        # Immediately after submission the task exists and is still queued.
        assert result.task.task_id is not None
        assert result.task.state is not None
        assert result.task.task_created_on is not None
        assert result.task.task_last_modified_on is not None
        assert result.task.state == TaskState.waiting

        result.wait()
        assert result.task.state == TaskState.succeeded
def test_e2e_corpus_export(client: Steamship):
    """Blockify an uploaded CSV with a freshly deployed plugin, then export the corpus."""
    # TODO (enias): Derive this from Config
    version_config_template = dict(
        text_column=dict(type="string"),
        tag_columns=dict(type="string"),
        tag_kind=dict(type="string"),
    )
    # Has to match up
    instance_config = dict(
        text_column="Message",
        tag_columns="Category",
        tag_kind="Intent",
    )
    exporter_response = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_response.data is not None
    exporter = exporter_response.data
    assert exporter.handle is not None

    export_input = ExportPluginInput(handle="default", type="file")
    blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
        client,
        blockifier_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now export the corpus
            export_task = exporter.export(export_input)
            assert export_task is not None
            # The results of a corpus exporter are MD5 encoded!
            _ = export_task.data
def test_delete_index():
    """Creating by existing handle reuses an index; after delete, a new one is made."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    first = client.create_index(plugin_instance=embedder.handle).data
    assert first.id is not None

    # Creating again with the same handle returns the existing index.
    response = client.create_index(handle=first.handle, plugin_instance=embedder.handle)
    assert response.error is None
    same = response.data
    assert first.id == same.id

    first.delete()

    # With the original gone, a fresh create yields a brand-new index.
    response = client.create_index(plugin_instance=embedder.handle)
    assert response.error is None
    assert response.data is not None
    replacement = response.data
    assert first.id != replacement.id
    replacement.delete()
def test_insert_many():
    """insert_many stores EmbeddedItems with ids, types, and metadata intact."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        pizza = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )
        rocket = EmbeddedItem(
            value="Rocket Ship",
            external_id="space",
            external_type="vehicle",
            metadata="Foo",
        )
        index.insert_many([pizza, rocket])
        index.embed().wait()

        listing = index.list_items()
        assert listing.error is None
        stored = listing.data
        assert len(stored.items) == 2
        # Every stored item received an embedding of the same (non-zero) width.
        assert len(stored.items[0].embedding) > 0
        assert len(stored.items[1].embedding) > 0
        assert len(stored.items[0].embedding) == len(stored.items[1].embedding)

        res = index.search(pizza.value, include_metadata=True, k=100)
        assert res.data.items is not None
        assert len(res.data.items) == 2
        best = res.data.items[0].value
        assert best.value == pizza.value
        assert best.external_id == pizza.external_id
        assert best.external_type == pizza.external_type
        _list_equal(best.metadata, pizza.metadata)

        res = index.search(rocket.value, include_metadata=True)
        assert res.data.items is not None
        best = res.data.items[0].value
        assert best.value == rocket.value
        assert best.external_id == rocket.external_id
        assert best.external_type == rocket.external_type
        assert best.metadata == rocket.metadata
def test_empty_queries():
    """Searching with None is an error; [] and "" are accepted as-is."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, embedder.handle) as index:
        cheese = "Ted can eat an entire block of cheese."
        water = "Joe can drink an entire glass of water."
        _ = index.insert_many([cheese, water])
        index.embed().wait()

        none_result = index.search(None)
        assert none_result.error is not None

        # These technically don't count as empty. Leaving this test in here
        # to encode and capture that in case we want to change it.
        empty_list_result = index.search([])
        # noinspection PyUnresolvedReferences
        assert len(empty_list_result.data.items) == 0

        empty_string_result = index.search("")
        # noinspection PyUnresolvedReferences
        assert len(empty_string_result.data.items) == 1
def test_e2e_third_party_trainable_tagger_lambda_training():
    """Train a third-party trainable tagger on Lambda, then tag a document with it."""
    client = get_steamship_client()

    space_response = Space.get(client)  # TODO (enias): Remove
    assert space_response.data is not None

    exporter_response = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,  # Don't care if it already exists
    )
    assert exporter_response.data is not None
    exporter = exporter_response.data
    assert exporter.handle is not None

    tagger_path = PLUGINS_PATH / "taggers" / "plugin_third_party_trainable_tagger.py"

    # Note that we're going to do the below training on ZERO data for simplicity.
    # The particular test model doesn't actually incorporate any data given to it at
    # training time, so it would just slow the test down to create, blockify, and
    # export a training corpus.
    with deploy_plugin(
        client, tagger_path, "tagger", training_platform=HostingType.LAMBDA
    ) as (tagger, tagger_version, tagger_instance):
        # Now train the plugin
        training_request = TrainingParameterPluginInput(
            plugin_instance=tagger_instance.handle,
            export_plugin_input=ExportPluginInput(
                plugin_instance=exporter.handle, type="file", query="all"
            ),
        )
        train_result = tagger_instance.train(training_request)
        train_result.wait()
        assert train_result.data is not None
        output = train_result.data
        assert output.training_complete
        assert output.training_reference_data is not None
        assert output.training_reference_data["num_checkins"] == 3

        logging.info("Waiting 15 seconds for instance to deploy.")
        import time

        time.sleep(15)

        # Now we'll attempt to USE this plugin. This plugin's behavior is to simply
        # tag every block with the parameters `MockClient.LABELS`.

        # First we'll create a file
        test_doc = "Hi there"
        res = tagger_instance.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert res.data.file is not None
        assert not res.data.file.tags
        assert res.data.file.blocks is not None
        assert len(res.data.file.blocks) > 0
        for block in res.data.file.blocks:
            assert block.tags is not None
            assert sorted([tag.name for tag in block.tags]) == sorted(MockClient.LABELS)
def test_e2e_trainable_tagger_lambda_training(client: Steamship):
    """Full Lambda training flow: blockify a CSV, train a tagger, check its checkpoint, tag."""
    version_config_template = dict(
        text_column=dict(type="string"),
        tag_columns=dict(type="string"),
        tag_kind=dict(type="string"),
    )
    instance_config = dict(text_column="Message", tag_columns="Category", tag_kind="Intent")

    exporter_response = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_response.data is not None
    exporter = exporter_response.data
    assert exporter.handle is not None

    blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"
    tagger_path = PLUGINS_PATH / "taggers" / "plugin_trainable_tagger.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
        client,
        blockifier_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now make a trainable tagger to train on those tags
            with deploy_plugin(
                client, tagger_path, "tagger", training_platform=HostingType.LAMBDA
            ) as (tagger, tagger_version, tagger_instance):
                # Now train the plugin
                training_request = TrainingParameterPluginInput(
                    plugin_instance=tagger_instance.handle,
                    export_plugin_input=ExportPluginInput(
                        plugin_instance=EXPORTER_HANDLE, type="file", query='kind "foo1"'
                    ),
                    training_params=dict(
                        keyword_list=KEYWORDS  # This is a key defined by the test model we're training
                    ),
                )
                train_result = tagger_instance.train(training_request)
                train_result.wait()

                # At this point, the PluginInstance will have written a parameter file
                # to disk. We should be able to retrieve it since we know that it is
                # tagged as the `default`.
                checkpoint = ModelCheckpoint(
                    client=client,
                    handle="default",
                    plugin_instance_id=tagger_instance.id,
                )
                checkpoint_path = checkpoint.download_model_bundle()
                assert checkpoint_path.exists()

                keyword_path = Path(checkpoint_path) / TestTrainableTaggerModel.KEYWORD_LIST_FILE
                assert keyword_path.exists()
                with open(keyword_path, "r") as f:
                    params = json.loads(f.read())
                assert params == KEYWORDS

                logging.info("Waiting 15 seconds for instance to deploy.")
                import time

                time.sleep(15)

                # If we're here, we have verified that the plugin instance has correctly
                # recorded its parameters into the pluginData bucket under a path unique
                # to the PluginInstance/ModelCheckpoint.

                # Now we'll attempt to USE this plugin. This plugin's behavior is to
                # simply tag any file with the tags that parameterize it. Since those
                # tags are (see above) ["product", "coupon"] we should expect this
                # tagger to apply those tags to any file provided to it.

                # First we'll create a file
                test_doc = "Hi there"
                res = tagger_instance.tag(doc=test_doc)
                res.wait()
                assert res.error is None
                assert res.data is not None
                assert res.data.file is not None
                assert res.data.file.tags is not None
                assert len(res.data.file.tags) == len(KEYWORDS)
                assert sorted([tag.name for tag in res.data.file.tags]) == sorted(KEYWORDS)
def test_create_index():
    """Smoke test: an index can be created against the test embedder."""
    steamship = get_steamship_client()
    embedder = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    create_index(steamship, embedder.handle)