def test_multiple_queries():
    """Search with several queries per call and check which pair wins.

    Every ``index.search`` call here receives a list of query strings. With
    the default ``k`` the test expects exactly one hit and asserts both the
    stored text that came back and the query string recorded on that hit.
    After a snapshot and a second batch of inserts, a ``k=2`` search is
    expected to return two hits in a fixed order.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:

        def expect_single_hit(queries, expected_text, expected_query):
            # One multi-query search, one hit, with a known text and query.
            hits = index.search(queries).data.items
            assert len(hits) == 1
            best = hits[0].value
            assert best.value == expected_text
            assert best.query == expected_query

        ted = "Ted can eat an entire block of cheese."
        joe = "Joe can drink an entire glass of water."
        index.insert_many([ted, joe])
        index.embed().wait()

        cheese_queries = ["Who can eat the most cheese", "Who can run the fastest?"]
        expect_single_hit(cheese_queries, ted, cheese_queries[0])

        water_queries = ["Who can tie a shoe?", "Who can drink the most water?"]
        expect_single_hit(water_queries, joe, water_queries[1])

        # The same three questions in two different orders: the reported
        # query is expected to follow the "Ted" question to its new position.
        ted_first = ["What can Ted do?", "What can Sam do?", "What can Jerry do?"]
        expect_single_hit(ted_first, ted, ted_first[0])

        ted_second = ["What can Sam do?", "What can Ted do?", "What can Jerry do?"]
        expect_single_hit(ted_second, ted, ted_second[1])

        # Snapshot the index, then add more items and search again.
        index.create_snapshot().wait()

        susan = "Susan can run very fast."
        brenda = "Brenda can fight alligators."
        index.insert_many([susan, brenda])
        index.embed().wait()

        brenda_queries = [
            "What can Brenda do?",
            "What can Ronaldo do?",
            "What can Jerry do?",
        ]
        expect_single_hit(brenda_queries, brenda, brenda_queries[0])

        mixed_queries = [
            "What can Brenda do?",
            "Who should run a marathon?",
            "What can Jerry do?",
        ]
        top_two = index.search(mixed_queries, k=2).data.items
        assert len(top_two) == 2

        first, second = top_two[0].value, top_two[1].value
        assert first.value == brenda
        assert first.query == mixed_queries[0]
        assert second.value == susan
        assert second.query == mixed_queries[1]
def test_index_usage():
    """Walk through insert, embed and search on one index.

    Covers a plain insert followed by an explicit embed, then an insert that
    carries an external id, external type and metadata. Searches are run
    with and without ``include_metadata`` and finally with ``k=10``.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        cheese_fact = "Ted can eat an entire block of cheese."
        cheese_query = "Who can eat the most cheese"

        # Insert and search before the explicit embed; results are not inspected.
        index.insert(cheese_fact)
        index.search(cheese_query)

        # Now embed and confirm the task reports success.
        embed_task = index.embed()
        embed_task.wait()
        embed_task.refresh()
        assert embed_task.task.state == TaskState.succeeded

        cheese_hits = index.search(cheese_query).data.items
        assert len(cheese_hits) == 1
        assert cheese_hits[0].value.value == cheese_fact

        # Insert a second item with associated external id, type and metadata.
        armadillo_fact = "Armadillo shells are bulletproof."
        armadillo_query = "What is something interesting about Armadillos?"
        ext_id = "A2id"
        ext_type = "A2type"
        extra = {
            "id": ext_id,
            "idid": f"{ext_id}{ext_id}",
            "boolVal": True,
            "intVal": 123,
            "floatVal": 1.2,
        }
        index.insert(
            armadillo_fact,
            external_id=ext_id,
            external_type=ext_type,
            metadata=extra,
        )

        # Without include_metadata the external fields are expected to be None.
        bare_hits = index.search(armadillo_query).data.items
        assert len(bare_hits) == 1
        bare = bare_hits[0].value
        assert bare.value == armadillo_fact
        assert bare.external_id is None
        assert bare.external_type is None
        assert bare.metadata is None

        # With include_metadata the external fields are expected to round-trip.
        rich_hits = index.search(armadillo_query, include_metadata=True).data.items
        assert len(rich_hits) == 1
        rich = rich_hits[0].value
        assert rich.value == armadillo_fact
        assert rich.external_id == ext_id
        assert rich.external_type == ext_type
        assert rich.metadata == extra
        # Spot-check individual keys in addition to the whole-dict comparison.
        assert rich.metadata["id"] == ext_id
        assert rich.metadata["idid"] == "{}{}".format(ext_id, ext_id)

        # A larger k is expected to return both items, armadillo fact first.
        all_hits = index.search(armadillo_query, k=10).data.items
        assert len(all_hits) == 2
        assert all_hits[0].value.value == armadillo_fact
        assert all_hits[1].value.value == cheese_fact
def test_duplicate_inserts():
    """Insert one item into a fresh index and run a single search against it.

    NOTE(review): despite the name, this test performs only one insert and
    makes no assertions; it passes as long as neither call raises. Confirm
    whether a second, duplicate insert and a result check were intended.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        fact = "Ted can eat an entire block of cheese."
        question = "Who can eat the most cheese"

        # Return values are intentionally ignored.
        index.insert(fact)
        index.search(question)
def test_file_embed_lookup():
    """Index two uploaded files and look their contents up by search and by file id.

    Each file is uploaded as markdown, blockified and tagged before being
    inserted into the index. The test then checks that a question about each
    file returns that file's text, and that ``list_items(file_id=...)``
    returns one embedded item per file with embeddings of equal length.
    """
    client = get_steamship_client()

    def upload_blockify_and_tag(text):
        # Upload `text` as markdown, then blockify and tag it, waiting on each task.
        uploaded = client.upload(content=text, mime_type=MimeTypes.MKD).data

        blockify_task = uploaded.blockify(
            plugin_instance="markdown-blockifier-default-1.0"
        )
        assert blockify_task.error is None
        blockify_task.wait()

        tagger = PluginInstance.create(client, plugin_handle="test-tagger").data
        tag_task = uploaded.tag(plugin_instance=tagger.handle)
        assert tag_task.error is None
        tag_task.wait()

        return uploaded

    ted_text = "Ted likes to run."
    grace_text = "Grace likes to bike."

    ted_file = upload_blockify_and_tag(ted_text)
    grace_file = upload_blockify_and_tag(grace_text)

    embedder = PluginInstance.create(client, plugin_handle="test-embedder").data

    # Now add both files to the index.
    with random_index(client, embedder.handle) as index:
        index.insert_file(ted_file.id, block_type="sentence", reindex=True)
        index.insert_file(grace_file.id, block_type="sentence", reindex=True)

        ted_result = index.search("What does Ted like to do?").data
        assert len(ted_result.items) == 1
        assert ted_result.items[0].value.value == ted_text

        grace_result = index.search("What does Grace like to do?").data
        assert len(grace_result.items) == 1
        assert grace_result.items[0].value.value == grace_text

        # Now list the items stored for each file.
        ted_items = index.list_items(file_id=ted_file.id).data
        assert len(ted_items.items) == 1
        ted_entry = ted_items.items[0]
        assert len(ted_entry.embedding) > 0
        assert ted_entry.value == ted_text

        grace_items = index.list_items(file_id=grace_file.id).data
        assert len(grace_items.items) == 1
        grace_entry = grace_items.items[0]
        assert len(grace_entry.embedding) > 0
        assert len(grace_entry.embedding) == len(ted_entry.embedding)
        assert grace_entry.value == grace_text
def test_embed_task():
    """Check the task returned by ``index.embed()`` before and after waiting.

    Immediately after the call the task is expected to have its id, state
    and timestamps populated and to be in ``TaskState.waiting``; after
    ``wait()`` it is expected to be in ``TaskState.succeeded``.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        index.insert("test", reindex=False)

        embed_response = index.embed()
        pending = embed_response.task
        assert pending.task_id is not None
        assert pending.state is not None
        assert pending.task_created_on is not None
        assert pending.task_last_modified_on is not None
        assert pending.state == TaskState.waiting

        embed_response.wait()
        # Re-read the task from the response rather than reusing `pending`.
        assert embed_response.task.state == TaskState.succeeded
def test_file_parse():
    """Upload a markdown file, blockify, tag and index it, then search it.

    The document is built from two sections, each a heading followed by two
    two-sentence paragraphs. After blockifying and tagging, the refreshed
    file is expected to expose six blocks. The file is then inserted into a
    fresh index, embedded, and searched.

    The uploaded file is deleted in a ``finally`` clause so that a failing
    assertion does not leave it behind on the server.
    """
    steamship = get_steamship_client()

    section_1 = f"# {T}\n\n{P1_1} {P1_2}\n\n{P2_1} {P2_2}"
    section_2 = f"# {T2}\n\n{P3_1} {P3_2}\n\n{P4_1} {P4_2}"
    content = f"{section_1}\n\n{section_2}"

    file = steamship.upload(content=content, mime_type=MimeTypes.MKD).data
    # Checked before entering the cleanup scope: without an id there is
    # nothing to clean up.
    assert file.id is not None

    try:
        assert file.mime_type == MimeTypes.MKD

        blockify_resp = file.blockify(
            plugin_instance="markdown-blockifier-default-1.0"
        )
        assert blockify_resp.error is None
        blockify_resp.wait()

        # Now we parse
        parser = PluginInstance.create(steamship, plugin_handle="test-tagger").data
        parse_resp = file.tag(plugin_instance=parser.handle)
        assert parse_resp.error is None
        parse_resp.wait()

        # Now the sentences should be parsed!
        refreshed = file.refresh().data
        assert len(refreshed.blocks) == 6

        # Now we add the file to the index
        plugin_instance = PluginInstance.create(
            steamship, plugin_handle=_TEST_EMBEDDER
        ).data
        with random_index(steamship, plugin_instance=plugin_instance.handle) as index:
            index.insert_file(file.id, reindex=False)
            embed_resp = index.embed()
            assert embed_resp.error is None
            embed_resp.wait()

            res = index.search("What color are roses?").data
            assert len(res.items) == 1
            # Because the simdex now indexes entire blocks and not sentences,
            # the result of this is the whole block text
            assert res.items[0].value.value == " ".join([P1_1, P1_2])
    finally:
        # Runs after the index context has exited, on success and on failure.
        file.delete()
def test_insert_many():
    """Bulk-insert two ``EmbeddedItem`` objects and read them back.

    After embedding, ``list_items`` is expected to return both items with
    non-empty embeddings of equal length. Searching for each item's own
    value with ``include_metadata=True`` is expected to return that item
    first, with its external id, external type and metadata intact.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        pizza = EmbeddedItem(
            value="Pizza",
            external_id="pizza",
            external_type="food",
            metadata=[1, 2, 3],
        )
        rocket = EmbeddedItem(
            value="Rocket Ship",
            external_id="space",
            external_type="vehicle",
            metadata="Foo",
        )

        index.insert_many([pizza, rocket])
        index.embed().wait()

        listing = index.list_items()
        assert listing.error is None

        stored = listing.data
        assert len(stored.items) == 2
        first_embedding = stored.items[0].embedding
        second_embedding = stored.items[1].embedding
        assert len(first_embedding) > 0
        assert len(second_embedding) > 0
        assert len(first_embedding) == len(second_embedding)

        # k=100 exceeds the number of stored items; both are expected back.
        pizza_search = index.search(pizza.value, include_metadata=True, k=100)
        assert pizza_search.data.items is not None
        assert len(pizza_search.data.items) == 2
        pizza_hit = pizza_search.data.items[0].value
        assert pizza_hit.value == pizza.value
        assert pizza_hit.external_id == pizza.external_id
        assert pizza_hit.external_type == pizza.external_type
        _list_equal(pizza_hit.metadata, pizza.metadata)

        rocket_search = index.search(rocket.value, include_metadata=True)
        assert rocket_search.data.items is not None
        rocket_hit = rocket_search.data.items[0].value
        assert rocket_hit.value == rocket.value
        assert rocket_hit.external_id == rocket.external_id
        assert rocket_hit.external_type == rocket.external_type
        assert rocket_hit.metadata == rocket.metadata
def test_empty_queries():
    """Pin down how search responds to ``None``, an empty list and an empty string.

    ``None`` is expected to produce a response with an error set. An empty
    list is expected to return zero items and an empty string one item.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        stored_texts = [
            "Ted can eat an entire block of cheese.",
            "Joe can drink an entire glass of water.",
        ]
        index.insert_many(stored_texts)
        index.embed().wait()

        none_response = index.search(None)
        assert none_response.error is not None

        # The next two inputs are not treated as empty queries. The checks are
        # kept to record the current behaviour in case it is changed later.
        empty_list_response = index.search([])
        # noinspection PyUnresolvedReferences
        assert len(empty_list_response.data.items) == 0

        empty_string_response = index.search("")
        # noinspection PyUnresolvedReferences
        assert len(empty_string_response.data.items) == 1
def test_task_comment_feedback_reporting():
    """Exercise comment filtering by external group, task id and external id.

    We want to be able to generate reports like this:

        Select Across Group     -- externalGroup
        Inputs Seen: XXX        -- Distinct externalId
        Inputs Suggested: YYY   -- Add to metadata
        Inputs Liked / Disliked / Used -- Add to metadata

    So really we just need to test the group aggregation.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle="test-embedder").data

    with random_index(client, plugin_instance=embedder.handle) as index:
        pizza = EmbeddedItem(
            value="Pizza",
            external_id="pizza",
            external_type="food",
            metadata=[1, 2, 3],
        )

        group_a = random_name()
        group_b = random_name()

        index.insert(
            pizza.value,
            external_id=pizza.external_id,
            external_type=pizza.external_type,
            metadata=pizza.metadata,
        )
        index.embed().wait()

        res = index.search(pizza.value, include_metadata=True, k=1)

        # Two comments in the first group and one in the second, all attached
        # to the same search task.
        for comment_id, group in (
            ("Foo1", group_a),
            ("Foo2", group_a),
            ("Foo2", group_b),
        ):
            res.task.add_comment(
                external_id=comment_id,
                external_type="Bar1",
                external_group=group,
                metadata=[1, 2, 3],
            )

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 3

        # Filter by group only.
        in_a = client.list_comments(external_group=group_a)
        assert len(in_a.data.comments) == 2
        in_b = client.list_comments(external_group=group_b)
        assert len(in_b.data.comments) == 1

        # Filter by task and group.
        in_a = client.list_comments(task_id=res.task.task_id, external_group=group_a)
        assert len(in_a.data.comments) == 2
        in_b = client.list_comments(task_id=res.task.task_id, external_group=group_b)
        assert len(in_b.data.comments) == 1

        # Filter by task, external id and group.
        in_a = client.list_comments(
            task_id=res.task.task_id, external_id="Foo1", external_group=group_a
        )
        assert len(in_a.data.comments) == 1
        in_b = client.list_comments(
            task_id=res.task.task_id, external_id="Foo1", external_group=group_b
        )
        assert len(in_b.data.comments) == 0

        # Delete the three comments listed above; both groups should then be empty.
        for stale in comments.data.comments:
            stale.delete()

        in_a = client.list_comments(external_group=group_a)
        assert len(in_a.data.comments) == 0
        in_b = client.list_comments(external_group=group_b)
        assert len(in_b.data.comments) == 0
def test_basic_task_comment():
    """Add, list and delete comments on search tasks without cross-talk.

    Two searches are run against the same index. A comment is attached to
    the first search's task (``res2``) and left alone while comments on the
    second search's task (``res``) are added, inspected and deleted. At the
    end the first task is expected to still hold exactly its own comment,
    which is then deleted and the deletion verified on that same task.
    """
    steamship = get_steamship_client()
    embedder = PluginInstance.create(steamship, plugin_handle="test-embedder").data

    with random_index(steamship, embedder.handle) as index:
        item1 = EmbeddedItem(
            value="Pizza",
            external_id="pizza",
            external_type="food",
            metadata=[1, 2, 3],
        )

        index.insert(
            item1.value,
            external_id=item1.external_id,
            external_type=item1.external_type,
            metadata=item1.metadata,
        )
        index.embed().wait()

        res2 = index.search(item1.value, include_metadata=True, k=1)
        res2.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])

        # We don't return to res2 until the end to make sure we aren't
        # co-mingling comments!
        res = index.search(item1.value, include_metadata=True, k=1)
        assert res.data.items is not None
        assert len(res.data.items) == 1
        hit = res.data.items[0].value
        assert hit.value == item1.value
        assert hit.external_id == item1.external_id
        assert hit.external_type == item1.external_type
        _list_equal(hit.metadata, item1.metadata)

        # A single comment on `res`: add, inspect, delete.
        res.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 1

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])

        comment.delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now add two comments and check they are listed in insertion order.
        res.task.add_comment(external_id="Foo1", external_type="Bar1", metadata=[1, 2, 3])
        res.task.add_comment(external_id="Foo2", external_type="Bar2", metadata=[1, 2, 3, 4])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 2

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo1"
        assert comment.external_type == "Bar1"
        _list_equal(comment.metadata, [1, 2, 3])

        comment = comments.data.comments[1]
        assert comment.external_id == "Foo2"
        assert comment.external_type == "Bar2"
        _list_equal(comment.metadata, [1, 2, 3, 4])

        comments.data.comments[0].delete()
        comments.data.comments[1].delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now we handle res2: it should still hold only its own comment.
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 1
        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])

        comments.data.comments[0].delete()

        # Verify the deletion on the task the comment belonged to. Previously
        # this re-listed `res`, which had already been checked to be empty.
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 0

        # `res` must remain empty as well.
        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0