Пример #1
0
def test_multiple_queries():
    """Search an index with lists of queries and check the best match per search.

    Every hit is checked for both the matched item text and the query string
    (out of the submitted list) that the hit is attributed to.  The checks are
    repeated after a snapshot has been created and more items were inserted.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:

        def assert_hits(queries, expected, **search_kwargs):
            """Run one search and compare each hit to a (value, query position) pair."""
            hits = index.search(queries, **search_kwargs).data.items
            assert len(hits) == len(expected)
            for hit, (value, query_position) in zip(hits, expected):
                assert hit.value.value == value
                assert hit.value.query == queries[query_position]

        cheese = "Ted can eat an entire block of cheese."
        water = "Joe can drink an entire glass of water."
        index.insert_many([cheese, water])
        index.embed().wait()

        # The matching query may sit at any position of the query list.
        assert_hits(
            ["Who can eat the most cheese", "Who can run the fastest?"],
            [(cheese, 0)],
        )
        assert_hits(
            ["Who can tie a shoe?", "Who can drink the most water?"],
            [(water, 1)],
        )
        assert_hits(
            ["What can Ted do?", "What can Sam do?", "What can Jerry do?"],
            [(cheese, 0)],
        )
        assert_hits(
            ["What can Sam do?", "What can Ted do?", "What can Jerry do?"],
            [(cheese, 1)],
        )

        index.create_snapshot().wait()

        # Items added after the snapshot are inserted and embedded the same way.
        runner = "Susan can run very fast."
        fighter = "Brenda can fight alligators."
        index.insert_many([runner, fighter])
        index.embed().wait()

        assert_hits(
            ["What can Brenda do?", "What can Ronaldo do?", "What can Jerry do?"],
            [(fighter, 0)],
        )

        # With k=2 two hits come back, each attributed to its own query.
        assert_hits(
            [
                "What can Brenda do?",
                "Who should run a marathon?",
                "What can Jerry do?",
            ],
            [(fighter, 0), (runner, 1)],
            k=2,
        )
Пример #2
0
def test_index_usage():
    """Walk through insert, embed and search on an index, with and without metadata."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        cheese_fact = "Ted can eat an entire block of cheese."
        cheese_question = "Who can eat the most cheese"
        index.insert(cheese_fact)
        # A search is issued before the explicit embed; its result is ignored.
        index.search(cheese_question)

        # Run the embedding task and make sure it completed.
        embed_task = index.embed()
        embed_task.wait()
        embed_task.refresh()
        assert embed_task.task.state == TaskState.succeeded

        hits = index.search(cheese_question).data.items
        assert len(hits) == 1
        assert hits[0].value.value == cheese_fact

        # Insert a second item that carries an external id/type and metadata.
        armadillo_fact = "Armadillo shells are bulletproof."
        armadillo_question = "What is something interesting about Armadillos?"
        external_id = "A2id"
        external_type = "A2type"
        metadata = {
            "id": external_id,
            "idid": f"{external_id}{external_id}",
            "boolVal": True,
            "intVal": 123,
            "floatVal": 1.2,
        }
        index.insert(
            armadillo_fact,
            external_id=external_id,
            external_type=external_type,
            metadata=metadata,
        )

        # Without include_metadata the hit exposes only the value.
        plain_hits = index.search(armadillo_question).data.items
        assert len(plain_hits) == 1
        plain_value = plain_hits[0].value
        assert plain_value.value == armadillo_fact
        assert plain_value.external_id is None
        assert plain_value.external_type is None
        assert plain_value.metadata is None

        # With include_metadata the external fields and metadata come back too.
        rich_hits = index.search(armadillo_question, include_metadata=True).data.items
        assert len(rich_hits) == 1
        rich_value = rich_hits[0].value
        assert rich_value.value == armadillo_fact
        assert rich_value.external_id == external_id
        assert rich_value.external_type == external_type

        assert rich_value.metadata == metadata
        # Spot-check individual keys in addition to the whole-dict comparison.
        assert rich_value.metadata["id"] == external_id
        assert rich_value.metadata["idid"] == f"{external_id}{external_id}"

        # A larger k returns both items, the armadillo item first.
        all_hits = index.search(armadillo_question, k=10).data.items
        assert len(all_hits) == 2
        assert all_hits[0].value.value == armadillo_fact
        assert all_hits[1].value.value == cheese_fact
Пример #3
0
def test_duplicate_inserts():
    """Insert one item into a fresh index and run a single search against it.

    NOTE(review): despite the name, only one insert is issued and nothing is
    asserted, so this only checks that the calls go through without raising.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        index.insert("Ted can eat an entire block of cheese.")
        index.search("Who can eat the most cheese")
Пример #4
0
def _blockify_and_tag(steamship, uploaded_file):
    """Blockify *uploaded_file* as Markdown, then tag it, waiting on both tasks."""
    blockify_res = uploaded_file.blockify(
        plugin_instance="markdown-blockifier-default-1.0")
    assert blockify_res.error is None
    blockify_res.wait()

    parser = PluginInstance.create(steamship, plugin_handle="test-tagger").data
    parse_res = uploaded_file.tag(plugin_instance=parser.handle)
    assert parse_res.error is None
    parse_res.wait()


def test_file_embed_lookup():
    """Index two uploaded files and look their content up via search and listing.

    Each file is uploaded, blockified and tagged, then inserted into a fresh
    index.  Searches must return the matching file's text, and listing items
    by file id must return one embedded item per file.  The uploaded files
    are deleted afterwards, including when an assertion fails.
    """
    steamship = get_steamship_client()

    content_a = "Ted likes to run."
    content_b = "Grace likes to bike."

    # Track uploads so the finally-clause can remove whatever was created.
    uploaded = []
    try:
        file = steamship.upload(content=content_a, mime_type=MimeTypes.MKD).data
        uploaded.append(file)
        _blockify_and_tag(steamship, file)

        b = steamship.upload(content=content_b, mime_type=MimeTypes.MKD).data
        uploaded.append(b)
        _blockify_and_tag(steamship, b)

        embedder = PluginInstance.create(steamship,
                                         plugin_handle="test-embedder").data
        # Now we add the files to the index
        with random_index(steamship, embedder.handle) as index:
            index.insert_file(file.id, block_type="sentence", reindex=True)
            index.insert_file(b.id, block_type="sentence", reindex=True)

            res = index.search("What does Ted like to do?").data
            assert len(res.items) == 1
            assert res.items[0].value.value == content_a

            res = index.search("What does Grace like to do?").data
            assert len(res.items) == 1
            assert res.items[0].value.value == content_b

            # Now we list the items, one file at a time
            itemsa = index.list_items(file_id=file.id).data
            assert len(itemsa.items) == 1
            assert len(itemsa.items[0].embedding) > 0
            assert itemsa.items[0].value == content_a

            itemsb = index.list_items(file_id=b.id).data
            assert len(itemsb.items) == 1
            assert len(itemsb.items[0].embedding) > 0
            # Both embeddings come from the same embedder, so their lengths must agree.
            assert len(itemsb.items[0].embedding) == len(itemsa.items[0].embedding)
            assert itemsb.items[0].value == content_b
    finally:
        # Remove the uploads so repeated test runs do not accumulate files.
        for leftover in uploaded:
            leftover.delete()
Пример #5
0
def test_embed_task():
    """Check the task bookkeeping returned by ``index.embed()`` before and after waiting."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        index.insert("test", reindex=False)
        embed_response = index.embed()

        # Right after submission the task must be fully populated and waiting.
        pending = embed_response.task
        for field in ("task_id", "state", "task_created_on", "task_last_modified_on"):
            assert getattr(pending, field) is not None
        assert pending.state == TaskState.waiting

        # The task is re-read from the response after wait() rather than reusing `pending`.
        embed_response.wait()
        assert embed_response.task.state == TaskState.succeeded
Пример #6
0
def test_file_parse():
    """Upload a two-section Markdown file, blockify and tag it, then search it via an index."""
    client = get_steamship_client()

    # Two Markdown sections, each a heading followed by two two-sentence paragraphs.
    section_one = f"# {T}\n\n{P1_1} {P1_2}\n\n{P2_1} {P2_2}"
    section_two = f"# {T2}\n\n{P3_1} {P3_2}\n\n{P4_1} {P4_2}"
    markdown = "\n\n".join([section_one, section_two])

    file = client.upload(content=markdown, mime_type=MimeTypes.MKD).data
    assert file.id is not None
    assert file.mime_type == MimeTypes.MKD

    blockify_task = file.blockify(plugin_instance="markdown-blockifier-default-1.0")
    assert blockify_task.error is None
    blockify_task.wait()

    # Tag the blockified file.
    tagger = PluginInstance.create(client, plugin_handle="test-tagger").data
    tag_task = file.tag(plugin_instance=tagger.handle)
    assert tag_task.error is None
    tag_task.wait()

    # After refreshing, the file must expose six blocks.
    refreshed = file.refresh().data
    assert len(refreshed.blocks) == 6

    # Insert the file into a fresh index and embed it explicitly.
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    with random_index(client, plugin_instance=embedder.handle) as index:
        index.insert_file(file.id, reindex=False)
        embed_task = index.embed()
        assert embed_task.error is None
        embed_task.wait()

        found = index.search("What color are roses?").data
        assert len(found.items) == 1
        # Whole blocks are indexed rather than sentences, so the hit is the full block text.
        assert found.items[0].value.value == " ".join([P1_1, P1_2])

    file.delete()
Пример #7
0
def test_insert_many():
    """Bulk-insert two ``EmbeddedItem`` objects and read them back via listing and search."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        pizza = EmbeddedItem(
            value="Pizza",
            external_id="pizza",
            external_type="food",
            metadata=[1, 2, 3],
        )
        rocket = EmbeddedItem(
            value="Rocket Ship",
            external_id="space",
            external_type="vehicle",
            metadata="Foo",
        )

        index.insert_many([pizza, rocket])
        index.embed().wait()

        # Both items must be listed, with non-empty embeddings of equal length.
        listing = index.list_items()
        assert listing.error is None
        listed = listing.data.items
        assert len(listed) == 2
        assert len(listed[0].embedding) > 0
        assert len(listed[1].embedding) > 0
        assert len(listed[0].embedding) == len(listed[1].embedding)

        # A large k returns both items, the exact match first.
        pizza_search = index.search(pizza.value, include_metadata=True, k=100)
        assert pizza_search.data.items is not None
        assert len(pizza_search.data.items) == 2
        top = pizza_search.data.items[0].value
        assert top.value == pizza.value
        assert top.external_id == pizza.external_id
        assert top.external_type == pizza.external_type
        # NOTE(review): _list_equal is defined elsewhere; presumably it asserts internally.
        _list_equal(top.metadata, pizza.metadata)

        rocket_search = index.search(rocket.value, include_metadata=True)
        assert rocket_search.data.items is not None
        top = rocket_search.data.items[0].value
        assert top.value == rocket.value
        assert top.external_id == rocket.external_id
        assert top.external_type == rocket.external_type
        assert top.metadata == rocket.metadata
Пример #8
0
def test_empty_queries():
    """Pin down how search reacts to ``None``, an empty list and an empty string."""
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data

    with random_index(client, embedder.handle) as index:
        index.insert_many(
            [
                "Ted can eat an entire block of cheese.",
                "Joe can drink an entire glass of water.",
            ]
        )
        index.embed().wait()

        # A missing query is reported as an error on the response.
        none_response = index.search(None)
        assert none_response.error is not None

        # An empty list and an empty string are not treated as "empty" queries.
        # The current behavior is recorded here in case it is changed later.
        empty_list_response = index.search([])
        # noinspection PyUnresolvedReferences
        assert len(empty_list_response.data.items) == 0

        empty_string_response = index.search("")
        # noinspection PyUnresolvedReferences
        assert len(empty_string_response.data.items) == 1
Пример #9
0
def test_task_comment_feedback_reporting():
    """
    Exercise the comment filters that feedback reports are built on.

    We want to be able to generate reports like this:

    Select Across Group    -- externalGroup
    Inputs Seen: XXX       -- Distinct externalId
    Inputs Suggested: YYY  -- Add to metadata
    Inputs Liked / Disliked / Used -- Add to metadata

    So really we just need to test the group aggregation
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle="test-embedder").data

    with random_index(client, plugin_instance=embedder.handle) as index:

        def comment_count(**filters):
            """Count the comments the client lists for the given filters."""
            return len(client.list_comments(**filters).data.comments)

        pizza = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )

        first_group = random_name()
        second_group = random_name()

        index.insert(
            pizza.value,
            external_id=pizza.external_id,
            external_type=pizza.external_type,
            metadata=pizza.metadata,
        )
        index.embed().wait()

        response = index.search(pizza.value, include_metadata=True, k=1)

        # Two comments go to the first group and one to the second.
        for comment_id, group in (
            ("Foo1", first_group),
            ("Foo2", first_group),
            ("Foo2", second_group),
        ):
            response.task.add_comment(
                external_id=comment_id,
                external_type="Bar1",
                external_group=group,
                metadata=[1, 2, 3],
            )

        comments = response.task.list_comments()
        assert len(comments.data.comments) == 3

        # Filter by group only.
        assert comment_count(external_group=first_group) == 2
        assert comment_count(external_group=second_group) == 1

        # Filter by task and group.
        task_id = response.task.task_id
        assert comment_count(task_id=task_id, external_group=first_group) == 2
        assert comment_count(task_id=task_id, external_group=second_group) == 1

        # Filter by task, external id and group.
        assert (
            comment_count(task_id=task_id, external_id="Foo1", external_group=first_group)
            == 1
        )
        assert (
            comment_count(task_id=task_id, external_id="Foo1", external_group=second_group)
            == 0
        )

        # Once all three comments are deleted, both groups must be empty.
        for comment in comments.data.comments:
            comment.delete()

        assert comment_count(external_group=first_group) == 0
        assert comment_count(external_group=second_group) == 0
Пример #10
0
def test_basic_task_comment():
    """Add, list and delete comments on search tasks, keeping two tasks apart.

    Two separate searches are issued.  A comment is attached to the first one
    (``res2``) up front and only inspected at the very end, to make sure that
    comments added to and deleted from the second search (``res``) never leak
    into it.
    """
    steamship = get_steamship_client()
    embedder = PluginInstance.create(steamship, plugin_handle="test-embedder").data
    with random_index(steamship, embedder.handle) as index:
        item1 = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )

        index.insert(
            item1.value,
            external_id=item1.external_id,
            external_type=item1.external_type,
            metadata=item1.metadata,
        )
        task = index.embed()
        task.wait()

        res2 = index.search(item1.value, include_metadata=True, k=1)
        res2.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])
        # We don't return to res2 until the end to make sure we aren't co-mingling comments!

        res = index.search(item1.value, include_metadata=True, k=1)

        assert res.data.items is not None
        assert len(res.data.items) == 1
        assert res.data.items[0].value.value == item1.value
        assert res.data.items[0].value.external_id == item1.external_id
        assert res.data.items[0].value.external_type == item1.external_type
        _list_equal(res.data.items[0].value.metadata, item1.metadata)

        # A single comment on res: add it, read it back, delete it.
        res.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 1

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])

        comment.delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now add two comments and check that they are listed in insertion order
        res.task.add_comment(external_id="Foo1", external_type="Bar1", metadata=[1, 2, 3])
        res.task.add_comment(external_id="Foo2", external_type="Bar2", metadata=[1, 2, 3, 4])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 2

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo1"
        assert comment.external_type == "Bar1"
        _list_equal(comment.metadata, [1, 2, 3])

        comment = comments.data.comments[1]
        assert comment.external_id == "Foo2"
        assert comment.external_type == "Bar2"
        _list_equal(comment.metadata, [1, 2, 3, 4])

        comments.data.comments[0].delete()
        comments.data.comments[1].delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now we handle res2: its single comment must have survived untouched
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 1
        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])
        comments.data.comments[0].delete()
        # Re-list res2 (not res, which is already known to be empty) so the
        # deletion above is actually verified.
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 0