Example #1
def test_multiple_queries():
    steamship = get_steamship_client()

    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        # Test for suppressed re-indexing
        a1 = "Ted can eat an entire block of cheese."
        a2 = "Joe can drink an entire glass of water."
        _ = index.insert_many([a1, a2])
        index.embed().wait()

        qs1 = ["Who can eat the most cheese", "Who can run the fastest?"]
        search_results = index.search(qs1)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a1
        assert search_results.data.items[0].value.query == qs1[0]

        qs2 = ["Who can tie a shoe?", "Who can drink the most water?"]
        search_results = index.search(qs2)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a2
        assert search_results.data.items[0].value.query == qs2[1]

        qs3 = ["What can Ted do?", "What can Sam do?", "What can Jerry do?"]
        search_results = index.search(qs3)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a1
        assert search_results.data.items[0].value.query == qs3[0]

        qs3 = ["What can Sam do?", "What can Ted do?", "What can Jerry do?"]
        search_results = index.search(qs3)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a1
        assert search_results.data.items[0].value.query == qs3[1]

        index.create_snapshot().wait()

        a3 = "Susan can run very fast."
        a4 = "Brenda can fight alligators."
        _ = index.insert_many([a3, a4])
        index.embed().wait()

        qs4 = ["What can Brenda do?", "What can Ronaldo do?", "What can Jerry do?"]
        search_results = index.search(qs4)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a4
        assert search_results.data.items[0].value.query == qs4[0]

        qs4 = [
            "What can Brenda do?",
            "Who should run a marathon?",
            "What can Jerry do?",
        ]
        search_results = index.search(qs4, k=2)
        assert len(search_results.data.items) == 2
        assert search_results.data.items[0].value.value == a4
        assert search_results.data.items[0].value.query == qs4[0]
        assert search_results.data.items[1].value.value == a3
        assert search_results.data.items[1].value.query == qs4[1]
Example #2
def test_index_usage():
    steamship = get_steamship_client()

    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        a1 = "Ted can eat an entire block of cheese."
        q1 = "Who can eat the most cheese"
        _ = index.insert(a1)
        _ = index.search(q1)

        # Now embed
        task = index.embed()
        task.wait()
        task.refresh()
        assert task.task.state == TaskState.succeeded

        search_results = index.search(q1)
        assert len(search_results.data.items) == 1
        assert search_results.data.items[0].value.value == a1

        # Associate metadata
        a2 = "Armadillo shells are bulletproof."
        q2 = "What is something interesting about Armadillos?"
        a2id = "A2id"
        a2type = "A2type"
        a2metadata = dict(
            id=a2id,
            idid=f"{a2id}{a2id}",
            boolVal=True,
            intVal=123,
            floatVal=1.2,
        )

        _ = index.insert(a2, external_id=a2id, external_type=a2type, metadata=a2metadata)
        search_results2 = index.search(q2)
        assert len(search_results2.data.items) == 1
        assert search_results2.data.items[0].value.value == a2
        assert search_results2.data.items[0].value.external_id is None
        assert search_results2.data.items[0].value.external_type is None
        assert search_results2.data.items[0].value.metadata is None

        search_results3 = index.search(q2, include_metadata=True)
        assert len(search_results3.data.items) == 1
        assert search_results3.data.items[0].value.value == a2
        assert search_results3.data.items[0].value.external_id == a2id
        assert search_results3.data.items[0].value.external_type == a2type

        assert search_results3.data.items[0].value.metadata == a2metadata
        # Spot-check individual keys rather than relying solely on the dict comparison above.
        assert search_results3.data.items[0].value.metadata["id"] == a2id
        assert search_results3.data.items[0].value.metadata["idid"] == "{}{}".format(a2id, a2id)

        search_results4 = index.search(q2, k=10)
        assert len(search_results4.data.items) == 2
        assert search_results4.data.items[0].value.value == a2
        assert search_results4.data.items[1].value.value == a1
Example #3
def test_duplicate_inserts():
    steamship = get_steamship_client()

    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        # Test for suppressed re-indexing
        a1 = "Ted can eat an entire block of cheese."
        q1 = "Who can eat the most cheese"
        _ = index.insert(a1)
        _ = index.search(q1)
Example #4
def test_e2e_corpus_export_with_query(client):
    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    a = File.create(
        client=client,
        blocks=[
            Block.CreateRequest(text="A",
                                tags=[Tag.CreateRequest(name="BlockTag")]),
            Block.CreateRequest(text="B"),
        ],
    ).data
    assert a.id is not None
    b = File.create(
        client=client,
        blocks=[Block.CreateRequest(text="A"),
                Block.CreateRequest(text="B")],
        tags=[Tag.CreateRequest(name="FileTag")],
    ).data
    assert b.id is not None

    # Now export the corpus
    _input = ExportPluginInput(query='filetag and name "FileTag"', type="file")
    raw_data_r = exporter_plugin.export(_input)
    assert raw_data_r is not None

    # The results of a corpus exporter are base64 encoded (a decoding helper is sketched after this example)
    raw_data_r.wait()
    raw_data = raw_data_r.data.data
    # Decode the base64 payload and read the URL from its `url` JSON property
    decoded_data = json.loads(base64.b64decode(raw_data))
    url = decoded_data["url"]

    # fetch the URL via requests.get
    content = requests.get(url).text

    # Look at lines of jsonl file
    files = [File.parse_obj(json.loads(line)) for line in content.splitlines()]
    assert len(files) == 1
    assert len(files[0].tags) == 1

    a.delete()
    b.delete()
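
The decoding steps above can be collected into one helper. This is a sketch, not part of the test suite: it assumes the exporter payload is base64-encoded JSON carrying a `url` field that points at a JSONL dump of serialized `File` objects (as the assertions above imply), and that `File` is importable from the top-level `steamship` package.

import base64
import json

import requests

from steamship import File  # import path assumed


def fetch_exported_files(raw_data):
    """Decode an exporter payload and download the exported File objects."""
    decoded = json.loads(base64.b64decode(raw_data))  # payload is base64-encoded JSON
    content = requests.get(decoded["url"]).text  # the decoded JSON carries a URL to a JSONL file
    return [File.parse_obj(json.loads(line)) for line in content.splitlines()]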
Example #5
def deploy_plugin(
    client: Steamship,
    py_path: Path,
    plugin_type: str,
    training_platform: Optional[HostingType] = None,
    version_config_template: Dict[str, Any] = None,
    instance_config: Dict[str, Any] = None,
    space_id: Optional[str] = None,
):
    plugin = Plugin.create(
        client,
        training_platform=training_platform,
        type_=plugin_type,
        transport="jsonOverHttp",
        description="A Plugin (python client tests)",
        is_public=False,
    )
    assert plugin.error is None
    assert plugin.data is not None
    plugin = plugin.data

    zip_bytes = zip_deployable(py_path)
    version = PluginVersion.create(
        client,
        "test-version",
        plugin_id=plugin.id,
        filebytes=zip_bytes,
        config_template=version_config_template,
    )
    # TODO: This is due to having to wait for the lambda to finish deploying.
    # TODO: We should update the task system to allow its .wait() to depend on this.
    version = _wait_for_version(version)

    instance = PluginInstance.create(
        client,
        space_id=space_id,
        plugin_id=plugin.id,
        plugin_version_id=version.id,
        config=instance_config,
    )
    instance = _wait_for_instance(instance)

    assert instance.plugin_id == plugin.id
    assert instance.plugin_version_id == version.id

    _check_user(client, instance)

    yield plugin, version, instance

    _delete_deployable(instance, version, plugin)
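
Because deploy_plugin yields a (plugin, version, instance) triple and then runs teardown, the later examples consume it with a with-statement. A minimal sketch of that wiring, assuming the original helpers module wraps the generator with contextlib.contextmanager:

from contextlib import contextmanager

# Assumed wiring: with this decorator applied, `with deploy_plugin(...) as (plugin, version, instance):`
# works as in the blockifier and tagger examples below, and the cleanup after the yield runs on exit.
deploy_plugin = contextmanager(deploy_plugin)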
Example #6
def test_embed_task():
    steamship = get_steamship_client()
    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        _ = index.insert("test", reindex=False)
        res = index.embed()

        assert res.task.task_id is not None
        assert res.task.state is not None
        assert res.task.task_created_on is not None
        assert res.task.task_last_modified_on is not None
        assert res.task.state == TaskState.waiting
        res.wait()
        assert res.task.state == TaskState.succeeded
Example #7
def test_e2e_corpus_export(client: Steamship):
    version_config_template = dict(
        text_column=dict(type="string"),
        tag_columns=dict(type="string"),
        tag_kind=dict(type="string"),
    )  # TODO (enias): Derive this from Config
    instance_config = dict(  # Has to match up
        text_column="Message",
        tag_columns="Category",
        tag_kind="Intent",
    )
    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    _input = ExportPluginInput(handle="default", type="file")

    csv_blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
            client,
            csv_blockifier_path,
            "blockifier",
            version_config_template=version_config_template,
            instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now export the corpus
            raw_data_r = exporter_plugin.export(_input)
            assert raw_data_r is not None

            # The results of a corpus exporter are base64 encoded
            _ = raw_data_r.data
Example #8
def test_delete_index():
    steamship = get_steamship_client()
    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    index = steamship.create_index(plugin_instance=plugin_instance.handle).data
    assert index.id is not None

    task = steamship.create_index(handle=index.handle, plugin_instance=plugin_instance.handle)
    assert task.error is None
    index2 = task.data
    assert index.id == index2.id

    index.delete()

    task = steamship.create_index(plugin_instance=plugin_instance.handle)
    assert task.error is None
    assert task.data is not None
    index3 = task.data
    assert index.id != index3.id
    index3.delete()
Example #9
def test_insert_many():
    steamship = get_steamship_client()
    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        item1 = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )
        item2 = EmbeddedItem(
            value="Rocket Ship",
            external_id="space",
            external_type="vehicle",
            metadata="Foo",
        )

        index.insert_many([item1, item2])
        index.embed().wait()

        task = index.list_items()
        assert task.error is None
        index_items = task.data
        assert len(index_items.items) == 2
        assert len(index_items.items[0].embedding) > 0
        assert len(index_items.items[1].embedding) > 0
        assert len(index_items.items[0].embedding) == len(index_items.items[1].embedding)

        res = index.search(item1.value, include_metadata=True, k=100)
        assert res.data.items is not None
        assert len(res.data.items) == 2
        assert res.data.items[0].value.value == item1.value
        assert res.data.items[0].value.external_id == item1.external_id
        assert res.data.items[0].value.external_type == item1.external_type
        _list_equal(res.data.items[0].value.metadata, item1.metadata)

        res = index.search(item2.value, include_metadata=True)
        assert res.data.items is not None
        assert res.data.items[0].value.value == item2.value
        assert res.data.items[0].value.external_id == item2.external_id
        assert res.data.items[0].value.external_type == item2.external_type
        assert res.data.items[0].value.metadata == item2.metadata
Example #10
def test_empty_queries():
    steamship = get_steamship_client()

    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance.handle) as index:
        a1 = "Ted can eat an entire block of cheese."
        a2 = "Joe can drink an entire glass of water."
        _ = index.insert_many([a1, a2])
        index.embed().wait()

        search_results = index.search(None)
        assert search_results.error is not None

        # An empty list and an empty string technically don't count as empty queries.
        # Leaving these assertions here to capture the current behavior in case we want to change it.
        search_results = index.search([])
        # noinspection PyUnresolvedReferences
        assert len(search_results.data.items) == 0

        search_results = index.search("")
        # noinspection PyUnresolvedReferences
        assert len(search_results.data.items) == 1
Example #11
def test_e2e_third_party_trainable_tagger_lambda_training():
    client = get_steamship_client()
    spaceR = Space.get(client)  # TODO (enias): Remove
    assert spaceR.data is not None

    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,  # Don't care if it already exists
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    third_party_trainable_tagger_path = (
        PLUGINS_PATH / "taggers" / "plugin_third_party_trainable_tagger.py")

    # Note that we're going to do the below training on ZERO data for simplicity.
    # The particular test model doesn't actually incorporate any data given to it at training time, so
    # it would just slow the test down to create, blockify, and export a training corpus.

    with deploy_plugin(
        client,
        third_party_trainable_tagger_path,
        "tagger",
        training_platform=HostingType.LAMBDA,
    ) as (tagger, tagger_version, tagger_instance):
        # Now train the plugin
        training_request = TrainingParameterPluginInput(
            plugin_instance=tagger_instance.handle,
            export_plugin_input=ExportPluginInput(
                plugin_instance=exporter_plugin.handle,
                type="file",
                query="all"),
        )
        train_result = tagger_instance.train(training_request)
        train_result.wait()
        assert train_result.data is not None
        output = train_result.data
        assert output.training_complete
        assert output.training_reference_data is not None
        assert output.training_reference_data["num_checkins"] == 3

        logging.info("Waiting 15 seconds for instance to deploy.")
        import time

        time.sleep(15)

        # Now we'll attempt to USE this plugin. This plugin's behavior is to simply tag every block with
        # the labels in `MockClient.LABELS`.

        # First we'll create a file
        test_doc = "Hi there"
        res = tagger_instance.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert res.data.file is not None
        assert not res.data.file.tags
        assert res.data.file.blocks is not None
        assert len(res.data.file.blocks) > 0
        for block in res.data.file.blocks:
            assert block.tags is not None
            assert sorted([tag.name for tag in block.tags]) == sorted(MockClient.LABELS)
Example #12
def test_e2e_trainable_tagger_lambda_training(client: Steamship):

    version_config_template = dict(
        text_column=dict(type="string"),
        tag_columns=dict(type="string"),
        tag_kind=dict(type="string"),
    )
    instance_config = dict(text_column="Message", tag_columns="Category", tag_kind="Intent")

    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    csv_blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"
    trainable_tagger_path = PLUGINS_PATH / "taggers" / "plugin_trainable_tagger.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
        client,
        csv_blockifier_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now make a trainable tagger to train on those tags
            with deploy_plugin(
                client, trainable_tagger_path, "tagger", training_platform=HostingType.LAMBDA
            ) as (tagger, tagger_version, tagger_instance):
                # Now train the plugin
                training_request = TrainingParameterPluginInput(
                    plugin_instance=tagger_instance.handle,
                    export_plugin_input=ExportPluginInput(
                        plugin_instance=EXPORTER_HANDLE, type="file", query='kind "foo1"'
                    ),
                    training_params=dict(
                        keyword_list=KEYWORDS  # This is a key defined by the test model we're training
                    ),
                )

                train_result = tagger_instance.train(training_request)
                train_result.wait()

                # At this point, the PluginInstance will have written a parameter file to disk. We should be able to
                # retrieve it since we know that it is tagged as the `default`.

                checkpoint = ModelCheckpoint(
                    client=client,
                    handle="default",
                    plugin_instance_id=tagger_instance.id,
                )
                checkpoint_path = checkpoint.download_model_bundle()
                assert checkpoint_path.exists()
                keyword_path = Path(checkpoint_path) / TestTrainableTaggerModel.KEYWORD_LIST_FILE
                assert keyword_path.exists()
                with open(keyword_path, "r") as f:
                    params = json.loads(f.read())
                    assert params == KEYWORDS

                logging.info("Waiting 15 seconds for instance to deploy.")
                import time

                time.sleep(15)

                # If we're here, we have verified that the plugin instance has correctly recorded its parameters
                # into the pluginData bucket under a path unique to the PluginInstance/ModelCheckpoint.

                # Now we'll attempt to USE this plugin. This plugin's behavior is to simply tag any file with the
                # tags that parameterize it. Since those tags are (see above) ["product", "coupon"], we should expect
                # this tagger to apply those tags to any file provided to it.

                # First we'll create a file
                test_doc = "Hi there"
                res = tagger_instance.tag(doc=test_doc)
                res.wait()
                assert res.error is None
                assert res.data is not None
                assert res.data.file is not None
                assert res.data.file.tags is not None
                assert len(res.data.file.tags) == len(KEYWORDS)
                assert sorted([tag.name for tag in res.data.file.tags]) == sorted(KEYWORDS)
Example #13
def test_create_index():
    client = get_steamship_client()
    plugin_instance = PluginInstance.create(client, plugin_handle=_TEST_EMBEDDER).data
    create_index(client, plugin_instance.handle)