Example #1
 def setUp(self) -> None:
     self.model_config = BertModelConfig(
         model_id="bert_tiny",
         query_input_size=4,
         doc_input_size=8,
         tokenizer=os.path.join(os.environ["RESOURCES_DIR"],
                                "bert_tiny_tokenizer"),
     )
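The snippets in this listing are taken from pyvespa's test suite, so imports and local test resources (RESOURCES_DIR, WORK_DIR) are assumed to be in place. As a rough sketch, for an older pyvespa release where BertModelConfig and add_model_ranking are available, the imports look roughly like this (module paths may differ between versions and should be verified):

import os

# Assumed module locations; adjust to your pyvespa version.
from vespa.package import ApplicationPackage, Field, FieldSet, RankProfile, SecondPhaseRanking
from vespa.ml import BertModelConfig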
Example #2
 def setUp(self) -> None:
     #
     # Create application package
     #
     self.app_package = ApplicationPackage(name="cord19")
     self.app_package.schema.add_fields(
         Field(name="cord_uid", type="string", indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
     )
     self.app_package.schema.add_field_set(
         FieldSet(name="default", fields=["title"])
     )
     self.app_package.schema.add_rank_profile(
         RankProfile(name="bm25", first_phase="bm25(title)")
     )
     self.bert_config = BertModelConfig(
         model_id="pretrained_bert_tiny",
         tokenizer="google/bert_uncased_L-2_H-128_A-2",
         model="google/bert_uncased_L-2_H-128_A-2",
         query_input_size=5,
         doc_input_size=10,
     )
     self.app_package.add_model_ranking(
         model_config=self.bert_config,
         include_model_summary_features=True,
         inherits="default",
         first_phase="bm25(title)",
         second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
     )
     #
     # Deploy on Vespa Cloud
     #
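     # The user key is read from VESPA_CLOUD_USER_KEY with literal "\n" escapes; restore real newlines before passing it on.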
     self.vespa_cloud = VespaCloud(
         tenant="vespa-team",
         application="pyvespa-integration",
         key_content=os.getenv("VESPA_CLOUD_USER_KEY").replace(r"\n", "\n"),
         application_package=self.app_package,
     )
     self.disk_folder = os.path.join(os.getenv("WORK_DIR"), "sample_application")
     self.instance_name = "test"
     self.app = self.vespa_cloud.deploy(
         instance=self.instance_name, disk_folder=self.disk_folder
     )
Example #3
def create_cord19_application_package():
    app_package = ApplicationPackage(name="cord19")
    app_package.schema.add_fields(
        Field(name="id", type="string", indexing=["attribute", "summary"]),
        Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
    )
    app_package.schema.add_field_set(FieldSet(name="default",
                                              fields=["title"]))
    app_package.schema.add_rank_profile(
        RankProfile(name="bm25", first_phase="bm25(title)"))
    bert_config = BertModelConfig(
        model_id="pretrained_bert_tiny",
        tokenizer="google/bert_uncased_L-2_H-128_A-2",
        model="google/bert_uncased_L-2_H-128_A-2",
        query_input_size=5,
        doc_input_size=10,
    )
    app_package.add_model_ranking(
        model_config=bert_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
    )
    return app_package
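The factory above only builds the application package. A minimal, hypothetical driver in the style of the VespaDocker deployments shown further down (port and disk folder are illustrative):

# Hypothetical driver; mirrors the VespaDocker examples below.
app_package = create_cord19_application_package()
vespa_docker = VespaDocker(port=8089)
app = vespa_docker.deploy(
    application_package=app_package,
    disk_folder="/tmp/sample_application",  # illustrative path
)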
Example #4
    def setUp(self) -> None:
        self.app_package = ApplicationPackage(name="test_app")

        self.app_package.schema.add_fields(
            Field(name="id", type="string", indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="body",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
        )
        self.app_package.schema.add_field_set(
            FieldSet(name="default", fields=["title", "body"]))
        self.app_package.schema.add_rank_profile(
            RankProfile(name="default", first_phase="nativeRank(title, body)"))
        self.app_package.schema.add_rank_profile(
            RankProfile(
                name="bm25",
                first_phase="bm25(title) + bm25(body)",
                inherits="default",
            ))
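        # Declare the query-side tensor type (query profile type) and default query properties (query profile).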
        self.app_package.query_profile_type.add_fields(
            QueryTypeField(
                name="ranking.features.query(query_bert)",
                type="tensor<float>(x[768])",
            ))
        self.app_package.query_profile.add_fields(
            QueryField(name="maxHits", value=100),
            QueryField(name="anotherField", value="string_value"),
        )

        bert_config = BertModelConfig(
            model_id="bert_tiny",
            query_input_size=4,
            doc_input_size=8,
            tokenizer=os.path.join(os.environ["RESOURCES_DIR"],
                                   "bert_tiny_tokenizer"),
            model=os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model"),
        )

        self.app_package.add_model_ranking(
            model_config=bert_config,
            include_model_summary_features=True,
            inherits="default",
            first_phase="bm25(title)",
            second_phase=SecondPhaseRanking(rerank_count=10,
                                            expression="logit1"),
        )
Example #5
class TestBertModelConfig(unittest.TestCase):
    def setUp(self) -> None:
        self.model_config = BertModelConfig(
            model_id="bert_tiny",
            query_input_size=4,
            doc_input_size=8,
            tokenizer=os.path.join(os.environ["RESOURCES_DIR"],
                                   "bert_tiny_tokenizer"),
            model=os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model"),
        )

    def test_serialization(self):
        self.assertEqual(self.model_config,
                         BertModelConfig.from_dict(self.model_config.to_dict))

    def test_predict(self):
        prediction = self.model_config.predict(
            queries=["this is one query", "this is another query"],
            docs=["this is one document", "this is another document"],
        )
        self.assertEqual(len(prediction), 2)
        self.assertEqual(len(prediction[0]), 2)
        self.assertEqual(len(prediction[1]), 2)

    def _predict_with_onnx(self, onnx_file_path, model_inputs):
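        # Run the exported ONNX model through onnxruntime's InferenceSession and return its output.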
        # Required to run on macOS: https://stackoverflow.com/a/53014308
        os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
        m = InferenceSession(onnx_file_path)
        (out, ) = m.run(input_feed=model_inputs, output_names=["output_0"])
        return out

    def test_export_to_onnx(self):
        output_path = "test_model.onnx"
        self.model_config.export_to_onnx(output_path=output_path)
        model_inputs = self.model_config.create_encodings(
            queries=["this is a query"], docs=["this is a document"])
        assert_almost_equal(
            self._predict_with_onnx(output_path, model_inputs),
            self.model_config.predict(queries=["this is a query"],
                                      docs=["this is a document"]),
        )
        os.remove(output_path)
Example #6
 def setUp(self) -> None:
     #
     # Create application package
     #
     self.app_package = ApplicationPackage(name="cord19")
     self.app_package.schema.add_fields(
         Field(name="cord_uid",
               type="string",
               indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
     )
     self.app_package.schema.add_field_set(
         FieldSet(name="default", fields=["title"]))
     self.app_package.schema.add_rank_profile(
         RankProfile(name="bm25", first_phase="bm25(title)"))
     self.bert_config = BertModelConfig(
         model_id="pretrained_bert_tiny",
         tokenizer="google/bert_uncased_L-2_H-128_A-2",
         model="google/bert_uncased_L-2_H-128_A-2",
         query_input_size=5,
         doc_input_size=10,
     )
     self.app_package.add_model_ranking(
         model_config=self.bert_config,
         include_model_summary_features=True,
         inherits="default",
         first_phase="bm25(title)",
         second_phase=SecondPhaseRanking(rerank_count=10,
                                         expression="logit1"),
     )
     self.disk_folder = os.path.join(os.getenv("WORK_DIR"),
                                     "sample_application")
     self.vespa_docker = VespaDocker(port=8089)
     self.app = self.vespa_docker.deploy(
         application_package=self.app_package, disk_folder=self.disk_folder)
Example #7
 def test_serialization(self):
     self.assertEqual(self.model_config,
                      BertModelConfig.from_dict(self.model_config.to_dict))
Example #8
class TestBertModelConfigTokenizerOnly(unittest.TestCase):
    def setUp(self) -> None:
        self.model_config = BertModelConfig(
            model_id="bert_tiny",
            query_input_size=4,
            doc_input_size=8,
            tokenizer=os.path.join(os.environ["RESOURCES_DIR"],
                                   "bert_tiny_tokenizer"),
        )

    def test_serialization(self):
        self.assertEqual(self.model_config,
                         BertModelConfig.from_dict(self.model_config.to_dict))

    def test_add_model(self):
        self.assertIsNone(self.model_config.model)
        self.assertIsNone(self.model_config._model)
        self.model_config.add_model(
            model=os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model"))
        self.assertEqual(
            self.model_config.model,
            os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model"),
        )
        self.assertIsInstance(self.model_config._model,
                              BertForSequenceClassification)

    def test_doc_fields(self):
        self.assertDictEqual(
            self.model_config.doc_fields(text="this is a test"),
            {
                "bert_tiny_doc_token_ids": {
                    "values": [2023, 2003, 1037, 3231, 0, 0, 0]
                }
            },
        )

    def test_query_tensor_mapping(self):
        self.assertEqual(
            self.model_config.query_tensor_mapping(text="this is a test"),
            [2023, 2003])

    def test_create_encodings(self):
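        # query_input_size=4 and doc_input_size=8 cap each segment (including [CLS]/[SEP]) and pad with zeros, so each row has 4 + 8 = 12 positions.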
        self.assertDictEqual(
            self.model_config.create_encodings(
                queries=["this is one query", "this is another query"],
                docs=["this is one document", "this is another document"],
            ),
            {
                "input_ids": [
                    [
                        101, 2023, 2003, 102, 2023, 2003, 2028, 6254, 102, 0,
                        0, 0
                    ],
                    [
                        101, 2023, 2003, 102, 2023, 2003, 2178, 6254, 102, 0,
                        0, 0
                    ],
                ],
                "token_type_ids": [
                    [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
                    [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
                ],
                "attention_mask": [
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                ],
            },
        )

    def test_create_encodings_with_tensors(self):
        encodings = self.model_config.create_encodings(
            queries=["this is one query", "this is another query"],
            docs=["this is one document", "this is another document"],
            return_tensors=True,
        )
        self.assertIsInstance(encodings["input_ids"], Tensor)
        self.assertIsInstance(encodings["token_type_ids"], Tensor)
        self.assertIsInstance(encodings["attention_mask"], Tensor)

    def test_predict(self):
        with self.assertRaises(ValueError):
            self.model_config.predict(
                queries=["this is one query", "this is another query"],
                docs=["this is one document", "this is another document"],
            )

    def test_export_to_onnx(self):
        with self.assertRaises(ValueError):
            self.model_config.export_to_onnx(output_path="test_model.onnx")
Example #9
class TestOnnxModelDockerDeployment(unittest.TestCase):
    def setUp(self) -> None:
        #
        # Create application package
        #
        self.app_package = ApplicationPackage(name="cord19")
        self.app_package.schema.add_fields(
            Field(name="cord_uid",
                  type="string",
                  indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
        )
        self.app_package.schema.add_field_set(
            FieldSet(name="default", fields=["title"]))
        self.app_package.schema.add_rank_profile(
            RankProfile(name="bm25", first_phase="bm25(title)"))
        self.bert_config = BertModelConfig(
            model_id="pretrained_bert_tiny",
            tokenizer="google/bert_uncased_L-2_H-128_A-2",
            model="google/bert_uncased_L-2_H-128_A-2",
            query_input_size=5,
            doc_input_size=10,
        )
        self.app_package.add_model_ranking(
            model_config=self.bert_config,
            include_model_summary_features=True,
            inherits="default",
            first_phase="bm25(title)",
            second_phase=SecondPhaseRanking(rerank_count=10,
                                            expression="logit1"),
        )
        self.disk_folder = os.path.join(os.getenv("WORK_DIR"),
                                        "sample_application")
        self.vespa_docker = VespaDocker(port=8089)
        self.app = self.vespa_docker.deploy(
            application_package=self.app_package, disk_folder=self.disk_folder)

    def test_deploy(self):
        self.assertTrue(
            any(
                re.match("Generation: [0-9]+", line)
                for line in self.app.deployment_message))
        self.assertEqual(self.app.get_application_status().status_code, 200)

    def test_data_operation(self):
        #
        # Get data that does not exist
        #
        self.assertEqual(
            self.app.get_data(schema="cord19", data_id="1").status_code, 404)
        #
        # Feed a data point
        #
        fields = {
            "cord_uid": "1",
            "title": "this is my first title",
        }
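        # doc_fields() produces the "pretrained_bert_tiny_doc_token_ids" values consumed by the model's rank profile.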
        fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
        response = self.app.feed_data_point(
            schema="cord19",
            data_id="1",
            fields=fields,
        )
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Get data that exist
        #
        response = self.app.get_data(schema="cord19", data_id="1")
        self.assertEqual(response.status_code, 200)
        embedding_values = fields["pretrained_bert_tiny_doc_token_ids"][
            "values"]
        self.assertDictEqual(
            response.json(),
            {
                "fields": {
                    "cord_uid": "1",
                    "title": "this is my first title",
                    "pretrained_bert_tiny_doc_token_ids": {
                        "cells": [{
                            "address": {
                                "d0": str(x)
                            },
                            "value": float(embedding_values[x]),
                        } for x in range(len(embedding_values))]
                    },
                },
                "id": "id:cord19:cord19::1",
                "pathId": "/document/v1/cord19/cord19/docid/1",
            },
        )
        #
        # Update data
        #
        fields = {"title": "this is my updated title"}
        fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
        response = self.app.update_data(schema="cord19",
                                        data_id="1",
                                        fields=fields)
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Get the updated data point
        #
        response = self.app.get_data(schema="cord19", data_id="1")
        self.assertEqual(response.status_code, 200)
        embedding_values = fields["pretrained_bert_tiny_doc_token_ids"][
            "values"]
        self.assertDictEqual(
            response.json(),
            {
                "fields": {
                    "cord_uid": "1",
                    "title": "this is my updated title",
                    "pretrained_bert_tiny_doc_token_ids": {
                        "cells": [{
                            "address": {
                                "d0": str(x)
                            },
                            "value": float(embedding_values[x]),
                        } for x in range(len(embedding_values))]
                    },
                },
                "id": "id:cord19:cord19::1",
                "pathId": "/document/v1/cord19/cord19/docid/1",
            },
        )
        #
        # Delete a data point
        #
        response = self.app.delete_data(schema="cord19", data_id="1")
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Deleted data should be gone
        #
        self.assertEqual(
            self.app.get_data(schema="cord19", data_id="1").status_code, 404)

    def _parse_vespa_tensor(self, hit, feature):
        return [
            x["value"]
            for x in hit["fields"]["summaryfeatures"][feature]["cells"]
        ]

    def test_rank_input_output(self):
        #
        # Feed a data point
        #
        fields = {
            "cord_uid": "1",
            "title": "this is my first title",
        }
        fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
        response = self.app.feed_data_point(
            schema="cord19",
            data_id="1",
            fields=fields,
        )
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Run a test query
        #
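        # query_tensor_mapping turns the query text into the token-id tensor sent as the query ranking feature.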
        result = self.app.query(
            query="this is a test",
            query_model=QueryModel(
                query_properties=[
                    QueryRankingFeature(
                        name=self.bert_config.query_token_ids_name,
                        mapping=self.bert_config.query_tensor_mapping,
                    )
                ],
                match_phase=OR(),
                rank_profile=Ranking(name="pretrained_bert_tiny"),
            ),
        )
        vespa_input_ids = self._parse_vespa_tensor(
            result.hits[0], "rankingExpression(input_ids)")
        vespa_attention_mask = self._parse_vespa_tensor(
            result.hits[0], "rankingExpression(attention_mask)")
        vespa_token_type_ids = self._parse_vespa_tensor(
            result.hits[0], "rankingExpression(token_type_ids)")

        expected_inputs = self.bert_config.create_encodings(
            queries=["this is a test"], docs=["this is my first title"])
        self.assertEqual(vespa_input_ids, expected_inputs["input_ids"][0])
        self.assertEqual(vespa_attention_mask,
                         expected_inputs["attention_mask"][0])
        self.assertEqual(vespa_token_type_ids,
                         expected_inputs["token_type_ids"][0])

        expected_logits = self.bert_config.predict(
            queries=["this is a test"], docs=["this is my first title"])
        self.assertAlmostEqual(
            result.hits[0]["fields"]["summaryfeatures"]
            ["rankingExpression(logit0)"],
            expected_logits[0][0],
            5,
        )
        self.assertAlmostEqual(
            result.hits[0]["fields"]["summaryfeatures"]
            ["rankingExpression(logit1)"],
            expected_logits[0][1],
            5,
        )

    def tearDown(self) -> None:
        shutil.rmtree(self.disk_folder, ignore_errors=True)
        self.vespa_docker.container.stop()
        self.vespa_docker.container.remove()