예제 #1
0
def create_cord19_application_package():
    """Build and return the "cord19" application package.

    The schema gets an ``id`` attribute field and a BM25-enabled ``title``
    field, a ``default`` fieldset over ``title``, a ``bm25`` rank profile,
    and a model ranking configured from a ``BertModelConfig`` whose second
    phase reranks 10 hits with the ``logit1`` expression.

    Returns:
        The configured ``ApplicationPackage`` instance.
    """
    package = ApplicationPackage(name="cord19")
    schema = package.schema

    # Document fields.
    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    title_field = Field(
        name="title",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    schema.add_fields(id_field, title_field)

    # Fieldset and the plain BM25 rank profile.
    default_fieldset = FieldSet(name="default", fields=["title"])
    schema.add_field_set(default_fieldset)
    bm25_profile = RankProfile(name="bm25", first_phase="bm25(title)")
    schema.add_rank_profile(bm25_profile)

    # BERT model configuration and the ranking built on top of it.
    reranking = SecondPhaseRanking(rerank_count=10, expression="logit1")
    model_config = BertModelConfig(
        model_id="pretrained_bert_tiny",
        tokenizer="google/bert_uncased_L-2_H-128_A-2",
        model="google/bert_uncased_L-2_H-128_A-2",
        query_input_size=5,
        doc_input_size=10,
    )
    package.add_model_ranking(
        model_config=model_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=reranking,
    )
    return package
예제 #2
0
    def setUp(self) -> None:
        """Create the "test_app" package fixture with BERT model ranking.

        The fixture has ``id``, ``title`` and ``body`` fields, a ``default``
        fieldset, ``default`` and ``bm25`` rank profiles, one query profile
        type field, two query profile fields, and a model ranking whose
        tokenizer and model paths are resolved under the ``RESOURCES_DIR``
        environment variable.
        """
        self.app_package = ApplicationPackage(name="test_app")
        package = self.app_package
        schema = package.schema

        # Document fields: one attribute field and two BM25-enabled text fields.
        id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
        title_field = Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        body_field = Field(
            name="body",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        schema.add_fields(id_field, title_field, body_field)

        # Fieldset and rank profiles.
        schema.add_field_set(FieldSet(name="default", fields=["title", "body"]))
        default_profile = RankProfile(
            name="default", first_phase="nativeRank(title, body)"
        )
        schema.add_rank_profile(default_profile)
        bm25_profile = RankProfile(
            name="bm25",
            first_phase="bm25(title) + bm25(body)",
            inherits="default",
        )
        schema.add_rank_profile(bm25_profile)

        # Query profile type and query profile entries.
        query_bert_type = QueryTypeField(
            name="ranking.features.query(query_bert)",
            type="tensor<float>(x[768])",
        )
        package.query_profile_type.add_fields(query_bert_type)
        package.query_profile.add_fields(
            QueryField(name="maxHits", value=100),
            QueryField(name="anotherField", value="string_value"),
        )

        # Tokenizer and model both live under RESOURCES_DIR.
        tokenizer_path = os.path.join(
            os.environ["RESOURCES_DIR"], "bert_tiny_tokenizer"
        )
        model_path = os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model")
        bert_config = BertModelConfig(
            model_id="bert_tiny",
            query_input_size=4,
            doc_input_size=8,
            tokenizer=tokenizer_path,
            model=model_path,
        )

        reranking = SecondPhaseRanking(rerank_count=10, expression="logit1")
        package.add_model_ranking(
            model_config=bert_config,
            include_model_summary_features=True,
            inherits="default",
            first_phase="bm25(title)",
            second_phase=reranking,
        )
예제 #3
0
 def setUp(self) -> None:
     """Build the "cord19" package with BERT ranking and deploy it via VespaCloud.

     Reads the ``VESPA_CLOUD_USER_KEY`` and ``WORK_DIR`` environment
     variables. The resulting objects are stored on ``self`` as
     ``app_package``, ``bert_config``, ``vespa_cloud``, ``disk_folder``,
     ``instance_name`` and ``app``.

     Raises:
         KeyError: If ``VESPA_CLOUD_USER_KEY`` or ``WORK_DIR`` is not set.
     """
     #
     # Create application package
     #
     self.app_package = ApplicationPackage(name="cord19")
     self.app_package.schema.add_fields(
         Field(name="cord_uid", type="string", indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
     )
     self.app_package.schema.add_field_set(
         FieldSet(name="default", fields=["title"])
     )
     self.app_package.schema.add_rank_profile(
         RankProfile(name="bm25", first_phase="bm25(title)")
     )
     self.bert_config = BertModelConfig(
         model_id="pretrained_bert_tiny",
         tokenizer="google/bert_uncased_L-2_H-128_A-2",
         model="google/bert_uncased_L-2_H-128_A-2",
         query_input_size=5,
         doc_input_size=10,
     )
     self.app_package.add_model_ranking(
         model_config=self.bert_config,
         include_model_summary_features=True,
         inherits="default",
         first_phase="bm25(title)",
         second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
     )
     #
     # Deploy on Vespa Cloud
     #
     # os.environ[...] raises a KeyError naming the missing variable, instead
     # of the AttributeError/TypeError that a None from os.getenv would cause.
     # The key is stored with literal "\n" sequences; turn them into newlines.
     user_key = os.environ["VESPA_CLOUD_USER_KEY"].replace(r"\n", "\n")
     self.vespa_cloud = VespaCloud(
         tenant="vespa-team",
         application="pyvespa-integration",
         key_content=user_key,
         application_package=self.app_package,
     )
     self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
     self.instance_name = "test"
     self.app = self.vespa_cloud.deploy(
         instance=self.instance_name, disk_folder=self.disk_folder
     )
예제 #4
0
 def setUp(self) -> None:
     """Create a "test_app" package wrapping a pre-built "msmarco" schema.

     The schema holds ``id``, ``title`` and ``body`` fields, a ``default``
     fieldset over ``title`` and ``body``, and ``default`` / ``bm25`` rank
     profiles. The package is stored in ``self.app_package``.
     """
     id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
     title_field = Field(
         name="title",
         type="string",
         indexing=["index", "summary"],
         index="enable-bm25",
     )
     body_field = Field(
         name="body",
         type="string",
         indexing=["index", "summary"],
         index="enable-bm25",
     )
     msmarco_document = Document(fields=[id_field, title_field, body_field])

     default_profile = RankProfile(
         name="default", first_phase="nativeRank(title, body)"
     )
     bm25_profile = RankProfile(
         name="bm25",
         first_phase="bm25(title) + bm25(body)",
         inherits="default",
     )

     test_schema = Schema(
         name="msmarco",
         document=msmarco_document,
         fieldsets=[FieldSet(name="default", fields=["title", "body"])],
         rank_profiles=[default_profile, bm25_profile],
     )
     self.app_package = ApplicationPackage(name="test_app", schema=test_schema)
예제 #5
0
 def setUp(self) -> None:
     """Build the "msmarco" package and deploy it via VespaCloud.

     Reads the ``VESPA_CLOUD_USER_KEY`` and ``WORK_DIR`` environment
     variables. Stores ``vespa_cloud``, ``disk_folder``, ``instance_name``
     and ``app`` on ``self``; the application package itself is only a local.

     Raises:
         KeyError: If ``VESPA_CLOUD_USER_KEY`` or ``WORK_DIR`` is not set.
     """
     #
     # Create application package
     #
     document = Document(
         fields=[
             Field(name="id", type="string", indexing=["attribute", "summary"]),
             Field(
                 name="title",
                 type="string",
                 indexing=["index", "summary"],
                 index="enable-bm25",
             ),
             Field(
                 name="body",
                 type="string",
                 indexing=["index", "summary"],
                 index="enable-bm25",
             ),
             Field(
                 name="metadata",
                 type="string",
                 indexing=["attribute", "summary"],
                 attribute=["fast-search", "fast-access"],
             ),
             Field(
                 name="tensor_field",
                 type="tensor<float>(x[128])",
                 indexing=["attribute"],
                 ann=HNSW(
                     distance_metric="euclidean",
                     max_links_per_node=16,
                     neighbors_to_explore_at_insert=200,
                 ),
             ),
         ]
     )
     msmarco_schema = Schema(
         name="msmarco",
         document=document,
         fieldsets=[FieldSet(name="default", fields=["title", "body"])],
         rank_profiles=[
             RankProfile(name="default", first_phase="nativeRank(title, body)")
         ],
     )
     app_package = ApplicationPackage(name="msmarco", schema=msmarco_schema)
     #
     # Deploy on Vespa Cloud
     #
     # os.environ[...] raises a KeyError naming the missing variable, instead
     # of the AttributeError/TypeError that a None from os.getenv would cause.
     # The key is stored with literal "\n" sequences; turn them into newlines.
     user_key = os.environ["VESPA_CLOUD_USER_KEY"].replace(r"\n", "\n")
     self.vespa_cloud = VespaCloud(
         tenant="vespa-team",
         application="pyvespa-integration",
         key_content=user_key,
         application_package=app_package,
     )
     self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
     self.instance_name = "test"
     self.app = self.vespa_cloud.deploy(
         instance=self.instance_name, disk_folder=self.disk_folder
     )
예제 #6
0
 def setUp(self) -> None:
     """Build the "cord19" package with BERT ranking and deploy it via VespaDocker.

     Reads the ``WORK_DIR`` environment variable to locate the
     ``sample_application`` disk folder. Stores ``app_package``,
     ``bert_config``, ``disk_folder``, ``vespa_docker`` and ``app`` on
     ``self``.

     Raises:
         KeyError: If ``WORK_DIR`` is not set.
     """
     #
     # Create application package
     #
     self.app_package = ApplicationPackage(name="cord19")
     self.app_package.schema.add_fields(
         Field(name="cord_uid",
               type="string",
               indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
     )
     self.app_package.schema.add_field_set(
         FieldSet(name="default", fields=["title"]))
     self.app_package.schema.add_rank_profile(
         RankProfile(name="bm25", first_phase="bm25(title)"))
     self.bert_config = BertModelConfig(
         model_id="pretrained_bert_tiny",
         tokenizer="google/bert_uncased_L-2_H-128_A-2",
         model="google/bert_uncased_L-2_H-128_A-2",
         query_input_size=5,
         doc_input_size=10,
     )
     self.app_package.add_model_ranking(
         model_config=self.bert_config,
         include_model_summary_features=True,
         inherits="default",
         first_phase="bm25(title)",
         second_phase=SecondPhaseRanking(rerank_count=10,
                                         expression="logit1"),
     )
     # os.environ[...] raises a KeyError naming the missing variable, instead
     # of the TypeError os.path.join raises when os.getenv returns None.
     self.disk_folder = os.path.join(os.environ["WORK_DIR"],
                                     "sample_application")
     self.vespa_docker = VespaDocker(port=8089)
     self.app = self.vespa_docker.deploy(
         application_package=self.app_package, disk_folder=self.disk_folder)
예제 #7
0
    def setUp(self) -> None:
        """Create the "test_app" package fixture including an HNSW tensor field.

        The fixture has ``id``, ``title``, ``body`` and ``tensor_field``
        fields, a ``default`` fieldset, ``default`` and ``bm25`` rank
        profiles, one query profile type field and two query profile fields.
        """
        self.app_package = ApplicationPackage(name="test_app")
        package = self.app_package
        schema = package.schema

        # Document fields.
        id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
        title_field = Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        body_field = Field(
            name="body",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        hnsw_settings = HNSW(
            distance_metric="euclidean",
            max_links_per_node=16,
            neighbors_to_explore_at_insert=200,
        )
        tensor_field = Field(
            name="tensor_field",
            type="tensor<float>(x[128])",
            indexing=["attribute"],
            attribute=["fast-search", "fast-access"],
            ann=hnsw_settings,
        )
        schema.add_fields(id_field, title_field, body_field, tensor_field)

        # Fieldset and rank profiles.
        schema.add_field_set(FieldSet(name="default", fields=["title", "body"]))
        default_profile = RankProfile(
            name="default", first_phase="nativeRank(title, body)"
        )
        schema.add_rank_profile(default_profile)
        bm25_profile = RankProfile(
            name="bm25",
            first_phase="bm25(title) + bm25(body)",
            inherits="default",
        )
        schema.add_rank_profile(bm25_profile)

        # Query profile type and query profile entries.
        query_bert_type = QueryTypeField(
            name="ranking.features.query(query_bert)",
            type="tensor<float>(x[768])",
        )
        package.query_profile_type.add_fields(query_bert_type)
        package.query_profile.add_fields(
            QueryField(name="maxHits", value=100),
            QueryField(name="anotherField", value="string_value"),
        )
예제 #8
0
 def setUp(self) -> None:
     """Build the "msmarco" package and resolve the sample-application folder.

     Reads the ``WORK_DIR`` environment variable. Stores ``app_package`` and
     ``disk_folder`` on ``self``.

     Raises:
         KeyError: If ``WORK_DIR`` is not set.
     """
     #
     # Create application package
     #
     document = Document(fields=[
         Field(name="id", type="string", indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
         Field(
             name="body",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
         Field(
             name="metadata",
             type="string",
             indexing=["attribute", "summary"],
             attribute=["fast-search", "fast-access"],
         ),
         Field(
             name="tensor_field",
             type="tensor<float>(x[128])",
             indexing=["attribute"],
             ann=HNSW(
                 distance_metric="euclidean",
                 max_links_per_node=16,
                 neighbors_to_explore_at_insert=200,
             ),
         ),
     ])
     msmarco_schema = Schema(
         name="msmarco",
         document=document,
         fieldsets=[FieldSet(name="default", fields=["title", "body"])],
         rank_profiles=[
             RankProfile(name="default",
                         first_phase="nativeRank(title, body)")
         ],
     )
     self.app_package = ApplicationPackage(name="msmarco",
                                           schema=msmarco_schema)
     # os.environ[...] raises a KeyError naming the missing variable, instead
     # of the TypeError os.path.join raises when os.getenv returns None.
     self.disk_folder = os.path.join(os.environ["WORK_DIR"],
                                     "sample_application")
예제 #9
0
def create_msmarco_application_package():
    """Build and return the "msmarco" application package.

    The document has an ``id`` attribute field, BM25-enabled ``title`` and
    ``body`` text fields, a ``metadata`` attribute field, and a
    ``tensor_field`` configured with HNSW settings. The schema adds a
    ``default`` fieldset over ``title`` and ``body`` and a ``default`` rank
    profile using ``nativeRank(title, body)``.

    Returns:
        The ``ApplicationPackage`` built with a one-element schema list.
    """
    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    # ``title`` and ``body`` share the same configuration apart from the name.
    text_fields = [
        Field(
            name=field_name,
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        for field_name in ("title", "body")
    ]
    metadata_field = Field(
        name="metadata",
        type="string",
        indexing=["attribute", "summary"],
        attribute=["fast-search", "fast-access"],
    )
    hnsw_settings = HNSW(
        distance_metric="euclidean",
        max_links_per_node=16,
        neighbors_to_explore_at_insert=200,
    )
    tensor_field = Field(
        name="tensor_field",
        type="tensor<float>(x[128])",
        indexing=["attribute", "index"],
        ann=hnsw_settings,
    )
    document = Document(
        fields=[id_field, *text_fields, metadata_field, tensor_field]
    )

    default_fieldset = FieldSet(name="default", fields=["title", "body"])
    default_profile = RankProfile(
        name="default", first_phase="nativeRank(title, body)"
    )
    msmarco_schema = Schema(
        name="msmarco",
        document=document,
        fieldsets=[default_fieldset],
        rank_profiles=[default_profile],
    )
    return ApplicationPackage(name="msmarco", schema=[msmarco_schema])
예제 #10
0
 def test_application_package(self):
     """Check that the package equals itself after a to_dict/from_dict round trip."""
     original = self.app_package
     rebuilt = ApplicationPackage.from_dict(self.app_package.to_dict)
     self.assertEqual(original, rebuilt)
예제 #11
0
          indexing=["index", "summary"],
          index="enable-bm25"),
    Field(name="body",
          type="string",
          indexing=["index", "summary"],
          index="enable-bm25")
])

from vespa.package import Schema, FieldSet, RankProfile

# Schema "msmarco": wraps `document` (defined earlier in this script), with a
# "default" fieldset over title/body and a nativeRank-based default profile.
msmarco_schema = Schema(
    name="msmarco",
    document=document,
    fieldsets=[FieldSet(name="default", fields=["title", "body"])],
    rank_profiles=[
        RankProfile(name="default", first_phase="nativeRank(title, body)")
    ])

from vespa.package import ApplicationPackage

# Application package named "msmarco" built around the schema above.
app_package = ApplicationPackage(name="msmarco", schema=msmarco_schema)

from vespa.package import VespaDocker

# NOTE(review): `path` has no leading "/", so it is a relative path —
# presumably "/mnt/c/..." was intended; confirm before relying on it.
path = "mnt/c/Users/User/OneDrive - NTNU/NTNU/Prosjekt oppgave NLP/"
name = "virke_denne_gangen/"

# Folder passed to deploy() as `disk_folder`, formed by plain concatenation.
app_path = path + name

# Deploy the application package through VespaDocker with default settings.
vespa_docker = VespaDocker()
vespa_docker.deploy(application_package=app_package, disk_folder=app_path)
예제 #12
0
class TestSimplifiedApplicationPackageAddBertRanking(unittest.TestCase):
    """Tests the text output of a "test_app" package after add_model_ranking.

    setUp builds the package through the ``ApplicationPackage`` helper
    methods and adds a BERT model ranking; each test compares one generated
    text artifact (schema, hosts, services, query profile, query profile
    type) against a literal expected string.
    """

    def setUp(self) -> None:
        """Build the package fixture; requires the RESOURCES_DIR env variable."""
        self.app_package = ApplicationPackage(name="test_app")

        self.app_package.schema.add_fields(
            Field(name="id", type="string", indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="body",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
        )
        self.app_package.schema.add_field_set(
            FieldSet(name="default", fields=["title", "body"]))
        self.app_package.schema.add_rank_profile(
            RankProfile(name="default", first_phase="nativeRank(title, body)"))
        self.app_package.schema.add_rank_profile(
            RankProfile(
                name="bm25",
                first_phase="bm25(title) + bm25(body)",
                inherits="default",
            ))
        self.app_package.query_profile_type.add_fields(
            QueryTypeField(
                name="ranking.features.query(query_bert)",
                type="tensor<float>(x[768])",
            ))
        self.app_package.query_profile.add_fields(
            QueryField(name="maxHits", value=100),
            QueryField(name="anotherField", value="string_value"),
        )

        # Tokenizer and model are loaded from paths under RESOURCES_DIR.
        bert_config = BertModelConfig(
            model_id="bert_tiny",
            query_input_size=4,
            doc_input_size=8,
            tokenizer=os.path.join(os.environ["RESOURCES_DIR"],
                                   "bert_tiny_tokenizer"),
            model=os.path.join(os.environ["RESOURCES_DIR"], "bert_tiny_model"),
        )

        self.app_package.add_model_ranking(
            model_config=bert_config,
            include_model_summary_features=True,
            inherits="default",
            first_phase="bm25(title)",
            second_phase=SecondPhaseRanking(rerank_count=10,
                                            expression="logit1"),
        )

    def test_application_package(self):
        """The package must equal itself after a to_dict/from_dict round trip."""
        self.assertEqual(
            self.app_package,
            ApplicationPackage.from_dict(self.app_package.to_dict))

    def test_schema_to_text(self):
        """schema_to_text must match the expected schema definition exactly."""
        # NOTE(review): the expected tensor size d0[7] and the constant 12
        # differ from doc_input_size=8 / query_input_size=4 given in setUp —
        # presumably 12 is their sum and slots are reserved for special
        # tokens; confirm against BertModelConfig.
        expected_result = (
            "schema test_app {\n"
            "    document test_app {\n"
            "        field id type string {\n"
            "            indexing: attribute | summary\n"
            "        }\n"
            "        field title type string {\n"
            "            indexing: index | summary\n"
            "            index: enable-bm25\n"
            "        }\n"
            "        field body type string {\n"
            "            indexing: index | summary\n"
            "            index: enable-bm25\n"
            "        }\n"
            "        field bert_tiny_doc_token_ids type tensor<float>(d0[7]) {\n"
            "            indexing: attribute | summary\n"
            "        }\n"
            "    }\n"
            "    fieldset default {\n"
            "        fields: title, body\n"
            "    }\n"
            "    onnx-model bert_tiny {\n"
            "        file: files/bert_tiny.onnx\n"
            "        input input_ids: input_ids\n"
            "        input token_type_ids: token_type_ids\n"
            "        input attention_mask: attention_mask\n"
            "        output output_0: logits\n"
            "    }\n"
            "    rank-profile default {\n"
            "        first-phase {\n"
            "            expression: nativeRank(title, body)\n"
            "        }\n"
            "    }\n"
            "    rank-profile bm25 inherits default {\n"
            "        first-phase {\n"
            "            expression: bm25(title) + bm25(body)\n"
            "        }\n"
            "    }\n"
            "    rank-profile bert_tiny inherits default {\n"
            "        constants {\n"
            "            TOKEN_NONE: 0\n"
            "            TOKEN_CLS: 101\n"
            "            TOKEN_SEP: 102\n"
            "        }\n"
            "        function question_length() {\n"
            "            expression {\n"
            "                sum(map(query(bert_tiny_query_token_ids), f(a)(a > 0)))\n"
            "            }\n"
            "        }\n"
            "        function doc_length() {\n"
            "            expression {\n"
            "                sum(map(attribute(bert_tiny_doc_token_ids), f(a)(a > 0)))\n"
            "            }\n"
            "        }\n"
            "        function input_ids() {\n"
            "            expression {\n"
            "                tokenInputIds(12, query(bert_tiny_query_token_ids), attribute(bert_tiny_doc_token_ids))\n"
            "            }\n"
            "        }\n"
            "        function attention_mask() {\n"
            "            expression {\n"
            "                tokenAttentionMask(12, query(bert_tiny_query_token_ids), attribute(bert_tiny_doc_token_ids))\n"
            "            }\n"
            "        }\n"
            "        function token_type_ids() {\n"
            "            expression {\n"
            "                tokenTypeIds(12, query(bert_tiny_query_token_ids), attribute(bert_tiny_doc_token_ids))\n"
            "            }\n"
            "        }\n"
            "        function logit0() {\n"
            "            expression {\n"
            "                onnx(bert_tiny).logits{d0:0,d1:0}\n"
            "            }\n"
            "        }\n"
            "        function logit1() {\n"
            "            expression {\n"
            "                onnx(bert_tiny).logits{d0:0,d1:1}\n"
            "            }\n"
            "        }\n"
            "        first-phase {\n"
            "            expression: bm25(title)\n"
            "        }\n"
            "        second-phase {\n"
            "            rerank-count: 10\n"
            "            expression: logit1\n"
            "        }\n"
            "        summary-features {\n"
            "            logit0\n"
            "            logit1\n"
            "            input_ids\n"
            "            attention_mask\n"
            "            token_type_ids\n"
            "        }\n"
            "    }\n"
            "}")
        self.assertEqual(self.app_package.schema_to_text, expected_result)

    def test_hosts_to_text(self):
        """hosts_to_text must match the expected single-host hosts XML."""
        expected_result = (
            '<?xml version="1.0" encoding="utf-8" ?>\n'
            "<!-- Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->\n"
            "<hosts>\n"
            '    <host name="localhost">\n'
            "        <alias>node1</alias>\n"
            "    </host>\n"
            "</hosts>")
        self.assertEqual(self.app_package.hosts_to_text, expected_result)

    def test_services_to_text(self):
        """services_to_text must match the expected services XML for test_app."""
        expected_result = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<services version="1.0">\n'
            '    <container id="test_app_container" version="1.0">\n'
            "        <search></search>\n"
            "        <document-api></document-api>\n"
            "    </container>\n"
            '    <content id="test_app_content" version="1.0">\n'
            '        <redundancy reply-after="1">1</redundancy>\n'
            "        <documents>\n"
            '            <document type="test_app" mode="index"></document>\n'
            "        </documents>\n"
            "        <nodes>\n"
            '            <node distribution-key="0" hostalias="node1"></node>\n'
            "        </nodes>\n"
            "    </content>\n"
            "</services>")

        self.assertEqual(self.app_package.services_to_text, expected_result)

    def test_query_profile_to_text(self):
        """query_profile_to_text must list the two fields added in setUp."""
        expected_result = (
            '<query-profile id="default" type="root">\n'
            '    <field name="maxHits">100</field>\n'
            '    <field name="anotherField">string_value</field>\n'
            "</query-profile>")

        self.assertEqual(self.app_package.query_profile_to_text,
                         expected_result)

    def test_query_profile_type_to_text(self):
        """query_profile_type_to_text must list query_bert and the BERT query tensor."""
        # The second field is not added explicitly in setUp; it is expected to
        # appear as a result of add_model_ranking.
        # NOTE(review): its size d0[2] differs from query_input_size=4 —
        # presumably two slots are reserved for special tokens; confirm.
        expected_result = (
            '<query-profile-type id="root">\n'
            '    <field name="ranking.features.query(query_bert)" type="tensor&lt;float&gt;(x[768])" />\n'
            '    <field name="ranking.features.query(bert_tiny_query_token_ids)" type="tensor&lt;float&gt;(d0[2])" />\n'
            "</query-profile-type>")
        self.assertEqual(self.app_package.query_profile_type_to_text,
                         expected_result)
예제 #13
0
 def setUp(self) -> None:
     """Create a "test_app" package with a hand-written BERT rank profile.

     The "msmarco" schema is assembled from explicit pieces: four document
     fields, a ``default`` fieldset, three rank profiles (``default``,
     ``bm25`` and ``bert``) and one ONNX model. A query profile type and a
     query profile are built separately and passed to the package, which is
     stored in ``self.app_package``.
     """
     # Document fields.
     document_fields = [
         Field(name="id", type="string", indexing=["attribute", "summary"]),
         Field(
             name="title",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
         Field(
             name="body",
             type="string",
             indexing=["index", "summary"],
             index="enable-bm25",
         ),
         Field(
             name="embedding",
             type="tensor<float>(x[128])",
             indexing=["attribute", "summary"],
             attribute=["fast-search", "fast-access"],
         ),
     ]
     msmarco_document = Document(fields=document_fields)
     default_fieldset = FieldSet(name="default", fields=["title", "body"])

     # Plain text-ranking profiles.
     default_profile = RankProfile(
         name="default", first_phase="nativeRank(title, body)"
     )
     bm25_profile = RankProfile(
         name="bm25",
         first_phase="bm25(title) + bm25(body)",
         inherits="default",
     )

     # Multi-line ranking expressions used by the "bert" profile functions.
     input_ids_expression = (
         "tensor<float>(d0[1],d1[128])(\n"
         "    if (d1 == 0,\n"
         "        TOKEN_CLS,\n"
         "    if (d1 < question_length + 1,\n"
         "        query(query_token_ids){d0:(d1-1)},\n"
         "    if (d1 == question_length + 1,\n"
         "        TOKEN_SEP,\n"
         "    if (d1 < question_length + doc_length + 2,\n"
         "        attribute(doc_token_ids){d0:(d1-question_length-2)},\n"
         "    if (d1 == question_length + doc_length + 2,\n"
         "        TOKEN_SEP,\n"
         "        TOKEN_NONE\n"
         "    ))))))"
     )
     token_type_ids_expression = (
         "tensor<float>(d0[1],d1[128])(\n"
         "    if (d1 < question_length,\n"
         "        0,\n"
         "    if (d1 < question_length + doc_length,\n"
         "        1,\n"
         "        TOKEN_NONE\n"
         "    )))"
     )
     bert_reranking = SecondPhaseRanking(
         rerank_count=10,
         expression="sum(onnx(bert).logits{d0:0,d1:0})",
     )
     bert_constants = {"TOKEN_NONE": 0, "TOKEN_CLS": 101, "TOKEN_SEP": 102}
     bert_functions = [
         Function(
             name="question_length",
             expression="sum(map(query(query_token_ids), f(a)(a > 0)))",
         ),
         Function(
             name="doc_length",
             expression="sum(map(attribute(doc_token_ids), f(a)(a > 0)))",
         ),
         Function(name="input_ids", expression=input_ids_expression),
         Function(
             name="attention_mask",
             expression="map(input_ids, f(a)(a > 0))",
         ),
         Function(name="token_type_ids", expression=token_type_ids_expression),
     ]
     bert_summary_features = [
         "onnx(bert).logits",
         "input_ids",
         "attention_mask",
         "token_type_ids",
     ]
     bert_profile = RankProfile(
         name="bert",
         first_phase="bm25(title) + bm25(body)",
         second_phase=bert_reranking,
         inherits="default",
         constants=bert_constants,
         functions=bert_functions,
         summary_features=bert_summary_features,
     )

     # ONNX model wiring: input and output names map onto themselves.
     bert_model = OnnxModel(
         model_name="bert",
         model_file_path="bert.onnx",
         inputs={
             "input_ids": "input_ids",
             "token_type_ids": "token_type_ids",
             "attention_mask": "attention_mask",
         },
         outputs={"logits": "logits"},
     )

     test_schema = Schema(
         name="msmarco",
         document=msmarco_document,
         fieldsets=[default_fieldset],
         rank_profiles=[default_profile, bm25_profile, bert_profile],
         models=[bert_model],
     )

     # Query profile type and query profile.
     query_bert_type = QueryTypeField(
         name="ranking.features.query(query_bert)",
         type="tensor<float>(x[768])",
     )
     test_query_profile_type = QueryProfileType(fields=[query_bert_type])
     test_query_profile = QueryProfile(
         fields=[
             QueryField(name="maxHits", value=100),
             QueryField(name="anotherField", value="string_value"),
         ]
     )

     self.app_package = ApplicationPackage(
         name="test_app",
         schema=test_schema,
         query_profile=test_query_profile,
         query_profile_type=test_query_profile_type,
     )
예제 #14
0
class TestOnnxModelDockerDeployment(unittest.TestCase):
    def setUp(self) -> None:
        #
        # Create application package
        #
        self.app_package = ApplicationPackage(name="cord19")
        self.app_package.schema.add_fields(
            Field(name="cord_uid",
                  type="string",
                  indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
        )
        self.app_package.schema.add_field_set(
            FieldSet(name="default", fields=["title"]))
        self.app_package.schema.add_rank_profile(
            RankProfile(name="bm25", first_phase="bm25(title)"))
        self.bert_config = BertModelConfig(
            model_id="pretrained_bert_tiny",
            tokenizer="google/bert_uncased_L-2_H-128_A-2",
            model="google/bert_uncased_L-2_H-128_A-2",
            query_input_size=5,
            doc_input_size=10,
        )
        self.app_package.add_model_ranking(
            model_config=self.bert_config,
            include_model_summary_features=True,
            inherits="default",
            first_phase="bm25(title)",
            second_phase=SecondPhaseRanking(rerank_count=10,
                                            expression="logit1"),
        )
        self.disk_folder = os.path.join(os.getenv("WORK_DIR"),
                                        "sample_application")
        self.vespa_docker = VespaDocker(port=8089)
        self.app = self.vespa_docker.deploy(
            application_package=self.app_package, disk_folder=self.disk_folder)

    def test_deploy(self):
        self.assertTrue(
            any(
                re.match("Generation: [0-9]+", line)
                for line in self.app.deployment_message))
        self.assertEqual(self.app.get_application_status().status_code, 200)

    def test_data_operation(self):
        #
        # Get data that does not exist
        #
        self.assertEqual(
            self.app.get_data(schema="cord19", data_id="1").status_code, 404)
        #
        # Feed a data point
        #
        fields = {
            "cord_uid": "1",
            "title": "this is my first title",
        }
        fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
        response = self.app.feed_data_point(
            schema="cord19",
            data_id="1",
            fields=fields,
        )
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Get data that exist
        #
        response = self.app.get_data(schema="cord19", data_id="1")
        self.assertEqual(response.status_code, 200)
        embedding_values = fields["pretrained_bert_tiny_doc_token_ids"][
            "values"]
        self.assertDictEqual(
            response.json(),
            {
                "fields": {
                    "cord_uid": "1",
                    "title": "this is my first title",
                    "pretrained_bert_tiny_doc_token_ids": {
                        "cells": [{
                            "address": {
                                "d0": str(x)
                            },
                            "value": float(embedding_values[x]),
                        } for x in range(len(embedding_values))]
                    },
                },
                "id": "id:cord19:cord19::1",
                "pathId": "/document/v1/cord19/cord19/docid/1",
            },
        )
        #
        # Update data
        #
        fields = {"title": "this is my updated title"}
        fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
        response = self.app.update_data(schema="cord19",
                                        data_id="1",
                                        fields=fields)
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Get the updated data point
        #
        response = self.app.get_data(schema="cord19", data_id="1")
        self.assertEqual(response.status_code, 200)
        embedding_values = fields["pretrained_bert_tiny_doc_token_ids"][
            "values"]
        self.assertDictEqual(
            response.json(),
            {
                "fields": {
                    "cord_uid": "1",
                    "title": "this is my updated title",
                    "pretrained_bert_tiny_doc_token_ids": {
                        "cells": [{
                            "address": {
                                "d0": str(x)
                            },
                            "value": float(embedding_values[x]),
                        } for x in range(len(embedding_values))]
                    },
                },
                "id": "id:cord19:cord19::1",
                "pathId": "/document/v1/cord19/cord19/docid/1",
            },
        )
        #
        # Delete a data point
        #
        response = self.app.delete_data(schema="cord19", data_id="1")
        self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
        #
        # Deleted data should be gone
        #
        self.assertEqual(
            self.app.get_data(schema="cord19", data_id="1").status_code, 404)

    def _parse_vespa_tensor(self, hit, feature):
        return [
            x["value"]
            for x in hit["fields"]["summaryfeatures"][feature]["cells"]
        ]

    def test_rank_input_output(self):
        """Compare the ranking inputs and logits from Vespa with the local model.

        Feeds one document, queries it with the ``pretrained_bert_tiny`` rank
        profile, and checks that the ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` summary features equal the first encoding from
        ``self.bert_config.create_encodings``. The ``logit0`` and ``logit1``
        summary features must match ``self.bert_config.predict`` to 5 places.
        """
        query_text = "this is a test"
        doc_title = "this is my first title"
        #
        # Feed a data point
        #
        document = {"cord_uid": "1", "title": doc_title}
        document.update(self.bert_config.doc_fields(text=doc_title))
        fed = self.app.feed_data_point(
            schema="cord19", data_id="1", fields=document)
        self.assertEqual(fed.json()["id"], "id:cord19:cord19::1")
        #
        # Run a test query
        #
        token_ids_feature = QueryRankingFeature(
            name=self.bert_config.query_token_ids_name,
            mapping=self.bert_config.query_tensor_mapping,
        )
        query_model = QueryModel(
            query_properties=[token_ids_feature],
            match_phase=OR(),
            rank_profile=Ranking(name="pretrained_bert_tiny"),
        )
        result = self.app.query(query=query_text, query_model=query_model)
        #
        # Parse every model input tensor before computing the expectations
        #
        tensor_features = (
            ("rankingExpression(input_ids)", "input_ids"),
            ("rankingExpression(attention_mask)", "attention_mask"),
            ("rankingExpression(token_type_ids)", "token_type_ids"),
        )
        vespa_tensors = [
            self._parse_vespa_tensor(result.hits[0], feature)
            for feature, _ in tensor_features
        ]
        expected_inputs = self.bert_config.create_encodings(
            queries=[query_text], docs=[doc_title])
        for (_, encoding_key), vespa_tensor in zip(tensor_features,
                                                   vespa_tensors):
            self.assertEqual(vespa_tensor, expected_inputs[encoding_key][0])
        #
        # Compare the logits with the local prediction
        #
        expected_logits = self.bert_config.predict(
            queries=[query_text], docs=[doc_title])
        logit_features = ("rankingExpression(logit0)",
                          "rankingExpression(logit1)")
        for position, feature in enumerate(logit_features):
            summary_features = result.hits[0]["fields"]["summaryfeatures"]
            self.assertAlmostEqual(
                summary_features[feature],
                expected_logits[0][position],
                5,
            )

    def tearDown(self) -> None:
        shutil.rmtree(self.disk_folder, ignore_errors=True)
        self.vespa_docker.container.stop()
        self.vespa_docker.container.remove()