示例#1
0
 def test_or(self):
     """Check the match filter and query properties produced by ``OR``.

     ``self.query`` comes from the enclosing test case (presumably assigned
     in ``setUp``, which is outside this view).
     """
     or_filter = OR()

     yql_fragment = or_filter.create_match_filter(query=self.query)
     expected_fragment = '([{"grammar": "any"}]userInput("this is  a test"))'
     self.assertEqual(yql_fragment, expected_fragment)

     # The OR operator is expected to contribute no extra query properties.
     extra_properties = or_filter.get_query_properties(query=self.query)
     self.assertDictEqual(extra_properties, {})
示例#2
0
    def bert_model_input_and_output(self, app, schema_name, fields_to_send,
                                    model_config):
        """Feed one document, query it, and compare Vespa's BERT output.

        The tensors and logits exposed by the first hit are checked against
        the values computed locally through ``model_config``.

        :param app: Object exposing ``feed_data_point`` and ``query``.
        :param schema_name: Schema the data point is fed to; also used to
            build the expected document id.
        :param fields_to_send: Mapping with at least ``"id"`` and ``"title"``.
        :param model_config: Object exposing ``query_token_ids_name``,
            ``query_tensor_mapping``, ``create_encodings`` and ``predict``.
        """
        query_text = "this is a test"
        document_id = fields_to_send["id"]

        #
        # Feed a data point
        #
        feed_response = app.feed_data_point(schema=schema_name,
                                            data_id=document_id,
                                            fields=fields_to_send)
        expected_id = "id:{}:{}::{}".format(schema_name, schema_name,
                                            document_id)
        self.assertEqual(feed_response.json["id"], expected_id)

        #
        # Run a test query
        #
        token_ids_feature = QueryRankingFeature(
            name=model_config.query_token_ids_name,
            mapping=model_config.query_tensor_mapping,
        )
        query_model = QueryModel(
            query_properties=[token_ids_feature],
            match_phase=OR(),
            rank_profile=Ranking(name="pretrained_bert_tiny"),
        )
        result = app.query(query=query_text, query_model=query_model)
        top_hit = result.hits[0]

        # Parse every tensor before computing the local expectations.
        vespa_tensors = {
            "input_ids":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(input_ids)"),
            "attention_mask":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(attention_mask)"),
            "token_type_ids":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(token_type_ids)"),
        }

        title = fields_to_send["title"]
        expected_inputs = model_config.create_encodings(queries=[query_text],
                                                        docs=[title])
        for tensor_name in ("input_ids", "attention_mask", "token_type_ids"):
            self.assertEqual(vespa_tensors[tensor_name],
                             expected_inputs[tensor_name][0])

        expected_logits = model_config.predict(queries=[query_text],
                                               docs=[title])
        summary_features = top_hit["fields"]["summaryfeatures"]
        logit_features = ("rankingExpression(logit0)",
                          "rankingExpression(logit1)")
        for logit_index, feature_name in enumerate(logit_features):
            # Compare to 5 decimal places.
            self.assertAlmostEqual(
                summary_features[feature_name],
                expected_logits[0][logit_index],
                places=5,
            )
示例#3
0
    def test_rank_input_output(self):
        """Feed a cord19 document and verify BERT inputs and logits in Vespa.

        Uses ``self.app`` and ``self.bert_config`` from the enclosing test
        case (their setup is outside this view). The tensors and logits
        reported for the first hit must match those computed locally by
        ``self.bert_config``.
        """
        query_text = "this is a test"
        title = "this is my first title"

        #
        # Feed a data point
        #
        fields = {"cord_uid": "1", "title": title}
        fields.update(self.bert_config.doc_fields(text=str(title)))
        feed_response = self.app.feed_data_point(schema="cord19",
                                                 data_id="1",
                                                 fields=fields)
        self.assertEqual(feed_response.json()["id"], "id:cord19:cord19::1")

        #
        # Run a test query
        #
        token_ids_feature = QueryRankingFeature(
            name=self.bert_config.query_token_ids_name,
            mapping=self.bert_config.query_tensor_mapping,
        )
        query_model = QueryModel(
            query_properties=[token_ids_feature],
            match_phase=OR(),
            rank_profile=Ranking(name="pretrained_bert_tiny"),
        )
        result = self.app.query(query=query_text, query_model=query_model)
        top_hit = result.hits[0]

        # Parse every tensor before computing the local expectations.
        vespa_tensors = {
            "input_ids":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(input_ids)"),
            "attention_mask":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(attention_mask)"),
            "token_type_ids":
            self._parse_vespa_tensor(top_hit,
                                     "rankingExpression(token_type_ids)"),
        }

        expected_inputs = self.bert_config.create_encodings(
            queries=[query_text], docs=[title])
        for tensor_name in ("input_ids", "attention_mask", "token_type_ids"):
            self.assertEqual(vespa_tensors[tensor_name],
                             expected_inputs[tensor_name][0])

        expected_logits = self.bert_config.predict(queries=[query_text],
                                                   docs=[title])
        summary_features = top_hit["fields"]["summaryfeatures"]
        logit_features = ("rankingExpression(logit0)",
                          "rankingExpression(logit1)")
        for logit_index, feature_name in enumerate(logit_features):
            # Compare to 5 decimal places.
            self.assertAlmostEqual(
                summary_features[feature_name],
                expected_logits[0][logit_index],
                places=5,
            )
示例#4
0
    def test_collect_training_data(self):
        """Collect training data in batch and point by point; check shapes.

        Both collection paths are run against the public
        ``https://api.cord19.vespa.ai`` endpoint (so the test needs network
        access) and must yield 12 rows with more than 3 columns.
        """
        app = Vespa(url="https://api.cord19.vespa.ai")
        query_model = QueryModel(match_phase=OR(),
                                 rank_profile=Ranking(name="bm25",
                                                      list_features=True))
        labeled_data = [
            {
                "query_id": 0,
                "query":
                "Intrauterine virus infections and congenital heart disease",
                "relevant_docs": [{
                    "id": 0,
                    "score": 1
                }, {
                    "id": 3,
                    "score": 1
                }],
            },
            {
                "query_id": 1,
                "query":
                "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
                "relevant_docs": [{
                    "id": 1,
                    "score": 1
                }, {
                    "id": 5,
                    "score": 1
                }],
            },
        ]
        training_data_batch = app.collect_training_data(
            labeled_data=labeled_data,
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
        # 4 labeled (query, relevant doc) pairs; 12 rows presumably means each
        # pair yields the relevant doc plus number_additional_docs=2 others.
        self.assertEqual(training_data_batch.shape[0], 12)
        # It should have at least one rank feature in addition to
        # document_id, query_id and label. assertGreater (rather than
        # assertTrue on a comparison) reports the actual count on failure.
        self.assertGreater(training_data_batch.shape[1], 3)

        # Repeat the collection one (query, relevant doc) pair at a time.
        training_data = []
        for query_data in labeled_data:
            for doc_data in query_data["relevant_docs"]:
                training_data_point = app.collect_training_data_point(
                    query=query_data["query"],
                    query_id=query_data["query_id"],
                    relevant_id=doc_data["id"],
                    id_field="id",
                    query_model=query_model,
                    number_additional_docs=2,
                    fields=["rankfeatures"],
                )
                training_data.extend(training_data_point)
        training_data = DataFrame.from_records(training_data)

        self.assertEqual(training_data.shape[0], 12)
        # It should have at least one rank feature in addition to
        # document_id, query_id and label.
        self.assertGreater(training_data.shape[1], 3)
示例#5
0
    def test_query(self):
        """Check the request bodies built by ``Vespa.query`` in debug mode.

        Covers three cases: an explicit request body, a query model with
        ``hits``, and the same query with an additional ``recall`` argument.
        """
        app = Vespa(url="http://localhost", port=8080)
        query_text = "this is a test"

        def default_model():
            # Every request gets its own freshly built query model.
            return Query(match_phase=OR(), rank_profile=RankProfile())

        # An explicit body is expected back unchanged.
        raw_body = {"yql": "select * from sources * where test"}
        raw_response = app.query(body=raw_body, debug_request=True)
        self.assertDictEqual(raw_response.request_body, raw_body)

        plain_response = app.query(
            query=query_text,
            query_model=default_model(),
            debug_request=True,
            hits=10,
        )
        expected_plain = {
            "yql":
            'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {
                "profile": "default",
                "listFeatures": "false"
            },
            "hits": 10,
        }
        self.assertDictEqual(plain_response.request_body, expected_plain)

        recall_response = app.query(
            query=query_text,
            query_model=default_model(),
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        )
        # Same body as above plus the rendered recall clause.
        expected_recall = dict(expected_plain, recall="+(id:1 id:5)")
        self.assertDictEqual(recall_response.request_body, expected_recall)
示例#6
0
    def test_query_properties_match_and_rank(self):
        """Check ``QueryModel.create_body`` with properties, match and rank.

        The same ranking feature and ``bm25`` rank profile are combined first
        with an ``OR`` match phase and then with an ``ANN`` match phase; only
        the ``yql`` entry of the resulting body is expected to differ.
        ``self.query`` comes from the enclosing test case (outside this view).
        """

        def assert_body(match_phase, expected_yql):
            # Build a fresh model around the given match phase and compare
            # the body it creates with the expected one.
            query_model = QueryModel(
                query_properties=[
                    QueryRankingFeature(name="query_vector",
                                        mapping=lambda x: [1, 2, 3])
                ],
                match_phase=match_phase,
                rank_profile=RankProfile(name="bm25", list_features=True),
            )
            expected_body = {
                "yql": expected_yql,
                "ranking": {
                    "profile": "bm25",
                    "listFeatures": "true"
                },
                "ranking.features.query(query_vector)": "[1, 2, 3]",
            }
            self.assertDictEqual(query_model.create_body(query=self.query),
                                 expected_body)

        assert_body(
            OR(),
            'select * from sources * where ([{"grammar": "any"}]userInput("this is  a test"));',
        )

        ann_match = ANN(
            doc_vector="doc_vector",
            query_vector="query_vector",
            hits=10,
            label="label",
        )
        assert_body(
            ann_match,
            'select * from sources * where ([{"targetNumHits": 10, "label": "label", "approximate": true}]nearestNeighbor(doc_vector, query_vector));',
        )