def test_body_function(self):
    def body_function(query):
        body = {
            "yql": "select * from sources * where userQuery();",
            "query": query,
            "type": "any",
            "ranking": {"profile": "bm25", "listFeatures": "true"},
        }
        return body

    query_model = QueryModel(body_function=body_function)
    self.assertDictEqual(
        query_model.create_body(query=self.query),
        {
            "yql": "select * from sources * where userQuery();",
            "query": "this is a test",
            "type": "any",
            "ranking": {"profile": "bm25", "listFeatures": "true"},
        },
    )
def test_default(self):
    query = QueryModel()
    self.assertDictEqual(
        query.create_body(query=self.query),
        {
            "yql": 'select * from sources * where (userInput("this is a test"));',
            "ranking": {"profile": "default", "listFeatures": "false"},
        },
    )
def bert_model_input_and_output(self, app, schema_name, fields_to_send, model_config):
    #
    # Feed a data point
    #
    response = app.feed_data_point(
        schema=schema_name,
        data_id=fields_to_send["id"],
        fields=fields_to_send,
    )
    self.assertEqual(
        response.json["id"],
        "id:{}:{}::{}".format(schema_name, schema_name, fields_to_send["id"]),
    )
    #
    # Run a test query
    #
    result = app.query(
        query="this is a test",
        query_model=QueryModel(
            query_properties=[
                QueryRankingFeature(
                    name=model_config.query_token_ids_name,
                    mapping=model_config.query_tensor_mapping,
                )
            ],
            match_phase=OR(),
            rank_profile=Ranking(name="pretrained_bert_tiny"),
        ),
    )
    vespa_input_ids = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(input_ids)"
    )
    vespa_attention_mask = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(attention_mask)"
    )
    vespa_token_type_ids = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(token_type_ids)"
    )

    expected_inputs = model_config.create_encodings(
        queries=["this is a test"], docs=[fields_to_send["title"]]
    )
    self.assertEqual(vespa_input_ids, expected_inputs["input_ids"][0])
    self.assertEqual(vespa_attention_mask, expected_inputs["attention_mask"][0])
    self.assertEqual(vespa_token_type_ids, expected_inputs["token_type_ids"][0])

    expected_logits = model_config.predict(
        queries=["this is a test"], docs=[fields_to_send["title"]]
    )
    self.assertAlmostEqual(
        result.hits[0]["fields"]["summaryfeatures"]["rankingExpression(logit0)"],
        expected_logits[0][0],
        5,
    )
    self.assertAlmostEqual(
        result.hits[0]["fields"]["summaryfeatures"]["rankingExpression(logit1)"],
        expected_logits[0][1],
        5,
    )
def feed_batch_synchronous_mode(self, app, schema_name, fields_to_send):
    """
    Feed a batch of data synchronously to the application.

    :param app: Vespa instance holding the connection to the application.
    :param schema_name: Name of the schema to which the documents are fed.
    :param fields_to_send: List of dicts where keys are field names and values are
        field values. Each dict must contain an 'id' field.
    :return:
    """
    #
    # Create and feed documents
    #
    num_docs = len(fields_to_send)
    docs = [{"id": fields["id"], "fields": fields} for fields in fields_to_send]
    app.feed_batch(schema=schema_name, batch=docs, asynchronous=False)

    # Verify that all documents are fed
    result = app.query(
        query="sddocname:{}".format(schema_name), query_model=QueryModel()
    )
    self.assertEqual(result.number_documents_indexed, num_docs)
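# A minimal, hedged usage sketch (not part of the test suite) of how the batch-feed
# helper above might be invoked; the localhost URL, "msmarco" schema name, and document
# fields are assumptions reused from other tests in this section.
#
#     app = Vespa(url="http://localhost", port=8080)
#     fields_to_send = [
#         {"id": "1", "title": "this is title 1", "body": "this is body 1"},
#         {"id": "2", "title": "this is title 2", "body": "this is body 2"},
#     ]
#     self.feed_batch_synchronous_mode(
#         app=app, schema_name="msmarco", fields_to_send=fields_to_send
#     )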
def test_collect_training_data_point(self):
    self.app.query = Mock(
        side_effect=[
            VespaResult(self.raw_vespa_result_recall),
            VespaResult(self.raw_vespa_result_additional),
        ]
    )
    query_model = QueryModel(rank_profile=RankProfile(list_features=True))
    data = self.app.collect_training_data_point(
        query="this is a query",
        query_id="123",
        relevant_id="abc",
        id_field="vespa_id_field",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures", "title"],
        timeout="15s",
    )
    self.assertEqual(self.app.query.call_count, 2)
    self.app.query.assert_has_calls(
        [
            call(
                query="this is a query",
                query_model=query_model,
                recall=("vespa_id_field", ["abc"]),
                timeout="15s",
            ),
            call(
                query="this is a query",
                query_model=query_model,
                hits=2,
                timeout="15s",
            ),
        ]
    )
    expected_data = [
        {
            "document_id": "abc",
            "query_id": "123",
            "label": 1,
            "a": 1,
            "b": 2,
            "title": "this is a title",
        },
        {
            "document_id": "def",
            "query_id": "123",
            "label": 0,
            "a": 3,
            "b": 4,
            "title": "this is a title 2",
        },
        {
            "document_id": "ghi",
            "query_id": "123",
            "label": 0,
            "a": 5,
            "b": 6,
            "title": "this is a title 3",
        },
    ]
    self.assertEqual(data, expected_data)
def test_evaluate_query(self):
    self.app.query = Mock(return_value={})
    eval_metric = Mock()
    eval_metric.evaluate_query = Mock(return_value={"metric": 1})
    eval_metric2 = Mock()
    eval_metric2.evaluate_query = Mock(return_value={"metric_2": 2})
    query_model = QueryModel()
    evaluation = self.app.evaluate_query(
        eval_metrics=[eval_metric, eval_metric2],
        query_model=query_model,
        query_id="0",
        query="this is a test",
        id_field="vespa_id_field",
        relevant_docs=self.labeled_data[0]["relevant_docs"],
        default_score=0,
        hits=10,
    )
    self.assertEqual(self.app.query.call_count, 1)
    self.app.query.assert_has_calls(
        [call(query="this is a test", query_model=query_model, hits=10)]
    )
    self.assertEqual(eval_metric.evaluate_query.call_count, 1)
    eval_metric.evaluate_query.assert_has_calls(
        [call({}, self.labeled_data[0]["relevant_docs"], "vespa_id_field", 0)]
    )
    self.assertDictEqual(
        evaluation, {"query_id": "0", "metric": 1, "metric_2": 2}
    )
def test_rank_input_output(self):
    #
    # Feed a data point
    #
    fields = {
        "cord_uid": "1",
        "title": "this is my first title",
    }
    fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
    response = self.app.feed_data_point(
        schema="cord19",
        data_id="1",
        fields=fields,
    )
    self.assertEqual(response.json()["id"], "id:cord19:cord19::1")
    #
    # Run a test query
    #
    result = self.app.query(
        query="this is a test",
        query_model=QueryModel(
            query_properties=[
                QueryRankingFeature(
                    name=self.bert_config.query_token_ids_name,
                    mapping=self.bert_config.query_tensor_mapping,
                )
            ],
            match_phase=OR(),
            rank_profile=Ranking(name="pretrained_bert_tiny"),
        ),
    )
    vespa_input_ids = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(input_ids)"
    )
    vespa_attention_mask = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(attention_mask)"
    )
    vespa_token_type_ids = self._parse_vespa_tensor(
        result.hits[0], "rankingExpression(token_type_ids)"
    )

    expected_inputs = self.bert_config.create_encodings(
        queries=["this is a test"], docs=["this is my first title"]
    )
    self.assertEqual(vespa_input_ids, expected_inputs["input_ids"][0])
    self.assertEqual(vespa_attention_mask, expected_inputs["attention_mask"][0])
    self.assertEqual(vespa_token_type_ids, expected_inputs["token_type_ids"][0])

    expected_logits = self.bert_config.predict(
        queries=["this is a test"], docs=["this is my first title"]
    )
    self.assertAlmostEqual(
        result.hits[0]["fields"]["summaryfeatures"]["rankingExpression(logit0)"],
        expected_logits[0][0],
        5,
    )
    self.assertAlmostEqual(
        result.hits[0]["fields"]["summaryfeatures"]["rankingExpression(logit1)"],
        expected_logits[0][1],
        5,
    )
def test_collect_training_data(self):
    app = Vespa(url="https://api.cord19.vespa.ai")
    query_model = QueryModel(
        match_phase=OR(), rank_profile=Ranking(name="bm25", list_features=True)
    )
    labeled_data = [
        {
            "query_id": 0,
            "query": "Intrauterine virus infections and congenital heart disease",
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]
    training_data_batch = app.collect_training_data(
        labeled_data=labeled_data,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    self.assertEqual(training_data_batch.shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label
    self.assertTrue(training_data_batch.shape[1] > 3)

    training_data = []
    for query_data in labeled_data:
        for doc_data in query_data["relevant_docs"]:
            training_data_point = app.collect_training_data_point(
                query=query_data["query"],
                query_id=query_data["query_id"],
                relevant_id=doc_data["id"],
                id_field="id",
                query_model=query_model,
                number_additional_docs=2,
                fields=["rankfeatures"],
            )
            training_data.extend(training_data_point)
    training_data = DataFrame.from_records(training_data)
    self.assertEqual(training_data.shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label
    self.assertTrue(training_data.shape[1] > 3)
def test_query(self):
    app = Vespa(url="http://localhost", port=8080)

    body = {"yql": "select * from sources * where test"}
    self.assertDictEqual(
        app.query(body=body, debug_request=True).request_body, body
    )

    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=QueryModel(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
        ).request_body,
        {
            "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {"profile": "default", "listFeatures": "false"},
            "hits": 10,
        },
    )

    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=QueryModel(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        ).request_body,
        {
            "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {"profile": "default", "listFeatures": "false"},
            "hits": 10,
            "recall": "+(id:1 id:5)",
        },
    )
def test_collect_training_data(self):
    mock_return_value = [
        {
            "document_id": "abc",
            "query_id": "123",
            "relevant": 1,
            "a": 1,
            "b": 2,
        },
        {
            "document_id": "def",
            "query_id": "123",
            "relevant": 0,
            "a": 3,
            "b": 4,
        },
        {
            "document_id": "ghi",
            "query_id": "123",
            "relevant": 0,
            "a": 5,
            "b": 6,
        },
    ]
    self.app.collect_training_data_point = Mock(return_value=mock_return_value)
    labeled_data = [
        {
            "query_id": 123,
            "query": "this is a query",
            "relevant_docs": [{"id": "abc", "score": 1}],
        }
    ]
    query_model = QueryModel(rank_profile=RankProfile(list_features=True))
    data = self.app.collect_training_data(
        labeled_data=labeled_data,
        id_field="vespa_id_field",
        query_model=query_model,
        number_additional_docs=2,
        timeout="15s",
    )
    self.app.collect_training_data_point.assert_has_calls(
        [
            call(
                query="this is a query",
                query_id=123,
                relevant_id="abc",
                id_field="vespa_id_field",
                query_model=query_model,
                number_additional_docs=2,
                relevant_score=1,
                default_score=0,
                timeout="15s",
            )
        ]
    )
    assert_frame_equal(data, DataFrame.from_records(mock_return_value))
def test_collect_training_data_point_0_recall_hits(self):
    self.raw_vespa_result_recall = {
        "root": {
            "id": "toplevel",
            "relevance": 1.0,
            "fields": {"totalCount": 0},
            "coverage": {
                "coverage": 100,
                "documents": 62529,
                "full": True,
                "nodes": 2,
                "results": 1,
                "resultsFull": 1,
            },
        }
    }
    self.app.query = Mock(
        side_effect=[
            VespaQueryResponse(
                self.raw_vespa_result_recall, status_code=None, url=None
            ),
            VespaQueryResponse(
                self.raw_vespa_result_additional, status_code=None, url=None
            ),
        ]
    )
    query_model = QueryModel(rank_profile=RankProfile(list_features=True))
    data = self.app.collect_training_data_point(
        query="this is a query",
        query_id="123",
        relevant_id="abc",
        id_field="vespa_id_field",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
        timeout="15s",
    )
    self.assertEqual(self.app.query.call_count, 1)
    self.app.query.assert_has_calls(
        [
            call(
                query="this is a query",
                query_model=query_model,
                recall=("vespa_id_field", ["abc"]),
                timeout="15s",
            ),
        ]
    )
    expected_data = []
    self.assertEqual(data, expected_data)
def test_query_properties_match_and_rank(self):
    query_model = QueryModel(
        query_properties=[
            QueryRankingFeature(name="query_vector", mapping=lambda x: [1, 2, 3])
        ],
        match_phase=OR(),
        rank_profile=RankProfile(name="bm25", list_features=True),
    )
    self.assertDictEqual(
        query_model.create_body(query=self.query),
        {
            "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {"profile": "bm25", "listFeatures": "true"},
            "ranking.features.query(query_vector)": "[1, 2, 3]",
        },
    )

    query_model = QueryModel(
        query_properties=[
            QueryRankingFeature(name="query_vector", mapping=lambda x: [1, 2, 3])
        ],
        match_phase=ANN(
            doc_vector="doc_vector",
            query_vector="query_vector",
            hits=10,
            label="label",
        ),
        rank_profile=RankProfile(name="bm25", list_features=True),
    )
    self.assertDictEqual(
        query_model.create_body(query=self.query),
        {
            "yql": 'select * from sources * where ([{"targetNumHits": 10, "label": "label", "approximate": true}]nearestNeighbor(doc_vector, query_vector));',
            "ranking": {"profile": "bm25", "listFeatures": "true"},
            "ranking.features.query(query_vector)": "[1, 2, 3]",
        },
    )
def test_query_with_body_function(self):
    app = Vespa(url="http://localhost", port=8080)

    def body_function(query):
        body = {
            "yql": "select * from sources * where userQuery();",
            "query": query,
            "type": "any",
            "ranking": {"profile": "bm25", "listFeatures": "true"},
        }
        return body

    query_model = QueryModel(body_function=body_function)
    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=query_model,
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        ).request_body,
        {
            "yql": "select * from sources * where userQuery();",
            "query": "this is a test",
            "type": "any",
            "ranking": {"profile": "bm25", "listFeatures": "true"},
            "hits": 10,
            "recall": "+(id:1 id:5)",
        },
    )
async def execute_async_data_operations(
    self,
    app,
    schema_name,
    fields_to_send,
    fields_to_update,
    expected_fields_from_get_operation,
):
    """
    Async feed, get, update and delete data to/from the application.

    :param app: Vespa instance holding the connection to the application.
    :param schema_name: Name of the schema to which the documents are fed and from
        which they are retrieved.
    :param fields_to_send: List of dicts where keys are field names and values are
        field values. Each dict must contain an 'id' field.
    :param fields_to_update: Dict where keys are field names and values are field values.
    :param expected_fields_from_get_operation: Dict containing fields as returned by the
        Vespa get operation. There are cases where fields returned from Vespa differ from
        the inputs, e.g. when dealing with Tensors.
    :return:
    """
    async with app.asyncio() as async_app:
        #
        # Get data that does not exist
        #
        response = await async_app.get_data(
            schema=schema_name, data_id=fields_to_send[0]["id"]
        )
        self.assertEqual(response.status_code, 404)

        #
        # Feed some data points
        #
        feed = []
        for fields in fields_to_send:
            feed.append(
                asyncio.create_task(
                    async_app.feed_data_point(
                        schema=schema_name,
                        data_id=fields["id"],
                        fields=fields,
                    )
                )
            )
        await asyncio.wait(feed, return_when=asyncio.ALL_COMPLETED)
        result = feed[0].result().json
        self.assertEqual(
            result["id"],
            "id:{}:{}::{}".format(
                schema_name, schema_name, fields_to_send[0]["id"]
            ),
        )

        self.assertEqual(
            await async_app.feed_data_point(
                schema="msmarco",
                data_id="1",
                fields={
                    "id": "1",
                    "title": "this is title 1",
                    "body": "this is body 1",
                },
            ),
            app.feed_data_point(
                schema="msmarco",
                data_id="1",
                fields={
                    "id": "1",
                    "title": "this is title 1",
                    "body": "this is body 1",
                },
            ),
        )

        #
        # Get data that exists
        #
        response = await async_app.get_data(
            schema=schema_name, data_id=fields_to_send[0]["id"]
        )
        self.assertEqual(response.status_code, 200)
        result = response.json
        self.assertDictEqual(
            result,
            {
                "fields": expected_fields_from_get_operation[0],
                "id": "id:{}:{}::{}".format(
                    schema_name, schema_name, fields_to_send[0]["id"]
                ),
                "pathId": "/document/v1/{}/{}/docid/{}".format(
                    schema_name, schema_name, fields_to_send[0]["id"]
                ),
            },
        )

        #
        # Update data
        #
        response = await async_app.update_data(
            schema=schema_name,
            data_id=fields_to_send[0]["id"],
            fields=fields_to_update,
        )
        result = response.json
        self.assertEqual(
            result["id"],
            "id:{}:{}::{}".format(
                schema_name, schema_name, fields_to_send[0]["id"]
            ),
        )

        #
        # Get the updated data point
        #
        response = await async_app.get_data(
            schema=schema_name, data_id=fields_to_send[0]["id"]
        )
        self.assertEqual(response.status_code, 200)
        result = response.json
        expected_result = {
            k: v for k, v in expected_fields_from_get_operation[0].items()
        }
        expected_result.update(fields_to_update)
        self.assertDictEqual(
            result,
            {
                "fields": expected_result,
                "id": "id:{}:{}::{}".format(
                    schema_name, schema_name, fields_to_send[0]["id"]
                ),
                "pathId": "/document/v1/{}/{}/docid/{}".format(
                    schema_name, schema_name, fields_to_send[0]["id"]
                ),
            },
        )

        #
        # Delete a data point
        #
        response = await async_app.delete_data(
            schema=schema_name, data_id=fields_to_send[0]["id"]
        )
        result = response.json
        self.assertEqual(
            result["id"],
            "id:{}:{}::{}".format(
                schema_name, schema_name, fields_to_send[0]["id"]
            ),
        )

        #
        # Deleted data should be gone
        #
        response = await async_app.get_data(
            schema=schema_name, data_id=fields_to_send[0]["id"]
        )
        self.assertEqual(response.status_code, 404)

        #
        # Issue a bunch of queries in parallel
        #
        queries = []
        for i in range(10):
            queries.append(
                asyncio.create_task(
                    async_app.query(
                        query="sddocname:{}".format(schema_name),
                        query_model=QueryModel(),
                        timeout=5000,
                    )
                )
            )
        await asyncio.wait(queries, return_when=asyncio.ALL_COMPLETED)
        self.assertEqual(
            queries[0].result().number_documents_indexed,
            len(fields_to_send) - 1,
        )
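# A hedged sketch (assumptions only, not part of the suite) of driving the async helper
# above from a synchronous test via asyncio.run; the "msmarco" schema and field values
# are reused from other tests in this section, and the updated title is illustrative.
#
#     fields_to_send = [{"id": "1", "title": "this is title 1", "body": "this is body 1"}]
#     fields_to_update = {"title": "this is an updated title 1"}
#     asyncio.run(
#         self.execute_async_data_operations(
#             app=app,
#             schema_name="msmarco",
#             fields_to_send=fields_to_send,
#             fields_to_update=fields_to_update,
#             expected_fields_from_get_operation=fields_to_send,
#         )
#     )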
def test_workflow(self):
    #
    # Connect to a running Vespa Application
    #
    app = Vespa(url="https://api.cord19.vespa.ai")

    #
    # Define a query model
    #
    match_phase = Union(
        WeakAnd(hits=10),
        ANN(
            doc_vector="title_embedding",
            query_vector="title_vector",
            hits=10,
            label="title",
        ),
    )
    rank_profile = Ranking(name="bm25", list_features=True)
    query_model = QueryModel(
        name="ANN_bm25",
        query_properties=[
            QueryRankingFeature(
                name="title_vector",
                mapping=lambda x: [random() for x in range(768)],
            )
        ],
        match_phase=match_phase,
        rank_profile=rank_profile,
    )

    #
    # Query Vespa app
    #
    query_result = app.query(
        query="Is remdesivir an effective treatment for COVID-19?",
        query_model=query_model,
    )
    self.assertTrue(query_result.number_documents_retrieved > 0)
    self.assertEqual(len(query_result.hits), 10)

    #
    # Define labelled data
    #
    labeled_data = [
        {
            "query_id": 0,
            "query": "Intrauterine virus infections and congenital heart disease",
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]
    # equivalent data in df format
    labeled_data_df = DataFrame(
        data={
            "qid": [0, 0, 1, 1],
            "query": ["Intrauterine virus infections and congenital heart disease"] * 2
            + [
                "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
            ]
            * 2,
            "doc_id": [0, 3, 1, 5],
            "relevance": [1, 1, 1, 1],
        }
    )

    #
    # Collect training data
    #
    training_data_batch = app.collect_training_data(
        labeled_data=labeled_data,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    self.assertTrue(training_data_batch.shape[0] > 0)
    self.assertEqual(
        len(
            {"document_id", "query_id", "label"}.intersection(
                set(training_data_batch.columns)
            )
        ),
        3,
    )

    #
    # Evaluate a query model
    #
    eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 1))

    #
    # AssertionError - two models with the same name
    #
    with self.assertRaises(AssertionError):
        _ = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=[QueryModel(), QueryModel(), query_model],
            id_field="id",
        )

    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=[QueryModel(), query_model],
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 2))

    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
    )
    self.assertEqual(evaluation.shape, (15, 1))

    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
        per_query=True,
    )
    self.assertEqual(evaluation.shape, (2, 7))