def test_parse_as_dataframe(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    df = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_dataframe()
    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(
        set(df.index.names), {"classification_type", "global_metrics.field.name"}
    )
    self.assertEqual(set(df.columns), {"avg_f1_micro", "avg_nb_classes", "doc_count"})
    self.assertEqual(
        df.index.to_list(),
        [
            ("multilabel", "ispracticecompatible"),
            ("multilabel", "gpc"),
            ("multilabel", "preservationmethods"),
            ("multiclass", "kind"),
            ("multiclass", "gpc"),
        ],
    )
    self.assertEqual(
        df.to_dict(orient="records"),
        [
            {"avg_f1_micro": 0.72, "avg_nb_classes": 18.71, "doc_count": 128},
            {"avg_f1_micro": 0.95, "avg_nb_classes": 183.21, "doc_count": 119},
            {"avg_f1_micro": 0.8, "avg_nb_classes": 9.97, "doc_count": 76},
            {"avg_f1_micro": 0.89, "avg_nb_classes": 206.5, "doc_count": 370},
            {"avg_f1_micro": 0.93, "avg_nb_classes": 211.12, "doc_count": 198},
        ],
    )
def test_parse_as_tree(self, *_):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).to_tree()
    self.assertIsInstance(response, AggsResponseTree)
    self.assertEqual(str(response), sample.EXPECTED_RESPONSE_TREE_REPR)
def test_grouping_agg(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    agg_response = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    )
    # none provided
    self.assertEqual(
        agg_response._grouping_agg().identifier, "global_metrics.field.name"
    )
    # unknown agg name provided
    with self.assertRaises(ValueError):
        agg_response._grouping_agg("yolo")
    # non-bucket agg provided
    with self.assertRaises(ValueError):
        agg_response._grouping_agg("avg_f1_micro")
    # real agg name provided
    self.assertEqual(
        agg_response._grouping_agg("global_metrics.field.name").identifier,
        "global_metrics.field.name",
    )
def test_parse_as_tabular(self):
    # with single agg at root
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_tabular(row_as_tuple=True)
    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        [
            (
                ("multilabel", "ispracticecompatible"),
                {"avg_f1_micro": 0.72, "avg_nb_classes": 18.71, "doc_count": 128},
            ),
            (
                ("multilabel", "gpc"),
                {"avg_f1_micro": 0.95, "avg_nb_classes": 183.21, "doc_count": 119},
            ),
            (
                ("multilabel", "preservationmethods"),
                {"avg_f1_micro": 0.8, "avg_nb_classes": 9.97, "doc_count": 76},
            ),
            (
                ("multiclass", "kind"),
                {"avg_f1_micro": 0.89, "avg_nb_classes": 206.5, "doc_count": 370},
            ),
            (
                ("multiclass", "gpc"),
                {"avg_f1_micro": 0.93, "avg_nb_classes": 211.12, "doc_count": 198},
            ),
        ],
    )
def test_parse_as_tabular_multiple_roots(self):
    # with multiple aggs at root
    my_agg = Aggs(
        {
            "classification_type": {"terms": {"field": "classification_type"}},
            "avg_f1_score": {
                "avg": {"field": "global_metrics.performance.test.micro.f1_score"}
            },
        }
    )
    raw_response = {
        "classification_type": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "multiclass", "doc_count": 439},
                {"key": "multilabel", "doc_count": 433},
            ],
        },
        "avg_f1_score": {"value": 0.815},
    }
    index_names, index_values = Aggregations(
        data=raw_response,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_tabular(row_as_tuple=True, expand_sep=" || ")
    self.assertEqual(index_names, [])
    self.assertEqual(
        index_values,
        [
            (
                (),
                {
                    "avg_f1_score": 0.815,
                    "classification_type || multiclass": 439,
                    "classification_type || multilabel": 433,
                },
            )
        ],
    )
def test_normalize_buckets(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_normalized()
    self.assertEqual(ordered(response), ordered(sample.EXPECTED_NORMALIZED_RESPONSE))
def scan_composite_agg_at_once(self, size: int) -> Aggregations:
    """Iterate over all buckets of the aggregation (converting the Aggs into a
    composite aggregation if possible), and return them all at once in a single
    Aggregations instance.
    """
    all_buckets = list(self.scan_composite_agg(size=size))
    s: Search = self._clone().size(0)
    s._aggs = s._aggs.as_composite(size=size)
    agg_name: AggName
    agg_name, _ = s._aggs.get_composition_supporting_agg()  # type: ignore
    # artificially merge all buckets as if they were returned in a single query
    return Aggregations(_search=s, data={agg_name: {"buckets": all_buckets}})
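# A minimal usage sketch (not part of this module; `client`, the "movies" index and
# its "genre" keyword field are assumptions, and passing a client/index to Search via
# `using`/`index` follows elasticsearch-dsl-style usage). The idea is to page through
# a large terms aggregation via composite pagination, 1000 buckets per request, and
# get the merged result back as a single Aggregations instance:
#
#     search = Search(using=client, index="movies").aggs(
#         {"genres": {"terms": {"field": "genre"}}}
#     )
#     aggregations = search.scan_composite_agg_at_once(size=1000)
#     df = aggregations.to_dataframe()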
def test_parse_as_dataframe(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    df = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_dataframe(grouped_by="global_metrics.field.name")
    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(
        set(df.index.names), {"classification_type", "global_metrics.field.name"}
    )
    self.assertEqual(set(df.columns), {"avg_f1_micro", "avg_nb_classes", "doc_count"})
    self.assertEqual(
        df.to_dict(orient="index"),
        {
            ("multiclass", "gpc"): {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "doc_count": 198,
            },
            ("multiclass", "kind"): {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "doc_count": 370,
            },
            ("multilabel", "ispracticecompatible"): {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "doc_count": 128,
            },
            ("multilabel", "preservationmethods"): {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "doc_count": 76,
            },
        },
    )
def test_grouping_agg(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    agg_response = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    )
    # none provided
    self.assertIsNone(agg_response._grouping_agg()[0])
    # unknown agg name provided
    with self.assertRaises(KeyError):
        agg_response._grouping_agg("yolo")
    # non-bucket agg provided
    with self.assertRaises(ValueError):
        agg_response._grouping_agg("avg_f1_micro")
    # real agg name provided
    self.assertEqual(
        agg_response._grouping_agg("global_metrics.field.name")[0],
        "global_metrics.field.name",
    )
def test_parse_as_tabular_multiple_roots(self):
    # with multiple aggs at root
    my_agg = Aggs(
        {
            "classification_type": {"terms": {"field": "classification_type"}},
            "avg_f1_score": {
                "avg": {"field": "global_metrics.performance.test.micro.f1_score"}
            },
        }
    )
    raw_response = {
        "classification_type": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "multiclass", "doc_count": 439},
                {"key": "multilabel", "doc_count": 433},
            ],
        },
        "avg_f1_score": {"value": 0.815},
    }
    index_names, index_values = Aggregations(
        data=raw_response, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=True, expand_sep=" || ")
    self.assertEqual(index_names, [])
    self.assertEqual(
        index_values,
        {
            (): {
                "avg_f1_score": 0.815,
                "classification_type || multiclass": 439,
                "classification_type || multilabel": 433,
            }
        },
    )

    # with specified grouped_by
    index_names, index_values = Aggregations(
        data=raw_response, _search=Search().aggs(my_agg)
    ).to_tabular(grouped_by="classification_type")
    self.assertEqual(index_names, ["classification_type"])
    self.assertEqual(
        index_values,
        {
            ("multiclass",): {"doc_count": 439},
            ("multilabel",): {"doc_count": 433},
        },
    )
def test_parse_as_tabular(self):
    # with single agg at root
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=True, grouped_by="global_metrics.field.name")
    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        {
            ("multilabel", "ispracticecompatible"): {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "doc_count": 128,
            },
            ("multilabel", "preservationmethods"): {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "doc_count": 76,
            },
            ("multiclass", "kind"): {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "doc_count": 370,
            },
            ("multiclass", "gpc"): {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "doc_count": 198,
            },
        },
    )

    # index_orient = False
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=False, grouped_by="global_metrics.field.name")
    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        [
            {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "classification_type": "multilabel",
                "doc_count": 128,
                "global_metrics.field.name": "ispracticecompatible",
            },
            {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "classification_type": "multilabel",
                "doc_count": 76,
                "global_metrics.field.name": "preservationmethods",
            },
            {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "classification_type": "multiclass",
                "doc_count": 370,
                "global_metrics.field.name": "kind",
            },
            {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "classification_type": "multiclass",
                "doc_count": 198,
                "global_metrics.field.name": "gpc",
            },
        ],
    )
def test_normalize_buckets(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_normalized()
    self.assertEqual(ordered(response), ordered(sample.EXPECTED_NORMALIZED_RESPONSE))