예제 #1
0
    def test_parse_as_dataframe(self):
        """Check the dataframe produced by ``serialize_as_dataframe``.

        Verifies the returned type, the index names, the column names, the
        order of the index entries and the per-row values.
        """
        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
        df = Aggregations(
            data=sample.ES_AGG_RESPONSE,
            aggs=my_agg,
            index=None,
            client=None,
            query=None,
        ).serialize_as_dataframe()
        self.assertIsInstance(df, pd.DataFrame)
        self.assertEqual(set(df.index.names),
                         {"classification_type", "global_metrics.field.name"})
        self.assertEqual(set(df.columns),
                         {"avg_f1_micro", "avg_nb_classes", "doc_count"})
        self.assertEqual(
            df.index.to_list(),
            [
                ("multilabel", "ispracticecompatible"),
                ("multilabel", "gpc"),
                ("multilabel", "preservationmethods"),
                ("multiclass", "kind"),
                ("multiclass", "gpc"),
            ],
        )

        # "records" is the documented orient for a list of one dict per row.
        # The former "rows" spelling is not a documented value: it was only
        # accepted through legacy prefix matching, which recent pandas
        # releases no longer support.
        self.assertEqual(
            df.to_dict(orient="records"),
            [
                {
                    "avg_f1_micro": 0.72,
                    "avg_nb_classes": 18.71,
                    "doc_count": 128
                },
                {
                    "avg_f1_micro": 0.95,
                    "avg_nb_classes": 183.21,
                    "doc_count": 119
                },
                {
                    "avg_f1_micro": 0.8,
                    "avg_nb_classes": 9.97,
                    "doc_count": 76
                },
                {
                    "avg_f1_micro": 0.89,
                    "avg_nb_classes": 206.5,
                    "doc_count": 370
                },
                {
                    "avg_f1_micro": 0.93,
                    "avg_nb_classes": 211.12,
                    "doc_count": 198
                },
            ],
        )
예제 #2
0
 def test_parse_as_tree(self, *_):
     """Check that ``to_tree`` yields an ``AggsResponseTree`` with the expected text form.

     Any extra positional arguments are accepted and ignored.
     """
     my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
     response = Aggregations(
         data=sample.ES_AGG_RESPONSE,
         aggs=my_agg,
         index=None,
         client=None,
         query=None,
     ).to_tree()
     self.assertIsInstance(response, AggsResponseTree)
     # Use the str() builtin rather than calling the dunder method directly.
     self.assertEqual(str(response), sample.EXPECTED_RESPONSE_TREE_REPR)
예제 #3
0
    def test_grouping_agg(self):
        """Exercise ``_grouping_agg`` with no name, rejected names and a valid name."""
        aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
        aggregations = Aggregations(
            data=sample.ES_AGG_RESPONSE,
            aggs=aggs,
            index=None,
            client=None,
            query=None,
        )
        expected_identifier = "global_metrics.field.name"

        # No name given: the returned aggregation carries the expected identifier.
        default_agg = aggregations._grouping_agg()
        self.assertEqual(default_agg.identifier, expected_identifier)

        # An unknown name, then a non-bucket aggregation name: both are
        # expected to raise ValueError.
        for rejected_name in ("yolo", "avg_f1_micro"):
            with self.assertRaises(ValueError):
                aggregations._grouping_agg(rejected_name)

        # An existing bucket aggregation name is returned as requested.
        explicit_agg = aggregations._grouping_agg("global_metrics.field.name")
        self.assertEqual(explicit_agg.identifier, expected_identifier)
예제 #4
0
    def test_parse_as_tabular(self):
        """Check ``serialize_as_tabular`` (rows as tuples) with a single root aggregation."""
        aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
        aggregations = Aggregations(
            data=sample.ES_AGG_RESPONSE,
            aggs=aggs,
            index=None,
            client=None,
            query=None,
        )
        index_names, index_values = aggregations.serialize_as_tabular(
            row_as_tuple=True)

        # Each expected row pairs an index key with its metric values.
        metric_names = ("avg_f1_micro", "avg_nb_classes", "doc_count")
        expected_rows = [
            (key, dict(zip(metric_names, metrics)))
            for key, metrics in (
                (("multilabel", "ispracticecompatible"), (0.72, 18.71, 128)),
                (("multilabel", "gpc"), (0.95, 183.21, 119)),
                (("multilabel", "preservationmethods"), (0.8, 9.97, 76)),
                (("multiclass", "kind"), (0.89, 206.5, 370)),
                (("multiclass", "gpc"), (0.93, 211.12, 198)),
            )
        ]

        self.assertEqual(
            index_names,
            ["classification_type", "global_metrics.field.name"],
        )
        self.assertEqual(index_values, expected_rows)
예제 #5
0
    def test_parse_as_tabular_multiple_roots(self):
        """Check ``serialize_as_tabular`` when two aggregations sit at the root.

        The expected result has no index names and a single row keyed by an
        empty tuple, with bucket columns named using the ``" || "`` separator.
        """
        root_aggs = Aggs({
            "classification_type": {
                "terms": {"field": "classification_type"},
            },
            "avg_f1_score": {
                "avg": {
                    "field": "global_metrics.performance.test.micro.f1_score",
                },
            },
        })
        es_response = {
            "classification_type": {
                "doc_count_error_upper_bound": 0,
                "sum_other_doc_count": 0,
                "buckets": [
                    {"key": "multiclass", "doc_count": 439},
                    {"key": "multilabel", "doc_count": 433},
                ],
            },
            "avg_f1_score": {"value": 0.815},
        }

        aggregations = Aggregations(
            data=es_response,
            aggs=root_aggs,
            index=None,
            client=None,
            query=None,
        )
        index_names, index_values = aggregations.serialize_as_tabular(
            row_as_tuple=True, expand_sep=" || ")

        expected_row = (
            (),
            {
                "avg_f1_score": 0.815,
                "classification_type || multiclass": 439,
                "classification_type || multilabel": 433,
            },
        )
        self.assertEqual(index_names, [])
        self.assertEqual(index_values, [expected_row])
예제 #6
0
 def test_normalize_buckets(self):
     """Compare ``serialize_as_normalized`` output with the expected sample, after ``ordered``."""
     aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
     aggregations = Aggregations(
         data=sample.ES_AGG_RESPONSE,
         aggs=aggs,
         index=None,
         client=None,
         query=None,
     )
     normalized = aggregations.serialize_as_normalized()
     actual = ordered(normalized)
     expected = ordered(sample.EXPECTED_NORMALIZED_RESPONSE)
     self.assertEqual(actual, expected)
예제 #7
0
 def scan_composite_agg_at_once(self, size: int) -> Aggregations:
     """Collect every bucket yielded by ``scan_composite_agg`` into one ``Aggregations``.

     The buckets produced by ``self.scan_composite_agg(size=size)`` are
     gathered into a list. They are then wrapped in an ``Aggregations`` built
     on a size-0 clone of this search whose aggs were converted with
     ``as_composite(size=size)``, under the name returned by
     ``get_composition_supporting_agg``.
     """
     collected_buckets = list(self.scan_composite_agg(size=size))
     composite_search: Search = self._clone().size(0)
     composite_search._aggs = composite_search._aggs.as_composite(size=size)
     agg_name: AggName
     agg_name, _unused = composite_search._aggs.get_composition_supporting_agg()  # type: ignore
     # Present the accumulated buckets as if one query had returned them all.
     merged_data = {agg_name: {"buckets": collected_buckets}}
     return Aggregations(_search=composite_search, data=merged_data)
예제 #8
0
    def test_parse_as_dataframe(self):
        """Check ``to_dataframe`` grouped by ``global_metrics.field.name``.

        Verifies the returned type, the index names, the column names and the
        index-oriented content of the frame.
        """
        aggs = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
        aggregations = Aggregations(
            data=sample.ES_AGG_RESPONSE,
            _search=Search().aggs(aggs),
        )
        frame = aggregations.to_dataframe(
            grouped_by="global_metrics.field.name")

        self.assertIsInstance(frame, pd.DataFrame)
        self.assertEqual(
            set(frame.index.names),
            {"classification_type", "global_metrics.field.name"},
        )
        self.assertEqual(
            set(frame.columns),
            {"avg_f1_micro", "avg_nb_classes", "doc_count"},
        )

        # Expected cell values, keyed by index tuple.
        metric_names = ("avg_f1_micro", "avg_nb_classes", "doc_count")
        expected_by_index = {
            key: dict(zip(metric_names, metrics))
            for key, metrics in (
                (("multiclass", "gpc"), (0.93, 211.12, 198)),
                (("multiclass", "kind"), (0.89, 206.5, 370)),
                (("multilabel", "ispracticecompatible"), (0.72, 18.71, 128)),
                (("multilabel", "preservationmethods"), (0.8, 9.97, 76)),
            )
        }
        self.assertEqual(frame.to_dict(orient="index"), expected_by_index)
예제 #9
0
    def test_grouping_agg(self):
        """Exercise ``_grouping_agg`` with no name, rejected names and a valid name."""
        aggs = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
        aggregations = Aggregations(
            data=sample.ES_AGG_RESPONSE,
            _search=Search().aggs(aggs),
        )

        # No name given: the first element of the result is None.
        default_result = aggregations._grouping_agg()
        self.assertIsNone(default_result[0])

        # An unknown name raises KeyError; a non-bucket aggregation name
        # raises ValueError.
        rejected_cases = (("yolo", KeyError), ("avg_f1_micro", ValueError))
        for rejected_name, expected_error in rejected_cases:
            with self.assertRaises(expected_error):
                aggregations._grouping_agg(rejected_name)

        # An existing bucket aggregation name comes back as the first element.
        explicit_result = aggregations._grouping_agg("global_metrics.field.name")
        self.assertEqual(explicit_result[0], "global_metrics.field.name")
예제 #10
0
    def test_parse_as_tabular_multiple_roots(self):
        """Check ``to_tabular`` when two aggregations sit at the root.

        Covers the default grouping (no index names, one row keyed by an empty
        tuple, bucket columns named with ``" || "``) and an explicit
        ``grouped_by="classification_type"``.
        """
        root_aggs = Aggs({
            "classification_type": {
                "terms": {"field": "classification_type"},
            },
            "avg_f1_score": {
                "avg": {
                    "field": "global_metrics.performance.test.micro.f1_score",
                },
            },
        })
        es_response = {
            "classification_type": {
                "doc_count_error_upper_bound": 0,
                "sum_other_doc_count": 0,
                "buckets": [
                    {"key": "multiclass", "doc_count": 439},
                    {"key": "multilabel", "doc_count": 433},
                ],
            },
            "avg_f1_score": {"value": 0.815},
        }

        # Without grouped_by
        ungrouped = Aggregations(
            data=es_response,
            _search=Search().aggs(root_aggs),
        )
        index_names, index_values = ungrouped.to_tabular(
            index_orient=True, expand_sep=" || ")

        expected_ungrouped = {
            (): {
                "avg_f1_score": 0.815,
                "classification_type || multiclass": 439,
                "classification_type || multilabel": 433,
            },
        }
        self.assertEqual(index_names, [])
        self.assertEqual(index_values, expected_ungrouped)

        # With an explicit grouped_by
        grouped = Aggregations(
            data=es_response,
            _search=Search().aggs(root_aggs),
        )
        index_names, index_values = grouped.to_tabular(
            grouped_by="classification_type")

        expected_grouped = {
            ("multiclass",): {"doc_count": 439},
            ("multilabel",): {"doc_count": 433},
        }
        self.assertEqual(index_names, ["classification_type"])
        self.assertEqual(index_values, expected_grouped)
예제 #11
0
    def test_parse_as_tabular(self):
        """Check ``to_tabular`` with a single root aggregation, in both orientations.

        Both calls group by ``global_metrics.field.name``; the first uses
        ``index_orient=True`` and the second ``index_orient=False``.
        """
        aggs = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
        expected_names = ["classification_type", "global_metrics.field.name"]

        def tabular(index_orient):
            # A fresh Aggregations and Search are built for every call.
            aggregations = Aggregations(
                data=sample.ES_AGG_RESPONSE,
                _search=Search().aggs(aggs),
            )
            return aggregations.to_tabular(
                index_orient=index_orient,
                grouped_by="global_metrics.field.name",
            )

        # index_orient = True: rows keyed by their index tuple
        index_names, index_values = tabular(True)

        metric_names = ("avg_f1_micro", "avg_nb_classes", "doc_count")
        expected_by_index = {
            key: dict(zip(metric_names, metrics))
            for key, metrics in (
                (("multilabel", "ispracticecompatible"), (0.72, 18.71, 128)),
                (("multilabel", "preservationmethods"), (0.8, 9.97, 76)),
                (("multiclass", "kind"), (0.89, 206.5, 370)),
                (("multiclass", "gpc"), (0.93, 211.12, 198)),
            )
        }
        self.assertEqual(index_names, expected_names)
        self.assertEqual(index_values, expected_by_index)

        # index_orient = False: flat rows that embed the index values
        index_names, index_values = tabular(False)

        column_names = (
            "avg_f1_micro",
            "avg_nb_classes",
            "classification_type",
            "doc_count",
            "global_metrics.field.name",
        )
        expected_flat_rows = [
            dict(zip(column_names, cells))
            for cells in (
                (0.72, 18.71, "multilabel", 128, "ispracticecompatible"),
                (0.8, 9.97, "multilabel", 76, "preservationmethods"),
                (0.89, 206.5, "multiclass", 370, "kind"),
                (0.93, 211.12, "multiclass", 198, "gpc"),
            )
        ]
        self.assertEqual(index_names, expected_names)
        self.assertEqual(index_values, expected_flat_rows)
예제 #12
0
 def test_normalize_buckets(self):
     """Compare ``to_normalized`` output with the expected sample, after ``ordered``."""
     aggs = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
     aggregations = Aggregations(
         data=sample.ES_AGG_RESPONSE,
         _search=Search().aggs(aggs),
     )
     normalized = aggregations.to_normalized()
     actual = ordered(normalized)
     expected = ordered(sample.EXPECTED_NORMALIZED_RESPONSE)
     self.assertEqual(actual, expected)