예제 #1
0
        def flatten(x, name=""):
            # We flatten into source fields e.g. if type=geo_point
            # location: {lat=52.38, lon=4.90}
            if name == "":
                is_source_field = False
                pd_dtype = "object"
            else:
                try:
                    pd_dtype = field_mapping_cache.field_name_pd_dtype(
                        name[:-1])
                    is_source_field = True
                except KeyError:
                    is_source_field = False
                    pd_dtype = "object"

            if not is_source_field and type(x) is dict:
                for a in x:
                    flatten(x[a], name + a + ".")
            elif not is_source_field and type(x) is list:
                for a in x:
                    flatten(a, name)
            elif is_source_field:  # only print source fields from mappings
                # (TODO - not so efficient for large number of fields and filtered mapping)
                field_name = name[:-1]

                # Coerce types - for now just datetime
                if pd_dtype == "datetime64[ns]":
                    x = elasticsearch_date_to_pandas_date(
                        x, field_mapping_cache.date_field_format(field_name))

                # Elasticsearch can have multiple values for a field. These are represented as lists, so
                # create lists for this pivot (see notes above)
                if field_name in out:
                    if type(out[field_name]) is not list:
                        field_as_list = [out[field_name]]
                        out[field_name] = field_as_list
                    out[field_name].append(x)
                else:
                    out[field_name] = x
            else:
                # Script fields end up here

                # Elasticsearch returns 'Infinity' as a string for np.inf values.
                # Map this to a numeric value to avoid this whole Series being classed as an object
                # TODO - create a lookup for script fields and dtypes to only map 'Infinity'
                #        if the field is numeric. This implementation will currently map
                #        any script field with "Infinity" as a string to np.inf
                if x == "Infinity":
                    out[name[:-1]] = np.inf
                else:
                    out[name[:-1]] = x
예제 #2
0
    def _metric_aggs(self,
                     query_compiler: "QueryCompiler",
                     pd_aggs,
                     numeric_only=True):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        results = {}
        fields = query_compiler._mappings.all_source_fields()
        if numeric_only:
            fields = [
                field for field in fields
                if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        for field in fields:
            for es_agg in es_aggs:
                if not field.is_es_agg_compatible(es_agg):
                    continue

                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{field.es_field_name}",
                        es_agg[0],
                        field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{field.es_field_name}",
                        es_agg,
                        field.aggregatable_es_field_name,
                    )

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        """
        Results are like (for 'sum', 'min')

             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
        min    1.000205e+02        0.000000e+00   0.000000e+00               0
        """
        for field in fields:
            values = []
            for es_agg, pd_agg in zip(es_aggs, pd_aggs):

                # If the field and agg aren't compatible we add a NaN
                if not field.is_es_agg_compatible(es_agg):
                    values.append(np.float64(np.NaN))
                    continue

                if isinstance(es_agg, tuple):
                    agg_value = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"]

                    # Pull multiple values from 'percentiles' result.
                    if es_agg[0] == "percentiles":
                        agg_value = agg_value["values"]

                    agg_value = agg_value[es_agg[1]]

                    # Need to convert 'Population' stddev and variance
                    # from Elasticsearch into 'Sample' stddev and variance
                    # which is what pandas uses.
                    if es_agg[1] in ("std_deviation", "variance"):
                        # Neither transformation works with count <=1
                        count = response["aggregations"][
                            f"{es_agg[0]}_{field.es_field_name}"]["count"]

                        # All of the below calculations result in NaN if count<=1
                        if count <= 1:
                            agg_value = np.float64(np.NaN)

                        elif es_agg[1] == "std_deviation":
                            agg_value *= count / (count - 1.0)

                        else:  # es_agg[1] == "variance"
                            # sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # sample_std=\sqrt{\frac{N}{N-1}population_std}
                            agg_value = np.sqrt((count / (count - 1.0)) *
                                                agg_value * agg_value)
                else:
                    agg_value = response["aggregations"][
                        f"{es_agg}_{field.es_field_name}"]
                    if "value_as_string" in agg_value and field.is_timestamp:
                        agg_value = elasticsearch_date_to_pandas_date(
                            agg_value["value_as_string"], field.es_date_format)
                    else:
                        agg_value = agg_value["value"]

                # These aggregations maintain the column datatype
                if pd_agg in ("max", "min"):
                    agg_value = field.np_dtype.type(agg_value)

                # Null usually means there were no results.
                if agg_value is None:
                    agg_value = np.float64(np.NaN)

                values.append(agg_value)

            results[field.index] = values if len(values) > 1 else values[0]

        return results
예제 #3
0
    def _unpack_metric_aggs(
        self,
        fields: List["Field"],
        es_aggs: Union[List[str], List[Tuple[str, str]]],
        pd_aggs: List[str],
        response: Dict[str, Any],
        numeric_only: Optional[bool],
        is_dataframe_agg: bool = False,
    ):
        """
        This method unpacks metric aggregations JSON response.
        This can be called either directly on an aggs query
        or on an individual bucket within a composite aggregation.

        Parameters
        ----------
        fields:
            a list of Field Mappings
        es_aggs:
            Eland Equivalent of aggs
        pd_aggs:
            a list of aggs
        response:
            a dict containing response from Elastic Search
        numeric_only:
            return either numeric values or NaN/NaT

        Returns
        -------
            a dictionary on which agg caluculations are done.
        """
        results: Dict[str, Any] = {}

        for field in fields:
            values = []
            for es_agg, pd_agg in zip(es_aggs, pd_aggs):
                # is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean()
                # If the field and agg aren't compatible we add a NaN/NaT for agg
                # If the field and agg aren't compatible we don't add NaN/NaT for an aggregation called through .mean()
                if not field.is_es_agg_compatible(es_agg):
                    if is_dataframe_agg and not numeric_only:
                        values.append(field.nan_value)
                    elif not is_dataframe_agg and numeric_only is False:
                        values.append(field.nan_value)
                    # Explicit condition for mad to add NaN because it doesn't support bool
                    elif is_dataframe_agg and numeric_only:
                        if pd_agg == "mad":
                            values.append(field.nan_value)
                    continue

                if isinstance(es_agg, tuple):
                    agg_value = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"]

                    # Pull multiple values from 'percentiles' result.
                    if es_agg[0] == "percentiles":
                        agg_value = agg_value["values"]

                    agg_value = agg_value[es_agg[1]]

                    # Need to convert 'Population' stddev and variance
                    # from Elasticsearch into 'Sample' stddev and variance
                    # which is what pandas uses.
                    if es_agg[1] in ("std_deviation", "variance"):
                        # Neither transformation works with count <=1
                        count = response["aggregations"][
                            f"{es_agg[0]}_{field.es_field_name}"]["count"]

                        # All of the below calculations result in NaN if count<=1
                        if count <= 1:
                            agg_value = np.NaN

                        elif es_agg[1] == "std_deviation":
                            agg_value *= count / (count - 1.0)

                        else:  # es_agg[1] == "variance"
                            # sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # sample_std=\sqrt{\frac{N}{N-1}population_std}
                            agg_value = np.sqrt((count / (count - 1.0)) *
                                                agg_value * agg_value)
                else:
                    agg_value = response["aggregations"][
                        f"{es_agg}_{field.es_field_name}"]["value"]

                # Null usually means there were no results.
                if agg_value is None or np.isnan(agg_value):
                    if is_dataframe_agg and not numeric_only:
                        agg_value = np.NaN
                    elif not is_dataframe_agg and numeric_only is False:
                        agg_value = np.NaN

                # Cardinality is always either NaN or integer.
                elif pd_agg == "nunique":
                    agg_value = int(agg_value)

                # If this is a non-null timestamp field convert to a pd.Timestamp()
                elif field.is_timestamp:
                    agg_value = elasticsearch_date_to_pandas_date(
                        agg_value, field.es_date_format)
                # If numeric_only is False | None then maintain column datatype
                elif not numeric_only:
                    # we're only converting to bool for lossless aggs like min, max, and median.
                    if pd_agg in {"max", "min", "median", "sum"}:
                        # 'sum' isn't representable with bool, use int64
                        if pd_agg == "sum" and field.is_bool:
                            agg_value = np.int64(agg_value)
                        else:
                            agg_value = field.np_dtype.type(agg_value)

                values.append(agg_value)

            # If numeric_only is True and We only have a NaN type field then we check for empty.
            if values:
                results[
                    field.column] = values if len(values) > 1 else values[0]

        return results