Пример #1
0
    def append_query(self, query_path, es_query):
        domain = self.domain
        domain_key = domain.key
        value = self.edge.value
        cnv = pull_functions[value.type]
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        exists = AndOp("and", [
            InOp("in", [value, Literal("literal", include)])
        ]).partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if isinstance(value, Variable):
            es_field = first(self.query.frum.schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match",
                {
                    "field": es_field,
                    "size": limit,
                    "order": {"_term": self.sorted} if self.sorted else None
                },
                self
            )
        else:
            match = TermsAggs(
                "_match",
                {
                    "script": {
                        "lang": "painless",
                        "inline": value.to_es_script(self.schema).script(self.schema)
                    },
                    "size": limit
                },
                self
            )
        output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # FIND NULLS AT EACH NESTED LEVEL
            for p in self.schema.query_path:
                if p == query_path:
                    # MISSING AT THE QUERY DEPTH
                    output.add(
                        NestedAggs(p).add(FilterAggs("_missing0", NotOp(None, exists), self).add(es_query))
                    )
                else:
                    # PARENT HAS NO CHILDREN, SO MISSING
                    column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
                    output.add(
                        NestedAggs(column.nested_path[0]).add(
                            FilterAggs(
                                "_missing1",
                                NotOp(None, ExistsOp(None, Variable(column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)))),
                                self
                            ).add(es_query)
                        )
                    )
        return output
Пример #2
0
def build_es_query(select, query_path, schema, query):
    acc = extract_aggs(select, query_path, schema)
    acc = NestedAggs(query_path).add(acc)
    split_decoders = get_decoders_by_path(query)
    split_wheres = split_expression_by_path(query.where,
                                            schema=schema,
                                            lang=ES52)
    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    paths = list(
        reversed(sorted(set(split_wheres.keys())
                        | set(split_decoders.keys()))))
    for path in paths:
        decoder = split_decoders.get(path, Null)
        where = split_wheres.get(path, Null)

        for d in decoder:
            decoders[d.edge.dim] = d
            acc = d.append_query(path, acc)
            start += d.num_columns

        if where:
            acc = FilterAggs("_filter", AndOp(where), None).add(acc)
        acc = NestedAggs(path).add(acc)
    acc = NestedAggs('.').add(acc)
    acc = simplify(acc)
    es_query = wrap(acc.to_es(schema))
    es_query.size = 0
    return acc, decoders, es_query
Пример #3
0
def aggop_to_es_queries(select, query_path, schema, query):
    base_agg = extract_aggs(select, query_path, schema)
    base_agg = NestedAggs(query_path).add(base_agg)

    all_paths, split_decoders, var_to_columns = pre_process(query)

    # WE LET EACH DIMENSION ADD ITS OWN CODE FOR HANDLING INNER JOINS
    concat_outer = query_to_outer_joins(query, all_paths, {}, var_to_columns)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    output = NestedAggs(".")
    for i, outer in enumerate(concat_outer.terms):
        acc = base_agg
        for p, path in enumerate(all_paths):
            decoder = split_decoders.get(path, Null)

            for d in decoder:
                decoders[d.edge.dim] = d
                acc = d.append_query(path, acc)
                start += d.num_columns

            where = first(nest.where for nest in outer.nests if nest.path == path).partial_eval()
            if where is FALSE:
                continue
            elif not where or where is TRUE:
                pass
            else:
                acc = FilterAggs("_filter" + text(i) + text(p), where, None).add(acc)
            acc = NestedAggs(path).add(acc)
        output.add(acc)
    output = simplify(output)
    es_query = to_data(output.to_es(schema))
    es_query.size = 0
    return output, decoders, es_query
Пример #4
0
    def append_query(self, query_path, es_query):
        domain = self.domain
        domain_key = domain.key
        value = self.edge.value
        cnv = pull_functions[value.type]
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        schema = self.schema
        exists = InOp([value, Literal(include)]).partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if is_op(value, Variable):
            es_field = first(schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match",
                {
                    "field": es_field,
                    "size": limit,
                    "order": {"_term": self.sorted} if self.sorted else None
                },
                self
            )
        else:
            match = TermsAggs(
                "_match",
                {
                    "script": text(Painless[value].to_es_script(schema)),
                    "size": limit
                },
                self
            )
        output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # IF ALL NESTED COLUMNS ARE NULL, DOES THE FILTER PASS?
            # MISSING AT THE QUERY DEPTH
            # columns = schema[value.var]
            concat_inner = split_expression(NotOp(exists), self.query)
            for i, term in enumerate(concat_inner.terms):
                acc = es_query
                for nest in term.nests:
                    if nest.where is not TRUE:
                        acc = NestedAggs(nest.path.var).add(FilterAggs("_missing" + text(i), nest.where, self).add(acc))
                output.add(acc)
        return output
Пример #5
0
    def append_query(self, query_path, es_query):
        domain = self.domain
        domain_key = domain.key
        value = Painless[self.edge.value]
        cnv = pull_functions[value.type]
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        schema = self.schema
        exists = Painless[AndOp([InOp([value,
                                       Literal(include)])])].partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if is_op(value, Variable):
            es_field = first(schema.leaves(
                value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match", {
                    "field": es_field,
                    "size": limit,
                    "order": {
                        "_term": self.sorted
                    } if self.sorted else None
                }, self)
        else:
            match = TermsAggs("_match", {
                "script": text(value.to_es_script(schema)),
                "size": limit
            }, self)
        output = Aggs().add(
            FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # IF ALL NESTED COLUMNS ARE NULL, DOES THE FILTER PASS?
            # MISSING AT THE QUERY DEPTH
            op, split = split_expression_by_path(NotOp(exists), schema)
            for i, p in enumerate(reversed(sorted(split.keys()))):
                e = split.get(p)
                if e:
                    not_match = NestedAggs(p).add(
                        FilterAggs("_missing" + text(i), e,
                                   self).add(es_query))
                    output.add(not_match)
        return output
Пример #6
0
def agg_formula(acc, formula, query_path, schema):
    # DUPLICATED FOR SCRIPTS, MAYBE THIS CAN BE PUT INTO A LANGUAGE?
    for i, s in enumerate(formula):
        canonical_name = s.name
        s_path = [
            k for k, v in split_expression_by_path(
                s.value, schema=schema, lang=Painless).items() if v
        ]
        if len(s_path) == 0:
            # FOR CONSTANTS
            nest = NestedAggs(query_path)
            acc.add(nest)
        elif len(s_path) == 1:
            nest = NestedAggs(first(s_path))
            acc.add(nest)
        else:
            raise Log.error("do not know how to handle")

        if is_op(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = jx_expression_to_function("doc_count")
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = Painless[TupleOp(
                    [NULL] *
                    len(s.value.terms))].partial_eval().to_es_script(schema)
                selfy = text(
                    Painless[s.value].partial_eval().to_es_script(schema))

                script = {
                    "scripted_metric": {
                        'init_script':
                        'params._agg.best = ' + nully + '.toArray();',
                        'map_script':
                        'params._agg.best = ' + expand_template(
                            MAX_OF_TUPLE, {
                                "expr1": "params._agg.best",
                                "expr2": selfy,
                                "dir": dir,
                                "op": op
                            }) + ";",
                        'combine_script':
                        'return params._agg.best',
                        'reduce_script':
                        'return params._aggs.stream().' + op + '(' +
                        expand_template(COMPARE_TUPLE, {
                            "dir": dir,
                            "op": op
                        }) + ').get()',
                    }
                }
                nest.add(
                    NestedAggs(query_path).add(
                        ExprAggs(canonical_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple",
                          agg=s.aggregate)
        elif s.aggregate == "count":
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "value_count": {
                            "script":
                            text(Painless[s.value].partial_eval().to_es_script(
                                schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function(join_field(["50.0"]))
        elif s.aggregate in ("and", "or"):
            key = literal_field(canonical_name + " " + s.aggregate)
            op = aggregates[s.aggregate]
            nest.add(
                ExprAggs(
                    key, {
                        op: {
                            "script":
                            text(Painless[NumberOp(
                                s.value)].to_es_script(schema))
                        }
                    }, s))
            # get_name = concat_field(canonical_name, "value")
            s.pull = jx_expression_to_function({
                "case": [{
                    "when": {
                        "eq": {
                            "value": 1
                        }
                    },
                    "then": True
                }, {
                    "when": {
                        "eq": {
                            "value": 0
                        }
                    },
                    "then": False
                }]
            })
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = mo_math.round(s.percentile * 100, decimal=6)
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [percent]
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text(percent)]))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            nest.add(
                ExprAggs(
                    key, {
                        "cardinality": {
                            "script": text(
                                Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script": text(
                                Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            nest.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, select_median))
            s.pull = get_pull_stats()
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            nest.add(
                TermsAggs(canonical_name, {
                    "script_field":
                    text(Painless[s.value].to_es_script(schema))
                }, s))
            s.pull = jx_expression_to_function("key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(aggregates[s.aggregate])
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text(
                                NumberOp(s.value).partial_eval().to_es_script(
                                    schema))
                        }
                    }, s))
Пример #7
0
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    select = listwrap(query.select)

    new_select = Data(
    )  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if is_op(s.value, Variable_):
            s.query_path = query_path
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            split_select = split_expression_by_path(s.value,
                                                    schema,
                                                    lang=Painless)
            for si_key, si_value in split_select.items():
                if si_value:
                    if s.query_path:
                        Log.error(
                            "can not handle more than one depth per select")
                    s.query_path = si_key
            formula.append(s)

    acc = Aggs()
    for _, many in new_select.items():
        for s in many:
            canonical_name = s.name
            if s.aggregate in ("value_count", "count"):
                columns = frum.schema.values(s.value.var,
                                             exclude_type=(OBJECT, NESTED))
            else:
                columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    es_name = column.es_column + "_count"
                    if column.jx_type == EXISTS:
                        if column.nested_path[0] == query_path:
                            canonical_names.append("doc_count")
                            acc.add(
                                NestedAggs(column.nested_path[0]).add(
                                    CountAggs(s)))
                    else:
                        canonical_names.append("value")
                        acc.add(
                            NestedAggs(column.nested_path[0]).add(
                                ExprAggs(es_name, {
                                    "value_count": {
                                        "field": column.es_column
                                    }
                                }, s)))
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function(
                        {"add": canonical_names})
            elif s.aggregate == "median":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform median on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, s))
                s.pull = jx_expression_to_function("values.50\\.0")
            elif s.aggregate == "percentile":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform percentile on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                if is_text(
                        s.percentile) or s.percetile < 0 or 1 < s.percentile:
                    Log.error(
                        "Expecting percentile to be a float from 0.0 to 1.0")
                percent = mo_math.round(s.percentile * 100, decimal=6)

                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [percent],
                                "tdigest": {
                                    "compression": 2
                                }
                            }
                        }, s))
                s.pull = jx_expression_to_function(
                    join_field(["values", text_type(percent)]))
            elif s.aggregate == "cardinality":
                for column in columns:
                    path = column.es_column + "_cardinality"
                    acc.add(
                        ExprAggs(path,
                                 {"cardinality": {
                                     "field": column.es_column
                                 }}, s))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "stats":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform stats on columns with more than one type (script probably)"
                    )
                # REGULAR STATS
                acc.add(
                    ExprAggs(canonical_name, {
                        "extended_stats": {
                            "field": first(columns).es_column
                        }
                    }, s))
                s.pull = get_pull_stats()

                # GET MEDIAN TOO!
                select_median = s.copy()
                select_median.pull = jx_expression_to_function(
                    {"select": [{
                        "name": "median",
                        "value": "values.50\\.0"
                    }]})

                acc.add(
                    ExprAggs(
                        canonical_name + "_percentile", {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, select_median))

            elif s.aggregate == "union":
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'init_script':
                            'params._agg.terms = new HashSet()',
                            'map_script':
                            'for (v in doc[' + quote(column.es_column) +
                            '].values) params._agg.terms.add(v);',
                            'combine_script':
                            'return params._agg.terms.toArray()',
                            'reduce_script':
                            'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                        }
                    }
                    stats_name = column.es_column
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "count_values":
                # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
                # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'params': {
                                "_agg": {}
                            },
                            'init_script':
                            'params._agg.terms = new HashMap()',
                            'map_script':
                            'for (v in doc[' + quote(column.es_column) +
                            '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                            'combine_script':
                            'return params._agg.terms',
                            'reduce_script':
                            '''
                            HashMap output = new HashMap(); 
                            for (agg in params._aggs) {
                                if (agg!=null){
                                    for (e in agg.entrySet()) {
                                        String key = String.valueOf(e.getKey());
                                        output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                    } 
                                }
                            } 
                            return output;
                        '''
                        }
                    }
                    stats_name = encode_property(column.es_column)
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                if not columns:
                    s.pull = jx_expression_to_function(NULL)
                else:
                    for c in columns:
                        acc.add(
                            NestedAggs(c.nested_path[0]).add(
                                ExprAggs(
                                    canonical_name,
                                    {"extended_stats": {
                                        "field": c.es_column
                                    }}, s)))
                    s.pull = jx_expression_to_function(aggregates[s.aggregate])

    for i, s in enumerate(formula):
        s_path = [
            k for k, v in split_expression_by_path(
                s.value, schema=schema, lang=Painless).items() if v
        ]
        if len(s_path) == 0:
            # FOR CONSTANTS
            nest = NestedAggs(query_path)
            acc.add(nest)
        elif len(s_path) == 1:
            nest = NestedAggs(first(s_path))
            acc.add(nest)
        else:
            Log.error("do not know how to handle")

        canonical_name = s.name
        if is_op(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = jx_expression_to_function("doc_count")
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = Painless[TupleOp(
                    [NULL] *
                    len(s.value.terms))].partial_eval().to_es_script(schema)
                selfy = text_type(
                    Painless[s.value].partial_eval().to_es_script(schema))

                script = {
                    "scripted_metric": {
                        'init_script':
                        'params._agg.best = ' + nully + ';',
                        'map_script':
                        'params._agg.best = ' + expand_template(
                            MAX_OF_TUPLE, {
                                "expr1": "params._agg.best",
                                "expr2": selfy,
                                "dir": dir,
                                "op": op
                            }) + ";",
                        'combine_script':
                        'return params._agg.best',
                        'reduce_script':
                        'return params._aggs.stream().' + op + '(' +
                        expand_template(COMPARE_TUPLE, {
                            "dir": dir,
                            "op": op
                        }) + ').get()',
                    }
                }
                nest.add(
                    NestedAggs(query_path).add(
                        ExprAggs(canonical_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple",
                          agg=s.aggregate)
        elif s.aggregate == "count":
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "value_count": {
                            "script":
                            text_type(Painless[
                                s.value].partial_eval().to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function(join_field(["50.0"]))
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = mo_math.round(s.percentile * 100, decimal=6)
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [percent]
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text_type(percent)]))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            nest.add(
                ExprAggs(
                    key, {
                        "cardinality": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            nest.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, select_median))
            s.pull = get_pull_stats()
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            nest.add(
                TermsAggs(
                    canonical_name, {
                        "script_field":
                        text_type(Painless[s.value].to_es_script(schema))
                    }, s))
            s.pull = jx_expression_to_function("key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(aggregates[s.aggregate])
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text_type(
                                NumberOp(s.value).partial_eval().to_es_script(
                                    schema))
                        }
                    }, s))

    acc = NestedAggs(query_path).add(acc)
    split_decoders = get_decoders_by_path(query)
    split_wheres = split_expression_by_path(query.where,
                                            schema=frum.schema,
                                            lang=ES52)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    paths = list(reversed(sorted(split_wheres.keys() | split_decoders.keys())))
    for path in paths:
        literal_path = literal_field(path)
        decoder = split_decoders[literal_path]
        where = split_wheres[literal_path]

        for d in decoder:
            decoders[d.edge.dim] = d
            acc = d.append_query(path, acc)
            start += d.num_columns

        if where:
            acc = FilterAggs("_filter", AndOp(where), None).add(acc)
        acc = NestedAggs(path).add(acc)

    acc = NestedAggs('.').add(acc)
    acc = simplify(acc)
    es_query = wrap(acc.to_es(schema))

    es_query.size = 0

    with Timer("ES query time", silent=not DEBUG) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=not DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[
                query.format]
            if query.edges:
                output = formatter(aggs, acc, query, decoders, select)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, select)
            else:
                output = aggop_formatter(aggs, acc, query, decoders, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", cause=e)
Пример #8
0
def agg_field(acc, new_select, query_path, schema):
    for s in (s for _, many in new_select.items() for s in many):
        canonical_name = s.name
        if s.aggregate in ("value_count", "count"):
            columns = schema.values(s.value.var, exclude_type=(OBJECT, NESTED))
        else:
            columns = schema.values(s.value.var)

        if s.aggregate == "count":
            canonical_names = []
            for column in columns:
                es_name = column.es_column + "_count"
                if column.jx_type == EXISTS:
                    if column.nested_path[0] == query_path:
                        canonical_names.append("doc_count")
                        acc.add(
                            NestedAggs(column.nested_path[0]).add(
                                CountAggs(s)))
                else:
                    canonical_names.append("value")
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(
                                es_name,
                                {"value_count": {
                                    "field": column.es_column
                                }}, s)))
            if len(canonical_names) == 1:
                s.pull = jx_expression_to_function(canonical_names[0])
            else:
                s.pull = jx_expression_to_function({"add": canonical_names})
        elif s.aggregate == "median":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform median on columns with more than one type (script probably)"
                )
            # ES USES DIFFERENT METHOD FOR PERCENTILES
            key = canonical_name + " percentile"
            acc.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function("values.50\\.0")
        elif s.aggregate in ("and", "or"):
            columns = [c for c in columns if c.jx_type is BOOLEAN]
            op = aggregates[s.aggregate]
            if not columns:
                s.pull = jx_expression_to_function(NULL)
            else:
                for c in columns:
                    acc.add(
                        NestedAggs(c.nested_path[0]).add(
                            ExprAggs(canonical_name,
                                     {op: {
                                         "field": c.es_column
                                     }}, s)))
                # get_name = concat_field(canonical_name, "value")
                s.pull = jx_expression_to_function({
                    "case": [{
                        "when": {
                            "eq": {
                                "value": 1
                            }
                        },
                        "then": True
                    }, {
                        "when": {
                            "eq": {
                                "value": 0
                            }
                        },
                        "then": False
                    }]
                })
        elif s.aggregate == "percentile":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform percentile on columns with more than one type (script probably)"
                )
            # ES USES DIFFERENT METHOD FOR PERCENTILES
            key = canonical_name + " percentile"
            if is_text(s.percentile) or s.percetile < 0 or 1 < s.percentile:
                Log.error("Expecting percentile to be a float from 0.0 to 1.0")
            percent = mo_math.round(s.percentile * 100, decimal=6)

            acc.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [percent],
                            "tdigest": {
                                "compression": 2
                            }
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text(percent)]))
        elif s.aggregate == "cardinality":
            for column in columns:
                path = column.es_column + "_cardinality"
                acc.add(
                    ExprAggs(path,
                             {"cardinality": {
                                 "field": column.es_column
                             }}, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform stats on columns with more than one type (script probably)"
                )
            # REGULAR STATS
            acc.add(
                ExprAggs(
                    canonical_name,
                    {"extended_stats": {
                        "field": first(columns).es_column
                    }}, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            acc.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [50]
                        }
                    }, select_median))

        elif s.aggregate == "union":
            for column in columns:
                script = {
                    "scripted_metric": {
                        'init_script':
                        'params._agg.terms = new HashSet()',
                        'map_script':
                        'for (v in doc[' + quote(column.es_column) +
                        '].values) params._agg.terms.add(v);',
                        'combine_script':
                        'return params._agg.terms.toArray()',
                        'reduce_script':
                        'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }
                }
                stats_name = column.es_column
                acc.add(
                    NestedAggs(column.nested_path[0]).add(
                        ExprAggs(stats_name, script, s)))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "count_values":
            # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
            # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
            for column in columns:
                script = {
                    "scripted_metric": {
                        'params': {
                            "_agg": {}
                        },
                        'init_script':
                        'params._agg.terms = new HashMap()',
                        'map_script':
                        'for (v in doc[' + quote(column.es_column) +
                        '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                        'combine_script':
                        'return params._agg.terms',
                        'reduce_script':
                        '''
                        HashMap output = new HashMap(); 
                        for (agg in params._aggs) {
                            if (agg!=null){
                                for (e in agg.entrySet()) {
                                    String key = String.valueOf(e.getKey());
                                    output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                } 
                            }
                        } 
                        return output;
                    '''
                    }
                }
                stats_name = encode_property(column.es_column)
                acc.add(
                    NestedAggs(column.nested_path[0]).add(
                        ExprAggs(stats_name, script, s)))
            s.pull = jx_expression_to_function("value")
        else:
            if not columns:
                s.pull = jx_expression_to_function(NULL)
            else:
                for c in columns:
                    acc.add(
                        NestedAggs(c.nested_path[0]).add(
                            ExprAggs(
                                canonical_name,
                                {"extended_stats": {
                                    "field": c.es_column
                                }}, s)))
                s.pull = jx_expression_to_function(aggregates[s.aggregate])