Пример #1
0
 def _dict2json(self, value, sub_schema, path, net_new_properties, _buffer):
     prefix = '{'
     for k, v in ((kk, value[kk]) for kk in sorted(value.keys())):
         if v == None or v == '':
             continue
         append(_buffer, prefix)
         prefix = COMMA
         if isinstance(k, binary_type):
             k = utf82unicode(k)
         if not isinstance(k, text_type):
             Log.error("Expecting property name to be a string")
         if k not in sub_schema:
             sub_schema[k] = {}
             net_new_properties.append(path+[k])
         append(_buffer, json.dumps(encode_property(k)))
         append(_buffer, COLON)
         self._typed_encode(v, sub_schema[k], path+[k], net_new_properties, _buffer)
     if prefix == COMMA:
         append(_buffer, COMMA+QUOTED_EXISTS_TYPE+COLON+'1}')
     else:
         append(_buffer, '{'+QUOTED_EXISTS_TYPE+COLON+'0}')
Пример #2
0
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn+ ".value")
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                es_query.aggs[key].percentiles.tdigest.compression = 2
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = columns[0].es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = columns[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v);',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(column.es_column)
                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                elif len(columns) <1:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    s.pull = jx_expression_to_function({"null":{}})
                else:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
                    s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr
                selfy = s.value.partial_eval().to_es_script(schema).expr

                script = {"scripted_metric": {
                    'init_script': 'params._agg.best = ' + nully + ';',
                    'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";",
                    'combine_script': 'return params._agg.best',
                    'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()',
                }}
                if schema.query_path[0] == ".":
                    es_query.aggs[canonical_name] = script
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
                else:
                    es_query.aggs[canonical_name] = {
                        "nested": {"path": schema.query_path[0]},
                        "aggs": {"_nested": script}
                    }
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value")
            else:
               Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = AndOp("and", split_where[0]).to_esfilter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
Пример #3
0
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn+ ".value")
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = columns[0].es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = columns[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v)',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(column.es_column)
                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                elif len(columns) <1:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    s.pull = jx_expression_to_function({"null":{}})
                else:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
                    s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr
                selfy = s.value.partial_eval().to_es_script(schema).expr

                script = {"scripted_metric": {
                    'init_script': 'params._agg.best = ' + nully + ';',
                    'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";",
                    'combine_script': 'return params._agg.best',
                    'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()',
                }}
                if schema.query_path[0] == ".":
                    es_query.aggs[canonical_name] = script
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
                else:
                    es_query.aggs[canonical_name] = {
                        "nested": {"path": schema.query_path[0]},
                        "aggs": {"_nested": script}
                    }
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value")
            else:
               Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = AndOp("and", split_where[0]).to_esfilter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
Пример #4
0
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    select = listwrap(query.select)

    new_select = Data(
    )  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if is_op(s.value, Variable_):
            s.query_path = query_path
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            split_select = split_expression_by_path(s.value,
                                                    schema,
                                                    lang=Painless)
            for si_key, si_value in split_select.items():
                if si_value:
                    if s.query_path:
                        Log.error(
                            "can not handle more than one depth per select")
                    s.query_path = si_key
            formula.append(s)

    acc = Aggs()
    for _, many in new_select.items():
        for s in many:
            canonical_name = s.name
            if s.aggregate in ("value_count", "count"):
                columns = frum.schema.values(s.value.var,
                                             exclude_type=(OBJECT, NESTED))
            else:
                columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    es_name = column.es_column + "_count"
                    if column.jx_type == EXISTS:
                        if column.nested_path[0] == query_path:
                            canonical_names.append("doc_count")
                            acc.add(
                                NestedAggs(column.nested_path[0]).add(
                                    CountAggs(s)))
                    else:
                        canonical_names.append("value")
                        acc.add(
                            NestedAggs(column.nested_path[0]).add(
                                ExprAggs(es_name, {
                                    "value_count": {
                                        "field": column.es_column
                                    }
                                }, s)))
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function(
                        {"add": canonical_names})
            elif s.aggregate == "median":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform median on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, s))
                s.pull = jx_expression_to_function("values.50\\.0")
            elif s.aggregate == "percentile":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform percentile on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                if is_text(
                        s.percentile) or s.percetile < 0 or 1 < s.percentile:
                    Log.error(
                        "Expecting percentile to be a float from 0.0 to 1.0")
                percent = mo_math.round(s.percentile * 100, decimal=6)

                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [percent],
                                "tdigest": {
                                    "compression": 2
                                }
                            }
                        }, s))
                s.pull = jx_expression_to_function(
                    join_field(["values", text_type(percent)]))
            elif s.aggregate == "cardinality":
                for column in columns:
                    path = column.es_column + "_cardinality"
                    acc.add(
                        ExprAggs(path,
                                 {"cardinality": {
                                     "field": column.es_column
                                 }}, s))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "stats":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform stats on columns with more than one type (script probably)"
                    )
                # REGULAR STATS
                acc.add(
                    ExprAggs(canonical_name, {
                        "extended_stats": {
                            "field": first(columns).es_column
                        }
                    }, s))
                s.pull = get_pull_stats()

                # GET MEDIAN TOO!
                select_median = s.copy()
                select_median.pull = jx_expression_to_function(
                    {"select": [{
                        "name": "median",
                        "value": "values.50\\.0"
                    }]})

                acc.add(
                    ExprAggs(
                        canonical_name + "_percentile", {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, select_median))

            elif s.aggregate == "union":
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'init_script':
                            'params._agg.terms = new HashSet()',
                            'map_script':
                            'for (v in doc[' + quote(column.es_column) +
                            '].values) params._agg.terms.add(v);',
                            'combine_script':
                            'return params._agg.terms.toArray()',
                            'reduce_script':
                            'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                        }
                    }
                    stats_name = column.es_column
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "count_values":
                # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
                # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'params': {
                                "_agg": {}
                            },
                            'init_script':
                            'params._agg.terms = new HashMap()',
                            'map_script':
                            'for (v in doc[' + quote(column.es_column) +
                            '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                            'combine_script':
                            'return params._agg.terms',
                            'reduce_script':
                            '''
                            HashMap output = new HashMap(); 
                            for (agg in params._aggs) {
                                if (agg!=null){
                                    for (e in agg.entrySet()) {
                                        String key = String.valueOf(e.getKey());
                                        output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                    } 
                                }
                            } 
                            return output;
                        '''
                        }
                    }
                    stats_name = encode_property(column.es_column)
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                if not columns:
                    s.pull = jx_expression_to_function(NULL)
                else:
                    for c in columns:
                        acc.add(
                            NestedAggs(c.nested_path[0]).add(
                                ExprAggs(
                                    canonical_name,
                                    {"extended_stats": {
                                        "field": c.es_column
                                    }}, s)))
                    s.pull = jx_expression_to_function(aggregates[s.aggregate])

    for i, s in enumerate(formula):
        s_path = [
            k for k, v in split_expression_by_path(
                s.value, schema=schema, lang=Painless).items() if v
        ]
        if len(s_path) == 0:
            # FOR CONSTANTS
            nest = NestedAggs(query_path)
            acc.add(nest)
        elif len(s_path) == 1:
            nest = NestedAggs(first(s_path))
            acc.add(nest)
        else:
            Log.error("do not know how to handle")

        canonical_name = s.name
        if is_op(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = jx_expression_to_function("doc_count")
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = Painless[TupleOp(
                    [NULL] *
                    len(s.value.terms))].partial_eval().to_es_script(schema)
                selfy = text_type(
                    Painless[s.value].partial_eval().to_es_script(schema))

                script = {
                    "scripted_metric": {
                        'init_script':
                        'params._agg.best = ' + nully + ';',
                        'map_script':
                        'params._agg.best = ' + expand_template(
                            MAX_OF_TUPLE, {
                                "expr1": "params._agg.best",
                                "expr2": selfy,
                                "dir": dir,
                                "op": op
                            }) + ";",
                        'combine_script':
                        'return params._agg.best',
                        'reduce_script':
                        'return params._aggs.stream().' + op + '(' +
                        expand_template(COMPARE_TUPLE, {
                            "dir": dir,
                            "op": op
                        }) + ').get()',
                    }
                }
                nest.add(
                    NestedAggs(query_path).add(
                        ExprAggs(canonical_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple",
                          agg=s.aggregate)
        elif s.aggregate == "count":
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "value_count": {
                            "script":
                            text_type(Painless[
                                s.value].partial_eval().to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function(join_field(["50.0"]))
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = mo_math.round(s.percentile * 100, decimal=6)
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [percent]
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text_type(percent)]))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            nest.add(
                ExprAggs(
                    key, {
                        "cardinality": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            nest.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "script":
                            text_type(Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, select_median))
            s.pull = get_pull_stats()
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            nest.add(
                TermsAggs(
                    canonical_name, {
                        "script_field":
                        text_type(Painless[s.value].to_es_script(schema))
                    }, s))
            s.pull = jx_expression_to_function("key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(aggregates[s.aggregate])
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text_type(
                                NumberOp(s.value).partial_eval().to_es_script(
                                    schema))
                        }
                    }, s))

    acc = NestedAggs(query_path).add(acc)
    split_decoders = get_decoders_by_path(query)
    split_wheres = split_expression_by_path(query.where,
                                            schema=frum.schema,
                                            lang=ES52)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    paths = list(reversed(sorted(split_wheres.keys() | split_decoders.keys())))
    for path in paths:
        literal_path = literal_field(path)
        decoder = split_decoders[literal_path]
        where = split_wheres[literal_path]

        for d in decoder:
            decoders[d.edge.dim] = d
            acc = d.append_query(path, acc)
            start += d.num_columns

        if where:
            acc = FilterAggs("_filter", AndOp(where), None).add(acc)
        acc = NestedAggs(path).add(acc)

    acc = NestedAggs('.').add(acc)
    acc = simplify(acc)
    es_query = wrap(acc.to_es(schema))

    es_query.size = 0

    with Timer("ES query time", silent=not DEBUG) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=not DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[
                query.format]
            if query.edges:
                output = formatter(aggs, acc, query, decoders, select)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, select)
            else:
                output = aggop_formatter(aggs, acc, query, decoders, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", cause=e)
Пример #5
0
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    leaf_columns = set(
        c.names["."] for c in columns if c.type not in STRUCT and (
            c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns
                         if c.nested_path[0] != ".")

    i = 0
    source = "fields"
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):

                if term.var == ".":
                    es_query.fields = None
                    source = "_source"
                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(
                                    cname)
                                new_select.append({
                                    "name":
                                    new_name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": new_name,
                                        "index": i,
                                        "child": "."
                                    }
                                })
                                i += 1
                else:
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(
                                        suffix)
                                    new_select.append({
                                        "name":
                                        new_name,
                                        "value":
                                        Variable(c.es_column),
                                        "put": {
                                            "name": new_name,
                                            "index": i,
                                            "child": "."
                                        }
                                    })
                                    i += 1

        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            elif select.value.var == "_id":
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            elif select.value.var in nested_columns or [
                    c for c in nested_columns
                    if c.startswith(select.value.var + ".")
            ]:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(
                            encode_property(select.value.var))
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": i,
                            "child": "."
                        }
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name":
                                        select.name,
                                        "value":
                                        Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": i,
                                            "child": cname[prefix_length:]
                                        }
                                    })
                    i += 1
        else:
            es_query.script_fields[literal_field(select.name)] = {
                "script": select.value.to_ruby()
            }
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {
                    "name": select.name,
                    "index": i,
                    "child": "."
                }
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = concat_field("fields",
                                  literal_field(encode_property(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Пример #6
0
def agg_field(acc, new_select, query_path, schema):
    for s in (s for _, many in new_select.items() for s in many):
        canonical_name = s.name
        if s.aggregate in ("value_count", "count"):
            columns = schema.values(s.value.var, exclude_type=(OBJECT, NESTED))
        else:
            columns = schema.values(s.value.var)

        if s.aggregate == "count":
            canonical_names = []
            for column in columns:
                es_name = column.es_column + "_count"
                if column.jx_type == EXISTS:
                    if column.nested_path[0] == query_path:
                        canonical_names.append("doc_count")
                        acc.add(
                            NestedAggs(column.nested_path[0]).add(
                                CountAggs(s)))
                else:
                    canonical_names.append("value")
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(
                                es_name,
                                {"value_count": {
                                    "field": column.es_column
                                }}, s)))
            if len(canonical_names) == 1:
                s.pull = jx_expression_to_function(canonical_names[0])
            else:
                s.pull = jx_expression_to_function({"add": canonical_names})
        elif s.aggregate == "median":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform median on columns with more than one type (script probably)"
                )
            # ES USES DIFFERENT METHOD FOR PERCENTILES
            key = canonical_name + " percentile"
            acc.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function("values.50\\.0")
        elif s.aggregate in ("and", "or"):
            columns = [c for c in columns if c.jx_type is BOOLEAN]
            op = aggregates[s.aggregate]
            if not columns:
                s.pull = jx_expression_to_function(NULL)
            else:
                for c in columns:
                    acc.add(
                        NestedAggs(c.nested_path[0]).add(
                            ExprAggs(canonical_name,
                                     {op: {
                                         "field": c.es_column
                                     }}, s)))
                # get_name = concat_field(canonical_name, "value")
                s.pull = jx_expression_to_function({
                    "case": [{
                        "when": {
                            "eq": {
                                "value": 1
                            }
                        },
                        "then": True
                    }, {
                        "when": {
                            "eq": {
                                "value": 0
                            }
                        },
                        "then": False
                    }]
                })
        elif s.aggregate == "percentile":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform percentile on columns with more than one type (script probably)"
                )
            # ES USES DIFFERENT METHOD FOR PERCENTILES
            key = canonical_name + " percentile"
            if is_text(s.percentile) or s.percetile < 0 or 1 < s.percentile:
                Log.error("Expecting percentile to be a float from 0.0 to 1.0")
            percent = mo_math.round(s.percentile * 100, decimal=6)

            acc.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [percent],
                            "tdigest": {
                                "compression": 2
                            }
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text(percent)]))
        elif s.aggregate == "cardinality":
            for column in columns:
                path = column.es_column + "_cardinality"
                acc.add(
                    ExprAggs(path,
                             {"cardinality": {
                                 "field": column.es_column
                             }}, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            columns = [c for c in columns if c.jx_type in NUMBER_TYPES]
            if len(columns) != 1:
                Log.error(
                    "Do not know how to perform stats on columns with more than one type (script probably)"
                )
            # REGULAR STATS
            acc.add(
                ExprAggs(
                    canonical_name,
                    {"extended_stats": {
                        "field": first(columns).es_column
                    }}, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            acc.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "field": first(columns).es_column,
                            "percents": [50]
                        }
                    }, select_median))

        elif s.aggregate == "union":
            for column in columns:
                script = {
                    "scripted_metric": {
                        'init_script':
                        'params._agg.terms = new HashSet()',
                        'map_script':
                        'for (v in doc[' + quote(column.es_column) +
                        '].values) params._agg.terms.add(v);',
                        'combine_script':
                        'return params._agg.terms.toArray()',
                        'reduce_script':
                        'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }
                }
                stats_name = column.es_column
                acc.add(
                    NestedAggs(column.nested_path[0]).add(
                        ExprAggs(stats_name, script, s)))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "count_values":
            # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
            # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
            for column in columns:
                script = {
                    "scripted_metric": {
                        'params': {
                            "_agg": {}
                        },
                        'init_script':
                        'params._agg.terms = new HashMap()',
                        'map_script':
                        'for (v in doc[' + quote(column.es_column) +
                        '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                        'combine_script':
                        'return params._agg.terms',
                        'reduce_script':
                        '''
                        HashMap output = new HashMap(); 
                        for (agg in params._aggs) {
                            if (agg!=null){
                                for (e in agg.entrySet()) {
                                    String key = String.valueOf(e.getKey());
                                    output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                } 
                            }
                        } 
                        return output;
                    '''
                    }
                }
                stats_name = encode_property(column.es_column)
                acc.add(
                    NestedAggs(column.nested_path[0]).add(
                        ExprAggs(stats_name, script, s)))
            s.pull = jx_expression_to_function("value")
        else:
            if not columns:
                s.pull = jx_expression_to_function(NULL)
            else:
                for c in columns:
                    acc.add(
                        NestedAggs(c.nested_path[0]).add(
                            ExprAggs(
                                canonical_name,
                                {"extended_stats": {
                                    "field": c.es_column
                                }}, s)))
                s.pull = jx_expression_to_function(aggregates[s.aggregate])
Пример #7
0
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    leaf_columns = set(c.names["."] for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    i = 0
    source = "fields"
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):

                if term.var == ".":
                    es_query.fields = None
                    source = "_source"
                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(cname)
                                new_select.append({
                                    "name": new_name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": new_name, "index": i, "child": "."}
                                })
                                i += 1
                else:
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(suffix)
                                    new_select.append({
                                        "name": new_name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": new_name, "index": i, "child": "."}
                                    })
                                    i += 1

        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var+".")]:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(encode_property(select.value.var))
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": select.name, "index": i, "child": cname[prefix_length:]}
                                    })
                    i += 1
        else:
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = concat_field("fields", literal_field(encode_property(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)