def es_query_proto(path, selects, wheres, schema):
    """
    BUILD AN ES QUERY BY VISITING EVERY PATH FOUND IN selects OR wheres, IN
    REVERSE-SORTED ORDER, WRAPPING THE QUERY BUILT SO FAR INTO THE NEXT ONE

    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME); NOT READ BY THIS FUNCTION
    :param selects: MAP FROM path TO SELECT CLAUSE (AN OBJECT PROVIDING to_es())
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :param schema: HANDED TO to_esfilter() WHEN THE WHERE CONDITIONS ARE CONVERTED
    :return: THE QUERY BUILT FOR THE LAST PATH VISITED (None WHEN THERE ARE NO PATHS)
    """
    es_query = None
    previous = MATCH_ALL
    all_paths = sorted(wheres.keys() | set(selects.keys()), reverse=True)

    for step in all_paths:
        conditions = wheres.get(step)
        select = selects.get(step)

        if conditions:
            # THIS PATH HAS ITS OWN FILTER; THE DEEPER QUERY (IF ANY) IS MADE OPTIONAL
            es_filter = AndOp(conditions).partial_eval().to_esfilter(schema)
            if es_query:
                es_filter = es_or([es_and([es_query, es_filter]), es_filter])
        elif not es_query:
            # NOTHING TO FILTER ON, AND NOTHING DEEPER TO CARRY
            es_filter = MATCH_ALL
        elif previous is MATCH_ALL:
            es_filter = es_or([es_query, MATCH_ALL])
        else:
            es_filter = es_query

        if step == ".":
            # TOP LEVEL: A COMPLETE QUERY BODY, MERGED WITH THE SELECT'S ES CLAUSES
            es_query = set_default(
                {"from": 0, "size": 0, "sort": [], "query": es_filter},
                select.to_es(),
            )
        else:
            inner_hits = (
                set_default({"size": 100000}, select.to_es()) if select else None
            )
            es_query = {
                "nested": {
                    "path": step,
                    "inner_hits": inner_hits,
                    "query": es_filter,
                }
            }

        previous = es_filter

    return es_query
def es_query_proto(path, selects, wheres, schema):
    """
    ASSEMBLE THE ES QUERY FOR ALL PATHS MENTIONED IN selects AND wheres

    PATHS ARE PROCESSED IN REVERSE-SORTED ORDER; EACH STEP EMBEDS THE QUERY
    PRODUCED BY THE PREVIOUS STEP

    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME); UNUSED HERE
    :param selects: MAP FROM path TO SELECT CLAUSE (to_es() IS CALLED ON IT)
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :param schema: GIVEN TO to_esfilter() FOR EACH WHERE CLAUSE
    :return: QUERY FROM THE FINAL STEP, OR None IF NO PATH WAS PROCESSED
    """
    query = None
    prior_filter = MATCH_ALL

    ordered = sorted(wheres.keys() | set(selects.keys()))
    ordered.reverse()

    for name in ordered:
        raw_where = wheres.get(name)
        select = selects.get(name)

        # DECIDE THE FILTER FOR THIS LEVEL
        if raw_where:
            current = AndOp(raw_where).partial_eval().to_esfilter(schema)
            if query:
                current = es_or([es_and([query, current]), current])
        elif query:
            if prior_filter is MATCH_ALL:
                current = es_or([query, MATCH_ALL])
            else:
                current = query
        else:
            current = MATCH_ALL

        # WRAP IT IN THE STRUCTURE FOR THIS LEVEL
        if name != ".":
            if select:
                hits = set_default({"size": 100000}, select.to_es())
            else:
                hits = None
            query = {"nested": {"path": name, "inner_hits": hits, "query": current}}
        else:
            top = {"from": 0, "size": 0, "sort": [], "query": current}
            query = set_default(top, select.to_es())

        prior_filter = current

    return query
def to_esfilter(self, schema):
    """
    CONVERT THIS OR-EXPRESSION (OVER self.terms) TO AN ES FILTER

    :param schema: USED TO READ THE CLUSTER VERSION, AND PASSED TO EACH TERM
    :return: ES FILTER; THE SHAPE DEPENDS ON WHETHER THE CLUSTER VERSION STARTS WITH "5."
    """
    # TODO: REPLICATE THIS WHOLE expression.py SO IT IS CLEAR ES5 QUERIES ARE A BIT DIFFERENT
    is_es5 = schema.snowflake.namespace.es_cluster.version.startswith("5.")

    if not is_es5:
        # VERSION 6.2
        return es_or([t.partial_eval().to_esfilter(schema) for t in self.terms])

    # VERSION 5.2.x
    # WE REQUIRE EXIT-EARLY SEMANTICS, OTHERWISE EVERY EXPRESSION IS A SCRIPT EXPRESSION
    # {"bool":{"should"  :[a, b, c]}} RUNS IN PARALLEL
    # {"bool":{"must_not":[a, b, c]}} ALSO RUNS IN PARALLEL
    # OR(x) == NOT(AND(NOT(xi) for xi in x))
    negated = []
    for t in self.terms:
        negated.append(NotOp("not", t).partial_eval().to_esfilter(schema))
    return es_not(es_and(negated))
def to_esfilter(self, schema):
    """
    CONVERT THE NEGATION OF self.term TO AN ES FILTER

    NOT(MISSING(variable)) IS EMITTED DIRECTLY AS `exists` FILTER(S) OVER THE
    COLUMNS THE SCHEMA REPORTS FOR THAT VARIABLE; EVERYTHING ELSE IS THE
    OPERAND'S FILTER WRAPPED BY es_not()

    :param schema: USED TO LOOK UP COLUMNS, AND PASSED TO THE OPERAND
    :return: ES FILTER
    """
    term = self.term

    if not (is_op(term, MissingOp_) and is_op(term.expr, Variable_)):
        return es_not(ES52[term].to_esfilter(schema))

    # PREVENT RECURSIVE LOOP
    cols = schema.values(term.expr.var, (OBJECT, NESTED))
    num_cols = len(cols)

    if num_cols == 0:
        return MATCH_NONE
    if num_cols == 1:
        return {"exists": {"field": first(cols).es_column}}

    exists_filters = []
    for c in cols:
        exists_filters.append({"exists": {"field": c.es_column}})
    return es_or(exists_filters)
def to_esfilter(self, schema):
    """
    CONVERT THIS OR-EXPRESSION (OVER self.terms) TO AN ES FILTER

    :param schema: USED TO READ THE CLUSTER VERSION, AND PASSED TO EACH TERM
    :return: ES FILTER; es_not(es_and(...)) WHEN THE CLUSTER VERSION STARTS
             WITH "5.", OTHERWISE es_or(...)
    """
    cluster_version = schema.snowflake.namespace.es_cluster.version

    if cluster_version.startswith("5."):
        # VERSION 5.2.x
        # WE REQUIRE EXIT-EARLY SEMANTICS, OTHERWISE EVERY EXPRESSION IS A SCRIPT EXPRESSION
        # {"bool":{"should"  :[a, b, c]}} RUNS IN PARALLEL
        # {"bool":{"must_not":[a, b, c]}} ALSO RUNS IN PARALLEL
        # OR(x) == NOT(AND(NOT(xi) for xi in x))
        negated = []
        for t in self.terms:
            negated.append(NotOp(t).partial_eval().to_esfilter(schema))
        return es_not(es_and(negated))

    # VERSION 6.2+
    converted = []
    for t in self.terms:
        converted.append(ES52[t].partial_eval().to_esfilter(schema))
    return es_or(converted)
def _normalize(esfilter):
    """
    SIMPLIFY AN ES FILTER: REALLY, WE JUST COLLAPSE CASCADING `and` AND `or` FILTERS

    TODO: DO NOT USE Data, WE ARE SPENDING TOO MUCH TIME WRAPPING/UNWRAPPING

    :param esfilter: FILTER TO SIMPLIFY; ATTRIBUTE ACCESS (esfilter.bool.filter, ETC.)
                     IS USED THROUGHOUT (NOTE(review): PRESUMABLY A WRAPPED Data - CONFIRM)
    :return: MATCH_ALL, MATCH_NONE, OR A SIMPLIFIED FILTER FLAGGED WITH isNormal
    """
    if esfilter == MATCH_ALL or esfilter == MATCH_NONE or esfilter.isNormal:
        return esfilter

    # Log.note("from: " + convert.value2json(esfilter))
    isDiff = True

    # KEEP REWRITING UNTIL A PASS MAKES NO CHANGE
    while isDiff:
        isDiff = False

        if esfilter.bool.filter:
            terms = esfilter.bool.filter
            # product() SNAPSHOTS ITS INPUTS, SO t0/t1 ARE THE ORIGINAL TERMS EVEN
            # AFTER terms[...] IS OVERWRITTEN, AND EVERY PAIR IS SEEN IN BOTH ORDERS
            for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)):
                if i0 == i1:
                    continue  # SAME, IGNORE

                # TERM FILTER ALREADY ASSUMES EXISTENCE
                with suppress_exception:
                    if (t0.exists.field != None and t0.exists.field == t1.term.items()[0][0]):
                        terms[i0] = MATCH_ALL
                        continue

                # IDENTICAL CAN BE REMOVED
                # ONLY DROP THE EARLIER OF THE TWO: BOTH (i0, i1) AND (i1, i0) ARE
                # VISITED, SO WITHOUT THE ORDER CHECK BOTH COPIES WOULD BE REPLACED
                # AND THE CONDITION WOULD BE LOST ENTIRELY
                with suppress_exception:
                    if i0 < i1 and t0 == t1:
                        terms[i0] = MATCH_ALL
                        continue

                # MERGE range FILTER WITH SAME FIELD
                if i0 > i1:
                    continue  # EACH UNORDERED PAIR IS MERGED ONCE
                with suppress_exception:
                    f0, tt0 = t0.range.items()[0]
                    f1, tt1 = t1.range.items()[0]
                    if f0 == f1:
                        set_default(terms[i0].range[literal_field(f1)], tt1)
                        terms[i1] = MATCH_ALL

            output = []
            for a in terms:
                if is_container(a):
                    from mo_logs import Log

                    Log.error("and clause is not allowed a list inside a list")
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_
                if a == MATCH_ALL:
                    # MATCH_ALL ADDS NOTHING TO AN `and`
                    isDiff = True
                    continue
                if a == MATCH_NONE:
                    # ONE MATCH_NONE MAKES THE WHOLE `and` MATCH NOTHING
                    return MATCH_NONE
                if a.bool.filter:
                    # NESTED `and`: PULL ITS TERMS UP INTO THIS ONE
                    isDiff = True
                    a.isNormal = None
                    output.extend(a.bool.filter)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_ALL
            elif len(output) == 1:
                # output[0].isNormal = True
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = es_and(output)
            continue

        if esfilter.bool.should:
            output = []
            for a in esfilter.bool.should:
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_

                if a.bool.should:
                    # NESTED `or`: PULL ITS TERMS UP INTO THIS ONE
                    a.isNormal = None
                    isDiff = True
                    output.extend(a.bool.should)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_NONE
            elif len(output) == 1:
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap(es_or(output))
            continue

        if esfilter.term != None:
            if esfilter.term.keys():
                esfilter.isNormal = True
                return esfilter
            else:
                # A term FILTER WITH NO FIELDS CONSTRAINS NOTHING
                return MATCH_ALL

        if esfilter.terms:
            for k, v in esfilter.terms.items():
                if len(v) > 0:
                    if OR(vv == None for vv in v):
                        # A None AMONG THE VALUES BECOMES A SEPARATE "MISSING" TEST
                        rest = [vv for vv in v if vv != None]
                        if len(rest) > 0:
                            output = es_or([es_missing(k), {"terms": {k: rest}}])
                        else:
                            output = es_missing(k)
                        output.isNormal = True
                        return output
                    else:
                        esfilter.isNormal = True
                        return esfilter
            # NO FIELD HAD ANY VALUES TO MATCH
            return MATCH_NONE

        if esfilter.bool.must_not:
            _sub = esfilter.bool.must_not
            sub = _normalize(_sub)
            if sub == MATCH_NONE:
                return MATCH_ALL
            elif sub == MATCH_ALL:
                return MATCH_NONE
            elif sub is not _sub:
                sub.isNormal = None
                return wrap({"bool": {"must_not": sub, "isNormal": True}})
            else:
                sub.isNormal = None

    esfilter.isNormal = True
    return esfilter
def es_setop(es, query):
    """
    TRANSLATE query.select INTO AN ES REQUEST, SEND IT, AND FORMAT THE HITS

    :param es: PASSED THROUGH TO es_post()
    :param query: READS frum.schema, where, limit, select, sort AND format
    :return: OUTPUT OF THE FORMATTER FOUND IN format_dispatch[query.format], WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    # BECOMES A LIST THE FIRST TIME A COLUMN WITH A DEEPER nested_path IS SELECTED
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema  # SAME VALUE AS ASSIGNED AT THE TOP
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    # INDEX OF THE NEXT OUTPUT COLUMN
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    # NESTED COLUMNS ARE READ FROM _source
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    # NO pull HERE; IT IS ASSIGNED IN THE LOOP OVER new_select BELOW
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            # ONE nested CLAUSE PER NESTED PATH, FOR THIS SELECT ONLY
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                # FIRST DEEPER COLUMN SEEN: REPLACE filters[0] WITH
                                # and(ORIGINAL where, or(nested_filter))
                                # nested_filter IS EXTENDED IN PLACE BELOW
                                # NOTE(review): PRESUMABLY es_or() KEEPS A REFERENCE TO THIS LIST,
                                # SO LATER ADDITIONS SHOW UP IN THE QUERY - CONFIRM
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN FOR THIS NESTED PATH: ADD A nested CLAUSE
                                # WHOSE inner_hits RETURN THE REQUESTED STORED FIELDS
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # PATH ALREADY HAS A CLAUSE: JUST REQUEST ONE MORE FIELD
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                # SCHEMA HAS NO LEAVES FOR THIS VARIABLE: EMIT A PLACEHOLDER COLUMN
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            # EVERYTHING APPENDED FOR THIS SELECT SHARES ONE put_index
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    # ASSIGN A pull FUNCTION TO EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                # _source IS BEING FETCHED: REQUEST ONLY THAT, AND READ THE VALUE FROM IT
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        # groupby_formatter IS UNPACKED BUT NOT USED HERE
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _normalize(esfilter):
    """
    SIMPLIFY AN ES FILTER: REALLY, WE JUST COLLAPSE CASCADING `and` AND `or` FILTERS

    TODO: DO NOT USE Data, WE ARE SPENDING TOO MUCH TIME WRAPPING/UNWRAPPING

    :param esfilter: FILTER TO SIMPLIFY; ATTRIBUTE ACCESS (esfilter.bool.filter, ETC.)
                     IS USED THROUGHOUT (NOTE(review): PRESUMABLY A WRAPPED Data - CONFIRM)
    :return: MATCH_ALL, MATCH_NONE, OR A SIMPLIFIED FILTER FLAGGED WITH isNormal
    """
    if esfilter == MATCH_ALL or esfilter == MATCH_NONE or esfilter.isNormal:
        return esfilter

    # Log.note("from: " + convert.value2json(esfilter))
    isDiff = True

    # KEEP REWRITING UNTIL A PASS MAKES NO CHANGE
    while isDiff:
        isDiff = False

        if esfilter.bool.filter:
            terms = esfilter.bool.filter
            # product() SNAPSHOTS ITS INPUTS, SO t0/t1 ARE THE ORIGINAL TERMS EVEN
            # AFTER terms[...] IS OVERWRITTEN, AND EVERY PAIR IS SEEN IN BOTH ORDERS
            for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)):
                if i0 == i1:
                    continue  # SAME, IGNORE

                # TERM FILTER ALREADY ASSUMES EXISTENCE
                with suppress_exception:
                    if t0.exists.field != None and t0.exists.field == t1.term.items()[0][0]:
                        terms[i0] = MATCH_ALL
                        continue

                # IDENTICAL CAN BE REMOVED
                # ONLY DROP THE EARLIER OF THE TWO: BOTH (i0, i1) AND (i1, i0) ARE
                # VISITED, SO WITHOUT THE ORDER CHECK BOTH COPIES WOULD BE REPLACED
                # AND THE CONDITION WOULD BE LOST ENTIRELY
                with suppress_exception:
                    if i0 < i1 and t0 == t1:
                        terms[i0] = MATCH_ALL
                        continue

                # MERGE range FILTER WITH SAME FIELD
                if i0 > i1:
                    continue  # EACH UNORDERED PAIR IS MERGED ONCE
                with suppress_exception:
                    f0, tt0 = t0.range.items()[0]
                    f1, tt1 = t1.range.items()[0]
                    if f0 == f1:
                        set_default(terms[i0].range[literal_field(f1)], tt1)
                        terms[i1] = MATCH_ALL

            output = []
            for a in terms:
                if isinstance(a, (list, set)):
                    from mo_logs import Log

                    Log.error("and clause is not allowed a list inside a list")
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_
                if a == MATCH_ALL:
                    # MATCH_ALL ADDS NOTHING TO AN `and`
                    isDiff = True
                    continue
                if a == MATCH_NONE:
                    # ONE MATCH_NONE MAKES THE WHOLE `and` MATCH NOTHING
                    return MATCH_NONE
                if a.bool.filter:
                    # NESTED `and`: PULL ITS TERMS UP INTO THIS ONE
                    isDiff = True
                    a.isNormal = None
                    output.extend(a.bool.filter)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_ALL
            elif len(output) == 1:
                # output[0].isNormal = True
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = es_and(output)
            continue

        if esfilter.bool.should:
            output = []
            for a in esfilter.bool.should:
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_

                if a.bool.should:
                    # NESTED `or`: PULL ITS TERMS UP INTO THIS ONE
                    a.isNormal = None
                    isDiff = True
                    output.extend(a.bool.should)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_NONE
            elif len(output) == 1:
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap({"bool": {"should": output}})
            continue

        if esfilter.term != None:
            if esfilter.term.keys():
                esfilter.isNormal = True
                return esfilter
            else:
                # A term FILTER WITH NO FIELDS CONSTRAINS NOTHING
                return MATCH_ALL

        if esfilter.terms:
            for k, v in esfilter.terms.items():
                if len(v) > 0:
                    if OR(vv == None for vv in v):
                        # A None AMONG THE VALUES BECOMES A SEPARATE "MISSING" TEST
                        rest = [vv for vv in v if vv != None]
                        if len(rest) > 0:
                            output = es_or([
                                es_missing(k),
                                {"terms": {k: rest}}
                            ])
                        else:
                            output = es_missing(k)
                        output.isNormal = True
                        return output
                    else:
                        esfilter.isNormal = True
                        return esfilter
            # NO FIELD HAD ANY VALUES TO MATCH
            return MATCH_NONE

        if esfilter.bool.must_not:
            _sub = esfilter.bool.must_not
            sub = _normalize(_sub)
            if sub == MATCH_NONE:
                return MATCH_ALL
            elif sub == MATCH_ALL:
                return MATCH_NONE
            elif sub is not _sub:
                sub.isNormal = None
                return wrap({"bool": {"must_not": sub, "isNormal": True}})
            else:
                sub.isNormal = None

    esfilter.isNormal = True
    return esfilter
def to_esfilter(self, schema):
    """
    CONVERT THIS OR-EXPRESSION TO AN ES FILTER: EACH OF self.terms IS PARTIALLY
    EVALUATED, CONVERTED WITH THE SAME schema, AND THE RESULTS ARE JOINED BY es_or()

    :param schema: PASSED TO to_esfilter() OF EVERY TERM
    :return: RESULT OF es_or() OVER THE CONVERTED TERMS
    """
    converted = []
    for t in self.terms:
        simplified = t.partial_eval()
        converted.append(simplified.to_esfilter(schema))
    return es_or(converted)
def es_setop(es, query):
    """
    TRANSLATE query.select INTO AN ES REQUEST, SEND IT, AND FORMAT THE HITS

    :param es: PASSED THROUGH TO es_post()
    :param query: READS frum.schema, where, limit, select, sort AND format
    :return: OUTPUT OF THE FORMATTER FOUND IN format_dispatch[query.format], WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    # BECOMES A LIST THE FIRST TIME A COLUMN WITH A DEEPER nested_path IS SELECTED
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema  # SAME VALUE AS ASSIGNED AT THE TOP
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    # INDEX OF THE NEXT OUTPUT COLUMN
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    # NESTED COLUMNS ARE READ FROM _source
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    # NO pull HERE; IT IS ASSIGNED IN THE LOOP OVER new_select BELOW
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            # ONE nested CLAUSE PER NESTED PATH, FOR THIS SELECT ONLY
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                # FIRST DEEPER COLUMN SEEN: REPLACE filters[0] WITH
                                # and(ORIGINAL where, or(nested_filter))
                                # nested_filter IS EXTENDED IN PLACE BELOW
                                # NOTE(review): PRESUMABLY es_or() KEEPS A REFERENCE TO THIS LIST,
                                # SO LATER ADDITIONS SHOW UP IN THE QUERY - CONFIRM
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN FOR THIS NESTED PATH: ADD A nested CLAUSE
                                # WHOSE inner_hits RETURN THE REQUESTED STORED FIELDS
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # PATH ALREADY HAS A CLAUSE: JUST REQUEST ONE MORE FIELD
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                # SCHEMA HAS NO LEAVES FOR THIS VARIABLE: EMIT A PLACEHOLDER COLUMN
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            # EVERYTHING APPENDED FOR THIS SELECT SHARES ONE put_index
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    # ASSIGN A pull FUNCTION TO EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                # _source IS BEING FETCHED: REQUEST ONLY THAT, AND READ THE VALUE FROM IT
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        # groupby_formatter IS UNPACKED BUT NOT USED HERE
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)