def es_fieldop(es, query):
    """
    Run a plain field-select query (no aggregation) against ES 1.4:
    filter, sort, and pull the selected fields via extract_rows().

    :param es: the ES endpoint to post to
    :param query: normalized JSON query expression
    :return: result of extract_rows() on the ES response
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None  # PULL WHOLE _source INSTEAD OF FIELD LIST
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s) and es_query.fields is not None:
            # FIX: added `es_query.fields is not None` guard, consistent with the
            # branches below; previously this could append to None after "*"/"."
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): this overwrites the qb_sort_to_es_sort() result assigned above —
    # the earlier assignment looks redundant; kept in case of side effects
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def test_eq2(self):
    # AN {"eq": {...}} WITH TWO FIELDS MUST SIMPLIFY TO A CONJUNCTION OF term FILTERS
    where = {"eq": {"a": 1, "b": 2}}
    result = simplify_esfilter(jx_expression(where).to_esfilter())
    term_filters = [{"term": {"a": 1}}, {"term": {"b": 2}}]
    if USE_BOOL_MUST:
        expected = {"bool": {"must": term_filters}}
    else:
        expected = {"and": term_filters}
    self.assertEqual(result, expected)
def es_fieldop(es, query):
    """
    Run a plain field-select query (no aggregation) against ES 1.4:
    filter, sort, and pull the selected fields via extract_rows().

    :param es: the ES endpoint to post to
    :param query: normalized JSON query expression
    :return: result of extract_rows() on the ES response
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None  # PULL WHOLE _source INSTEAD OF FIELD LIST
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s) and es_query.fields is not None:
            # FIX: added `es_query.fields is not None` guard, consistent with the
            # branches below; previously this could append to None after "*"/"."
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): this overwrites the qb_sort_to_es_sort() result assigned above —
    # the earlier assignment looks redundant; kept in case of side effects
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def _normalize_where(where, schema=None):
    """
    Normalize a where-clause: a missing clause becomes TRUE_FILTER; with a
    schema, expand terms against it and simplify; otherwise pass through.
    """
    if where == None:
        return TRUE_FILTER
    if schema == None:
        return where
    return simplify_esfilter(_where_terms(where, where, schema))
def append_query(self, es_query, start):
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    # WRAP es_query IN ONE terms AGG PER FIELD (INNERMOST FIRST), PLUS A
    # missing AGG WHEN NULLS ARE ALLOWED, THEN AN OPTIONAL DOMAIN FILTER
    self.start = start
    for field in self.fields:
        terms_agg = {"terms": {"field": field, "size": self.domain.limit}}
        nest = wrap({"aggs": {"_match": set_default(terms_agg, es_query)}})
        if self.edge.allowNulls:
            # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
            nest.aggs._missing = set_default({"missing": {"field": field}}, es_query)
        es_query = nest

    if self.domain.where:
        domain_filter = simplify_esfilter(self.domain.where)
        es_query = {"aggs": {"_filter": set_default({"filter": domain_filter}, es_query)}}
    return es_query
def es_fieldop(es, query):
    """
    Run a plain field-select query (no edges) against ES 0.9 and wrap the
    hits into a Cube, one Matrix per select clause.
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None  # "*" MEANS RETURN THE WHOLE _source
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]
    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # MAPPING SELECT: BUILD ONE DICT PER HIT FROM THE NAMED FIELDS
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name)+[k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([{k: unwrap(t.fields).get(v, None) for k, v in s.value.items()} for t in T])
        elif isinstance(s.value, list):
            # LIST SELECT: ONE TUPLE PER HIT
            matricies[s.name] = Matrix.wrap([tuple(unwrap(t.fields).get(ss, None) for ss in s.value) for t in T])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)
    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def test_range_packing1(self):
    # TWO range CONDITIONS ON THE SAME FIELD MUST PACK INTO ONE range FILTER
    where = {"and": [{"gt": {"a": 20}}, {"lt": {"a": 40}}]}
    result = simplify_esfilter(jx_expression(where).to_esfilter())
    expected = {"range": {"a": {"gt": 20, "lt": 40}}}
    self.assertEqual(result, expected)
def es_setop(es, query):
    """
    Build and run a set-operation query (no aggregation) against ES 1.4,
    delegating row extraction to extract_rows().
    """
    es_query, filters = es14.util.es_query_template(query.frum.name)
    where_filter = simplify_esfilter(query.where.to_esfilter())
    set_default(filters[0], where_filter)
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    return extract_rows(es, es_query, query)
def build_es_query(query):
    """
    Return an ES 0.9 facet-query skeleton; caller fills in the facets.
    """
    skeleton = {
        "query": {"match_all": {}},
        "from": 0,
        "size": 100 if DEBUG else 0,  # NORMALLY ONLY FACET RESULTS ARE WANTED
        "sort": [],
        "facets": {}
    }
    output = wrap(skeleton)
    if DEBUG:
        # TO LIMIT RECORDS TO WHAT'S IN FACETS
        output.query = {
            "filtered": {
                "query": {"match_all": {}},
                "filter": simplify_esfilter(query.where)
            }
        }
    return output
def test_range_packing2(self):
    # gte/lt ON THE SAME FIELD MUST COLLAPSE INTO A SINGLE range FILTER
    lo = 1429747200  # 23 APR 2015
    hi = 1429920000  # 25 APR 2015
    where = {"and": [{"gte": {"build.date": lo}}, {"lt": {"build.date": hi}}]}
    result = simplify_esfilter(jx_expression(where).to_esfilter())
    self.assertEqual(
        result,
        {"range": {"build.date": {"gte": Date("23 APR 2015").unix, "lt": Date("25 APR 2015").unix}}}
    )
def es_aggop(es, mvel, query):
    """
    Aggregate without edges using ES 0.9 "statistical" facets; one facet is
    built per distinct select.value and shared by the selects that use it.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)
    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS
    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                # PLAIN FIELD: LET ES COMPUTE STATS DIRECTLY
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                # EXPRESSION: COMPILE TO A SCRIPT FOR THE FACET
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def append_query(self, es_query, start): self.start = start for i, (k, v) in enumerate(self.fields): es_query = wrap({"aggs": { "_match": set_default({"terms": {"field": v}}, es_query), "_missing": set_default({"missing": {"field": v}}, es_query), }}) if self.edge.domain.where: filter = simplify_esfilter(self.edge.domain.where) es_query = {"aggs": {"_filter": set_default({"filter": filter}, es_query)}} return es_query
def es_aggop(es, mvel, query):
    """
    Aggregate without edges using ES 0.9 "statistical" facets; one facet is
    built per distinct select.value and shared by the selects that use it.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS
    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": simplify_esfilter(query.where.to_esfilter())
                }
            else:
                # NOTE(review): this branch passes query.where RAW while the one
                # above calls .to_esfilter() — looks inconsistent; confirm intent
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]]) for s in select}
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def append_query(self, es_query, start):
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    # BUILD A terms AGG SHELL AROUND es_query PER FIELD; ADD A missing AGG
    # WHEN NULLS ARE ALLOWED, AND FINISH WITH AN OPTIONAL DOMAIN FILTER
    self.start = start
    for field_name in self.fields:
        inner = set_default(
            {"terms": {"field": field_name, "size": self.domain.limit}},
            es_query
        )
        nest = wrap({"aggs": {"_match": inner}})
        if self.edge.allowNulls:
            nest.aggs._missing = set_default({"missing": {"field": field_name}}, es_query)
        es_query = nest

    if self.domain.where:
        wrapped = set_default({"filter": simplify_esfilter(self.domain.where)}, es_query)
        es_query = {"aggs": {"_filter": wrapped}}
    return es_query
def es_fieldop(es, query):
    """
    Run a plain field-select query (no edges) against ES 0.9 and wrap the
    hits into a Cube, one Matrix per select clause.
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None  # "*" MEANS RETURN THE WHOLE _source
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]
    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # MAPPING SELECT: BUILD ONE DICT PER HIT FROM THE NAMED FIELDS
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name)+[k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([{k: unwrap(t.fields).get(v, None) for k, v in s.value.items()} for t in T])
        elif isinstance(s.value, list):
            # LIST SELECT: ONE TUPLE PER HIT
            matricies[s.name] = Matrix.wrap([tuple(unwrap(t.fields).get(ss, None) for ss in s.value) for t in T])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)
    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def es_deepop(es, mvel, query):
    """
    Handle a deep (nested-document) query via an ES 0.9 MVEL terms facet:
    edge combinations are encoded into one scripted term, then unpacked
    into a Cube.
    """
    FromES = es09.util.build_es_query(query)

    select = query.edges
    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)  # ENSURE EVERY SEEN VALUE HAS A PARTITION

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def es_deepop(es, mvel, query):
    """
    Handle a deep (nested-document) query via an ES 0.9 MVEL terms facet:
    edge combinations are encoded into one scripted term, then unpacked
    into a Cube.
    """
    FromES = es09.util.build_es_query(query)

    select = query.edges
    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(query.where)
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR Qb INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)  # ENSURE EVERY SEEN VALUE HAS A PARTITION

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def build_es_query(query):
    """
    Return an ES 0.9 facet-query skeleton; caller fills in the facets.
    """
    size = 100 if DEBUG else 0  # NORMALLY ONLY FACET RESULTS ARE WANTED
    output = wrap({
        "query": {"match_all": {}},
        "from": 0,
        "size": size,
        "sort": [],
        "facets": {}
    })

    if DEBUG:
        # TO LIMIT RECORDS TO WHAT'S IN FACETS
        output.query = {"filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(query.where)
        }}
    return output
def append_query(self, es_query, start):
    # FOR EACH (name, es-field) PAIR, WRAP es_query IN terms AND missing
    # AGGS, INNERMOST FIRST; FINISH WITH THE DOMAIN FILTER, IF ANY
    self.start = start
    for pair in self.fields:
        es_field = pair[1]
        es_query = wrap({"aggs": {
            "_match": set_default({"terms": {"field": es_field}}, es_query),
            "_missing": set_default({"missing": {"field": es_field}}, es_query),
        }})

    if self.edge.domain.where:
        domain_filter = simplify_esfilter(self.edge.domain.where)
        es_query = {"aggs": {"_filter": set_default({"filter": domain_filter}, es_query)}}
    return es_query
def es_aggsop(es, frum, query):
    """
    Build and run an ES 1.4 aggregation query: map each select clause to an
    aggs sub-tree, wrap with edge/groupby decoders, the query filter, and an
    optional nested path, then format the response.

    NOTE: mutates the select clauses in place (s.pull is set to the path used
    to read each result out of the response).
    """
    select = listwrap(query.select)

    es_query = Dict()
    new_select = Dict()
    formula = []
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            s.pull = "doc_count"  # PLAIN COUNT COMES FREE WITH EVERY BUCKET
        elif is_keyword(s.value):
            new_select[literal_field(s.value)] += [s]  # GROUP SELECTS BY FIELD
        else:
            formula.append(s)  # EXPRESSIONS NEED A SCRIPTED AGG

    # NOTE(review): loop variable `litral_field` (sic) is unused; the similar
    # spelling narrowly avoids shadowing the literal_field() function
    for litral_field, many in new_select.items():
        if len(many) > 1:
            # SEVERAL AGGREGATES ON ONE FIELD: SHARE A SINGLE stats AGG
            canonical_name = literal_field(many[0].name)
            es_query.aggs[canonical_name].stats.field = many[0].value
            for s in many:
                if s.aggregate == "count":
                    s.pull = canonical_name + ".count"
                else:
                    s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
        else:
            s = many[0]
            s.pull = literal_field(s.value) + ".value"
            es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value

    for i, s in enumerate(formula):
        new_select[unicode(i)] = s
        s.pull = literal_field(s.name) + ".value"
        es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value)

    # WRAP WITH THE EDGE/GROUPBY DECODERS, OUTERMOST LAST
    decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])]
    start = 0
    for d in decoders:
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if query.where:
        filter = simplify_esfilter(query.where)
        es_query = Dict(aggs={"_filter": set_default({"filter": filter}, es_query)})

    if len(split_field(frum.name)) > 1:
        # DEEP SOURCE: WRAP EVERYTHING IN A nested AGG ON THE SUB-PATH
        es_query = wrap({
            "size": 0,
            "aggs": {"_nested": set_default({"nested": {"path": join_field(split_field(frum.name)[1::])}}, es_query)}
        })

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        # PICK THE FORMATTER FOR THE REQUESTED OUTPUT FORMAT
        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]

        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
def es_terms_stats(esq, mvel, query):
    """
    Execute an edges+aggregates query with ES 0.9 terms_stats facets: one
    facet per (select, known-edge-partition combo), with at most one
    open-ended "special" edge carried by the facet's key_field.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE EDGE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)  # FACET NAME ENCODES select AND EDGE COORDS
            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part
        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p
        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S COORDINATE INTO ITS ORIGINAL POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def buildCondition(mvel, edge, partition):
    """
    RETURN AN ES FILTER OBJECT selecting the documents belonging to the given
    partition of the given edge (facet / range / keyword / scripted cases).
    """
    output = {}

    if edge.domain.isFacet:
        # MUST USE THIS' esFacet
        condition = wrap(coalesce(partition.where, {"and": []}))

        if partition.min and partition.max and is_keyword(edge.value):
            condition["and"].append({
                "range": {edge.value: {"gte": partition.min, "lt": partition.max}}
            })

        # ES WILL FREAK OUT IF WE SEND {"not":{"and":x}} (OR SOMETHING LIKE THAT)
        return simplify_esfilter(condition)
    elif edge.range:
        # THESE REALLY NEED FACETS TO PERFORM THE JOIN-TO-DOMAIN
        # USE MVEL CODE
        if edge.domain.type in domains.ALGEBRAIC:
            output = {"and": []}

            if edge.range.mode and edge.range.mode == "inclusive":
                # IF THE range AND THE partition OVERLAP, THEN MATCH IS MADE
                if is_keyword(edge.range.min):
                    output["and"].append({"range": {edge.range.min: {"lt": es09.expressions.value2value(partition.max)}}})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.min + " < " + es09.expressions.value2MVEL(partition.max)
                    )}})

                if is_keyword(edge.range.max):
                    output["and"].append({"or": [
                        {"missing": {"field": edge.range.max}},
                        # FIX: was a *set* literal {edge.range.max, {...}} — field
                        # name and bounds must form a dict for a valid range filter
                        {"range": {edge.range.max: {"gt": es09.expressions.value2value(partition.min)}}}
                    ]})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.max + " > " + es09.expressions.value2MVEL(partition.min))}})
            else:
                # SNAPSHOT - IF range INCLUDES partition.min, THEN MATCH IS MADE
                if is_keyword(edge.range.min):
                    output["and"].append({"range": {edge.range.min: {"lte": es09.expressions.value2value(partition.min)}}})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.min + "<=" + es09.expressions.value2MVEL(partition.min)
                    )}})

                if is_keyword(edge.range.max):
                    output["and"].append({"or": [
                        {"missing": {"field": edge.range.max}},
                        # FIX: was a *set* literal {edge.range.max, {...}} — field
                        # name and bounds must form a dict for a valid range filter
                        {"range": {edge.range.max: {"gte": es09.expressions.value2value(partition.min)}}}
                    ]})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        es09.expressions.value2MVEL(partition.min) + " <= " + edge.range.max
                    )}})
            return output
        else:
            Log.error("Do not know how to handle range query on non-continuous domain")
    elif not edge.value:
        # MUST USE THIS' esFacet, AND NOT(ALL THOSE ABOVE)
        return partition.esfilter
    elif is_keyword(edge.value):
        # USE FAST ES SYNTAX
        # NOTE(review): attribute-style assignment below implies `output` is a
        # Dict-like wrapper, but it is initialized as a plain {} above — confirm
        if edge.domain.type in domains.ALGEBRAIC:
            output.range = {}
            output.range[edge.value] = {"gte": es09.expressions.value2query(partition.min), "lt": es09.expressions.value2query(partition.max)}
        elif edge.domain.type == "set":
            if partition.value:
                if partition.value != edge.domain.getKey(partition):
                    Log.error("please ensure the key attribute of the domain matches the value attribute of all partitions, if only because we are now using the former")
                # DEFAULT TO USING THE .value ATTRIBUTE, IF ONLY BECAUSE OF LEGACY REASONS
                output.term = {edge.value: partition.value}
            else:
                output.term = {edge.value: edge.domain.getKey(partition)}
        elif edge.domain.type == "default":
            output.term = dict()
            output.term[edge.value] = partition.value
        else:
            Log.error("Edge \"" + edge.name + "\" is not supported")
        return output
    else:
        # USE MVEL CODE
        if edge.domain.type in domains.ALGEBRAIC:
            output.script = {"script": edge.value + ">=" + es09.expressions.value2MVEL(partition.min) + " and " + edge.value + "<" + es09.expressions.value2MVEL(partition.max)}
        else:
            output.script = {"script": "( " + edge.value + " ) ==" + es09.expressions.value2MVEL(partition.value)}

        code = es09.expressions.addFunctions(output.script.script)
        output.script.script = code.head + code.body
        return output
def es_setop(es, mvel, query):
    """
    Execute a set-operation (no aggregation) query against ES 0.9, choosing
    among a count-only query, a simple terms facet, or an MVEL-scripted
    terms facet, and wrap the result into a Cube.
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex and len(select) == 1:
        if not select[0].value:
            # COUNT ONLY: NO FACET NEEDED
            FromES.query = {"filtered": {
                "query": {"match_all": {}},
                "filter": simplify_esfilter(query.where)
            }}
            FromES.size = 1  # PREVENT QUERY CHECKER FROM THROWING ERROR
        elif isKeyword(select[0].value):
            # SINGLE PLAIN FIELD: SIMPLE TERMS FACET
            FromES.facets.mvel = {
                "terms": {
                    "field": select[0].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter(query.where)
            }
            if query.sort:
                s = query.sort
                if len(s) > 1:
                    Log.error("can not sort by more than one field")

                s0 = s[0]
                if s0.field != select[0].value:
                    Log.error("can not sort by anything other than count, or term")

                FromES.facets.terms.order = "term" if s0.sort >= 0 else "reverse_term"
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1:
        if not select[0].value:
            # SPECIAL CASE FOR SINGLE COUNT
            output = Matrix(value=data.hits.total)
            cube = Cube(query.select, [], {select[0].name: output})
        elif isKeyword(select[0].value):
            # SPECIAL CASE FOR SINGLE TERM
            T = data.facets.terms
            output = Matrix.wrap([t.term for t in T])
            cube = Cube(query.select, [], {select[0].name: output})
        else:
            data_list = unpack_terms(data.facets.mvel, select)
            if not data_list:
                cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
            else:
                output = zip(*data_list)
                cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    cube.frum = query
    return cube
def es_terms_stats(esq, mvel, query):
    """
    Execute an edges+aggregates query with ES 0.9 terms_stats facets: one
    facet per (select, known-edge-partition combo), with at most one
    open-ended "special" edge carried by the facet's key_field.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE EDGE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = FlatList()
            constants = FlatList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)  # FACET NAME ENCODES select AND EDGE COORDS
            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = FlatList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part
        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p
        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S COORDINATE INTO ITS ORIGINAL POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) es_column_map = {c.name: unwraplist(c.es_column) for c in frum.schema.all_columns} es_query = Dict() new_select = Dict() #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: #TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_"+literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = 
literal_field(canonical_name + " percentile") if isinstance(s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." 
+ aggregates1_4[s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = s.value.map(es_column_map) if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." + aggregates1_4[s.aggregate] es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum, map_=es_column_map) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter()) es_query = Dict( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( { "nested": { "path": frum.query_path } }, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") for d in decoders[0]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Dict( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> 
if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
def es_setop(es, mvel, query):
    """
    Run a set-style (non-aggregating) jx query against ES using the old
    (es09) facet API.  Simple single-select queries are answered with a
    terms facet or a bare count; anything deep or expression-valued falls
    back to an mvel script facet.
    Returns a Cube of the results.
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex and len(select) == 1:
        if not select[0].value:
            # NO VALUE => JUST COUNT MATCHING DOCUMENTS
            FromES.query = {"filtered": {
                "query": {"match_all": {}},
                "filter": simplify_esfilter(query.where)
            }}
            FromES.size = 1  # PREVENT QUERY CHECKER FROM THROWING ERROR
        elif isKeyword(select[0].value):
            # SIMPLE FIELD => terms FACET
            FromES.facets.mvel = {
                "terms": {
                    "field": select[0].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter(query.where)
            }
            if query.sort:
                s = query.sort
                if len(s) > 1:
                    Log.error("can not sort by more than one field")

                s0 = s[0]
                if s0.field != select[0].value:
                    Log.error("can not sort by anything other than count, or term")

                FromES.facets.terms.order = "term" if s0.sort >= 0 else "reverse_term"
    elif not isDeep:
        # COMPLEX BUT SHALLOW: SCRIPT FACET, MOVING where INTO THE FACET FILTER
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }
    else:
        # DEEP (NESTED) SOURCE: SCRIPT FACET OVER THE FULL QUERY
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1:
        if not select[0].value:
            # SPECIAL CASE FOR SINGLE COUNT
            output = Matrix(value=data.hits.total)
            cube = Cube(query.select, [], {select[0].name: output})
        elif isKeyword(select[0].value):
            # SPECIAL CASE FOR SINGLE TERM
            T = data.facets.terms
            output = Matrix.wrap([t.term for t in T])
            cube = Cube(query.select, [], {select[0].name: output})
        # NOTE(review): a single complex (non-keyword) select falls through here
        # with cube unset and will fail at cube.frum below - confirm intended
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # TRANSPOSE ROWS TO ONE COLUMN (Matrix) PER SELECT
            output = zip(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    cube.frum = query
    return cube
def es_aggsop(es, frum, query):
    """
    Older/simpler aggregation translator: map each jx select to an ES aggs
    node (stats for shared fields, scripted agg for expressions), attach
    edge/groupby decoders and the where filter, run, and format.
    """
    select = listwrap(query.select)

    es_query = Dict()
    new_select = Dict()  # FIELD NAME -> LIST OF SELECTS ON THAT FIELD
    formula = []  # SELECTS WHOSE VALUE IS AN EXPRESSION (NEEDS A SCRIPT)
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            # DOCUMENT COUNT COMES FREE WITH EVERY BUCKET
            s.pull = "doc_count"
        elif is_keyword(s.value):
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    # NOTE: loop variable deliberately (mis)spelled so it does not shadow
    # the literal_field() function used in the body
    for litral_field, many in new_select.items():
        if len(many) > 1:
            # SEVERAL AGGREGATES ON ONE FIELD: ONE stats AGG SERVES THEM ALL
            canonical_name = literal_field(many[0].name)
            es_query.aggs[canonical_name].stats.field = many[0].value
            for s in many:
                if s.aggregate == "count":
                    s.pull = canonical_name + ".count"
                else:
                    s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
        else:
            # SINGLE AGGREGATE: USE THE DEDICATED AGG TYPE
            s = many[0]
            s.pull = literal_field(s.value) + ".value"
            es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value

    for i, s in enumerate(formula):
        new_select[unicode(i)] = s
        s.pull = literal_field(s.name) + ".value"
        es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value)

    decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])]
    start = 0
    for d in decoders:
        # EACH DECODER WRAPS THE QUERY IN ITS OWN BUCKETING AGG
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if query.where:
        filter = simplify_esfilter(query.where)
        es_query = Dict(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )

    if len(split_field(frum.name)) > 1:
        # NESTED SOURCE: WRAP EVERYTHING IN A nested AGG ON THE INNER PATH
        es_query = wrap({
            "size": 0,
            "aggs": {"_nested": set_default({
                "nested": {
                    "path": join_field(split_field(frum.name)[1::])
                }
            }, es_query)}
        })

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.seconds
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
def test_in(self): where = {"in": {"a": [1, 2]}} result = simplify_esfilter(jx_expression(where).to_esfilter()) self.assertEqual(result, {"terms": {"a": [1, 2]}})
def es_setop(es, mvel, query):
    """
    Set-style (non-aggregating) jx query against ES (es14 variant).
    Whole-document and all-variable selects become filtered queries;
    everything else falls back to an mvel script facet.
    Returns Dict(meta={"esquery": ...}, data=cube).
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        # NOTE(review): precedence makes this
        # (len(select) == 1 and not select[0].value) or (select[0].value == "*");
        # a bare "*" select with multiple selects takes this branch too - confirm intended
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            # WHOLE-DOCUMENT SELECT: PULL _source OF ONE HIT
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            # ALL SELECTS ARE SIMPLE FIELDS: ASK ES FOR THOSE FIELDS DIRECTLY
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(query.where.to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        # COMPLEX BUT SHALLOW: SCRIPT FACET, where MOVED INTO THE FACET FILTER
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        # DEEP (NESTED) SOURCE: SCRIPT FACET OVER THE FULL QUERY
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        # NOTE(review): despite the comment, this returns raw _source documents
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # TRANSPOSE ROWS TO ONE Matrix PER SELECT
            output = zip(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Dict(meta={"esquery": FromES}, data=cube)
def es_setop(es, mvel, query): FromES = es09.util.build_es_query(query) select = listwrap(query.select) isDeep = len(split_field(query.frum.name)) > 1 # LOOKING INTO NESTED WILL REQUIRE A SCRIPT isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select]) # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT if not isDeep and not isComplex: if len(select) == 1 and not select[0].value or select[0].value == "*": FromES = wrap({ "query": {"filtered": { "query": {"match_all": {}}, "filter": simplify_esfilter(jx_expression(query.where).to_esfilter()) }}, "sort": query.sort, "size": 1 }) elif all(isinstance(v, Variable) for v in select.value): FromES = wrap({ "query": {"filtered": { "query": {"match_all": {}}, "filter": simplify_esfilter(query.where.to_esfilter()) }}, "fields": select.value, "sort": query.sort, "size": coalesce(query.limit, 200000) }) elif not isDeep: simple_query = query.copy() simple_query.where = TRUE_FILTER # THE FACET FILTER IS FASTER FromES.facets.mvel = { "terms": { "script_field": mvel.code(simple_query), "size": coalesce(simple_query.limit, 200000) }, "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter()) } else: FromES.facets.mvel = { "terms": { "script_field": mvel.code(query), "size": coalesce(query.limit, 200000) }, "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter()) } data = es09.util.post(es, FromES, query.limit) if len(select) == 1 and not select[0].value or select[0].value == "*": # SPECIAL CASE FOR SINGLE COUNT cube = wrap(data).hits.hits._source elif isinstance(select[0].value, Variable): # SPECIAL CASE FOR SINGLE TERM cube = wrap(data).hits.hits.fields else: data_list = unpack_terms(data.facets.mvel, select) if not data_list: cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select}) else: output = zip(*data_list) cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)}) return Data( meta={"esquery": FromES}, data=cube )
def test_eq1(self): where = {"eq": {"a": 20}} result = simplify_esfilter(jx_expression(where).to_esfilter()) self.assertEqual(result, {"term": {"a": 20}})
def es_deepop(es, query):
    """
    Run a jx query over a deeply-nested ES source: build a query template for
    the nested path, split the where clause by nesting depth, expand each
    select into the physical columns it covers, then stream the (possibly
    two) ES responses through the chosen formatter.
    """
    schema = query.frum.schema
    columns = schema.columns
    query_path = schema.query_path
    # MAP ABSTRACT NAMES TO THE PULL PATHS USED BY post_expressions
    map_to_local = {k: get_pull(c[0]) for k, c in schema.lookup.items()}

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        # NO DEEP FILTER: A SECOND QUERY IS NEEDED FOR DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()  # ONE ENTRY PER PHYSICAL COLUMN TO PULL

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.type not in STRUCT and c.es_column != "_id":
                            # ONLY SHALLOW COLUMNS CAN BE PULLED VIA "fields"
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.names[query_path],
                                "pull": get_pull(c),
                                "nested_path": c.nested_path[0],
                                "put": {"name": literal_field(c.names[query_path]), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = set(c.names[query_path] for c in columns)
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.name.lstrip(".")
                            n.put.name = literal_field(n.name)
                            col_names.add(n.name)
                else:
                    # LEAVES OF A SPECIFIC PREFIX: EXPAND TO ALL MATCHING COLUMNS
                    prefix = schema[s.value.term.var][0].names["."] + "."
                    prefix_length = len(prefix)
                    for c in columns:
                        cname = c.names["."]
                        if cname.startswith(prefix) and c.type not in STRUCT:
                            pull = get_pull(c)
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": s.name + "." + cname[prefix_length:],
                                "pull": pull,
                                "nested_path": c.nested_path[0],
                                "put": {
                                    "name": s.name + "." + literal_field(cname[prefix_length:]),
                                    "index": i,
                                    "child": "."
                                }
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # WHOLE DOCUMENT
                for c in columns:
                    if c.type not in STRUCT and c.es_column != "_id":
                        if len(c.nested_path) == 1:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": c.nested_path[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                        i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                # A NAMED FIELD: MAY BE A LEAF OR A PARENT OF MANY es COLUMNS
                prefix = schema[s.value.var][0]
                if not prefix:
                    net_columns = []
                else:
                    parent = prefix.es_column + "."
                    prefix_length = len(parent)
                    net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in STRUCT]

                if not net_columns:
                    # LEAF COLUMN
                    pull = get_pull(prefix)
                    if len(prefix.nested_path) == 1:
                        es_query.fields += [prefix.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": prefix.nested_path[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERNCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)

                        pull = get_pull(n)
                        if len(n.nested_path) == 1:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": n.nested_path[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix_length:]}
                        })
                i += 1
        else:
            # GENERAL EXPRESSION: EVALUATED CLIENT-SIDE VIA post_expressions
            expr = s.value
            for v in expr.vars():
                for c in schema[v]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        # FETCH DOCUMENTS THAT HAVE NO NESTED RECORDS (SEE more_filter ABOVE)
        more.append(es09.util.post(
            es,
            Data(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def es_deepop(es, query): columns = query.frum.get_columns(query.frum.name) query_path = query.frum.query_path columns = UniqueIndex(keys=["name"], data=sorted(columns, lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))), fail_on_dup=False) map_to_es_columns = {c.name: c.es_column for c in columns} map_to_local = { c.name: "_inner" + c.es_column[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.es_column) for c in columns } # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT post_expressions = {} es_query, es_filters = es14.util.es_query_template(query.frum.name) # SPLIT WHERE CLAUSE BY DEPTH wheres = split_expression_by_depth(query.where, query.frum, map_to_es_columns) for i, f in enumerate(es_filters): # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default() for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items(): f[k] = v if not wheres[1]: more_filter = { "and": [ simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()), {"not": { "nested": { "path": query_path, "filter": { "match_all": {} } } }} ] } else: more_filter = None es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT) es_query.sort = jx_sort_to_es_sort(query.sort) es_query.fields = [] is_list = isinstance(query.select, list) new_select = DictList() def get_pull(column): if column.nested_path: return "_inner" + column.es_column[len(listwrap(column.nested_path)[0]):] else: return "fields." 
+ literal_field(column.es_column) i = 0 for s in listwrap(query.select): if isinstance(s.value, LeavesOp): if isinstance(s.value.term, Variable): if s.value.term.var==".": # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS for c in columns: if c.relative and c.type not in ["nested", "object"]: if not c.nested_path: es_query.fields += [c.es_column] new_select.append({ "name": c.name, "pull": get_pull(c), "nested_path": listwrap(c.nested_path)[0], "put": {"name": literal_field(c.name), "index": i, "child": "."} }) i += 1 # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS col_names = [c.name for c in columns if c.relative] for n in new_select: if n.name.startswith("..") and n.name.lstrip(".") not in col_names: n.name = n.put.name = n.name.lstrip(".") else: column = s.term.value.var+"." prefix = len(column) for c in columns: if c.name.startswith(column) and c.type not in ["object", "nested"]: pull = get_pull(c) if len(listwrap(c.nested_path)) == 0: es_query.fields += [c.es_column] new_select.append({ "name": s.name + "." + c.name[prefix:], "pull": pull, "nested_path": listwrap(c.nested_path)[0], "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."} }) i += 1 elif isinstance(s.value, Variable): if s.value.var == ".": for c in columns: if c.relative and c.type not in ["nested", "object"]: if not c.nested_path: es_query.fields += [c.es_column] new_select.append({ "name": c.name, "pull": get_pull(c), "nested_path": listwrap(c.nested_path)[0], "put": {"name": ".", "index": i, "child": c.es_column} }) i += 1 elif s.value.var == "_id": new_select.append({ "name": s.name, "value": s.value.var, "pull": "_id", "put": {"name": s.name, "index": i, "child": "."} }) i += 1 else: column = columns[(s.value.var,)] parent = column.es_column+"." 
prefix = len(parent) net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in ["object", "nested"]] if not net_columns: pull = get_pull(column) if not column.nested_path: es_query.fields += [column.es_column] new_select.append({ "name": s.name, "pull": pull, "nested_path": listwrap(column.nested_path)[0], "put": {"name": s.name, "index": i, "child": "."} }) else: done = set() for n in net_columns: # THE COLUMNS CAN HAVE DUPLICATE REFERNCES TO THE SAME ES_COLUMN if n.es_column in done: continue done.add(n.es_column) pull = get_pull(n) if not n.nested_path: es_query.fields += [n.es_column] new_select.append({ "name": s.name, "pull": pull, "nested_path": listwrap(n.nested_path)[0], "put": {"name": s.name, "index": i, "child": n.es_column[prefix:]} }) i += 1 else: expr = s.value for v in expr.vars(): for n in columns: if n.name == v: if not n.nested_path: es_query.fields += [n.es_column] pull = EXPRESSION_PREFIX + s.name post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python()) new_select.append({ "name": s.name if is_list else ".", "pull": pull, "value": expr.to_dict(), "put": {"name": s.name, "index": i, "child": "."} }) i += 1 # <COMPLICATED> ES needs two calls to get all documents more = [] def get_more(please_stop): more.append(es09.util.post( es, Dict( filter=more_filter, fields=es_query.fields ), query.limit )) if more_filter: need_more = Thread.run("get more", target=get_more) with Timer("call to ES") as call_timer: data = es09.util.post(es, es_query, query.limit) # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED def inners(): for t in data.hits.hits: for i in t.inner_hits[literal_field(query_path)].hits.hits: t._inner = i._source for k, e in post_expressions.items(): t[k] = e(t) yield t if more_filter: Thread.join(need_more) for t in more[0].hits.hits: yield t #</COMPLICATED> try: formatter, groupby_formatter, mime_type = format_dispatch[query.format] output = 
formatter(inners(), new_select, query) output.meta.timing.es = call_timer.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: Log.error("problem formatting", e)
def test_ne2(self): where = {"neq": {"a": 1}} result = simplify_esfilter(jx_expression(where).to_esfilter()) self.assertEqual(result, {"not": {"term": {"a": 1}}})