def format_cube(decoders, aggs, start, query, select):
    # decoders = sorted(decoders, key=lambda d: -d.edge.dim)  # REVERSE DECODER ORDER, BECAUSE ES QUERY WAS BUILT IN REVERSE ORDER
    new_edges = count_dim(aggs, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)
    dims = tuple(dims)

    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                v = s.pull(agg)
                m[coord] = v
            except Exception as e:
                # THIS HAPPENS WHEN ES RETURNS MORE TUPLE COMBINATIONS THAN DOCUMENTS
                if agg.get('doc_count') != 0:
                    Log.error("Programmer error", cause=e)

    cube = Cube(
        query.select,
        sort_using_key(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        {s.name: m for s, m in matricies}
    )
    cube.frum = query
    return cube
def format_cube_from_aggop(decoders, aggs, start, query, select):
    agg = drill(aggs)
    matricies = [(s, Matrix(dims=[], zeros=s.default)) for s in select]
    for s, m in matricies:
        m[tuple()] = s.pull(agg)
    cube = Cube(query.select, [], {s.name: m for s, m in matricies})
    cube.frum = query
    return cube
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es_post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = jx.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = FlatList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])

        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
def format_cube(T, select, query=None):
    with Timer("format table"):
        table = format_table(T, select, query)

        if len(table.data) == 0:
            return Cube(
                scrub_select(select),
                edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": 0, "interval": 1}}],
                data={h: Matrix(list=[]) for i, h in enumerate(table.header)}
            )

        cols = transpose(*unwrap(table.data))
        return Cube(
            scrub_select(select),
            edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(table.data), "interval": 1}}],
            data={h: Matrix(list=cols[i]) for i, h in enumerate(table.header)}
        )
def es_deepop(es, mvel, query):
    FromES = es09.util.build_es_query(query)

    select = query.edges
    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = FlatList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": jx_expression(query.where).to_esfilter()
    }

    data = es_post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = transpose(*rows)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def format_cube(aggs, es_query, query, decoders, all_selects):
    new_edges = count_dim(aggs, es_query, decoders)

    dims = []
    for e in new_edges:
        if is_op(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)
    dims = tuple(dims)

    if any(s.default != canonical_aggregates[s.aggregate].default for s in all_selects):
        # UNUSUAL DEFAULT VALUES MESS THE union() FUNCTION
        is_default = Matrix(dims=dims, zeros=True)
        matricies = {s.name: Matrix(dims=dims) for s in all_selects}
        for row, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                if v == None:
                    continue
                is_default[coord] = False
                union(m, coord, v, select.aggregate)

        # FILL THE DEFAULT VALUES
        for c, v in is_default:
            if v:
                for s in all_selects:
                    matricies[s.name][c] = s.default
    else:
        matricies = {s.name: Matrix(dims=dims, zeros=s.default) for s in all_selects}
        for row, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                union(m, coord, v, select.aggregate)

    cube = Cube(
        query.select,
        sort_using_key(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        matricies
    )
    cube.frum = query
    return cube
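# WHY THE is_default MASK ABOVE MATTERS: WHEN A SELECT'S default IS NOT THE AGGREGATE'S
# IDENTITY (E.G. default 0 FOR A min), PRE-FILLING THE MATRIX WOULD LET THE DEFAULT
# PARTICIPATE IN union() AND CORRUPT THE RESULT. THE SKETCH BELOW IS A MINIMAL,
# STANDALONE VERSION OF THE SAME fill-after-union PATTERN, USING PLAIN DICTS INSTEAD
# OF Matrix; fill_with_defaults IS A HYPOTHETICAL HELPER, NOT PART OF THE CODE ABOVE.
def fill_with_defaults(all_coords, values, default, combine):
    # all_coords: EVERY CELL COORDINATE; values: {coord: value} PULLED FROM THE RESPONSE
    # combine: THE union() OPERATION FOR THIS AGGREGATE (E.G. min)
    out = {}
    touched = set()
    for coord, v in values.items():
        if v is None:
            continue
        touched.add(coord)
        out[coord] = v if coord not in out else combine(out[coord], v)
    # ONLY NOW FILL THE UNTOUCHED CELLS, SO THE DEFAULT NEVER PARTICIPATES IN combine()
    for coord in all_coords:
        if coord not in touched:
            out[coord] = default
    return out

# EXAMPLE: A min AGGREGATE WHOSE DEFAULT IS 0 WOULD OTHERWISE CLOBBER EVERY MINIMUM
cells = [(0,), (1,), (2,)]
print(fill_with_defaults(cells, {(0,): 5, (1,): 3}, default=0, combine=min))
# {(0,): 5, (1,): 3, (2,): 0}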
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        if is_variable_name(s.value):
            FromES.facets[s.name] = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {
                    "exists": {"field": s.value}
                }
            }
        else:
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            FromES.facets[s.name] = {
                "terms": {
                    "script_field": es09.expressions.compile_expression(s.value, query),
                    "size": 200000
                }
            }

    data = es_post(es, FromES, query.limit)

    matricies = {}
    for s in select:
        matricies[s.name] = Matrix(value=data.hits.facets[s.name].total)

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_aggop(es, mvel, query):
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, mvel, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER value
    name2facet = dict()   # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": query.where.to_esfilter()
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": query.where.to_esfilter()
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es_post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def save(self, query):
    query.meta = None
    json = convert.value2json(query)
    hash = unicode2utf8(json)

    # TRY MANY HASHES AT ONCE
    hashes = [None] * HASH_BLOCK_SIZE
    for i in range(HASH_BLOCK_SIZE):
        hash = hashlib.sha1(hash).digest()
        hashes[i] = hash
    short_hashes = [convert.bytes2base64(h[0:6]).replace("/", "_") for h in hashes]
    available = {h: True for h in short_hashes}

    existing = self.es.query({
        "from": "saved_queries",
        "where": {"terms": {"hash": short_hashes}},
        "meta": {"timeout": "2second"}
    })

    for e in Cube(select=existing.select, edges=existing.edges, data=existing.data).values():
        if e.query == json:
            return e.hash
        available[e.hash] = False

    # THIS WILL THROW AN ERROR IF THERE ARE NONE, HOW UNLUCKY!
    best = [h for h in short_hashes if available[h]][0]

    self.queue.add({
        "id": best,
        "value": {
            "hash": best,
            "create_time": Date.now(),
            "last_used": Date.now(),
            "query": json
        }
    })

    Log.note("Saved query as {{hash}}", hash=best)
    return best
def list_aggs(frum, query):
    frum = wrap(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        else:
            pass

    s_accessors = [(ss.name, compile_expression(ss.value.to_python())) for ss in select]

    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s)
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(wrap(query.edges).name) - UNION(e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
def save(self, query): """ SAVE query TO ES FOR LATER RECOVERY :param query: :return: HAS TO USE FOR RECOVERY """ meta, query.meta = query.meta, None json = convert.value2json(query) query.meta = meta hash = json.encode("utf8") # TRY MANY HASHES AT ONCE hashes = [None] * HASH_BLOCK_SIZE for i in range(HASH_BLOCK_SIZE): hash = hashlib.sha1(hash).digest() hashes[i] = hash short_hashes = [convert.bytes2base64(h[0:6]).replace("/", "_") for h in hashes] available = {h: True for h in short_hashes} existing = self.es.query( { "from": "saved_queries", "where": {"terms": {"hash": short_hashes}}, "meta": {"timeout": "2second"}, } ) for e in Cube( select=existing.select, edges=existing.edges, data=existing.data ).values(): if e.query == json: return e.hash available[e.hash] = False # THIS WILL THROW AN ERROR IF THERE ARE NONE, HOW UNLUCKY! best = first(h for h in short_hashes if available[h]) self.queue.add( { "id": best, "value": { "hash": best, "create_time": Date.now(), "last_used": Date.now(), "query": json, }, } ) if meta.testing: while True: verify = self.es.query( { "from": "saved_queries", "where": {"terms": {"hash": best}}, "meta": {"timeout": "2second"}, "select": "query", "format": "list", } ) if verify.data: break Log.alert("wait for saved query") Till(seconds=1).wait() Log.note("Saved {{json}} query as {{hash}}", json=json, hash=best) return best
def es_terms(es, mvel, query):
    """
    RETURN LIST OF ALL EDGE QUERIES

    EVERY FACET IS NAMED <select.name>, <c1>, ... <cN> WHERE <ci> ARE THE ELEMENT COORDINATES
    WE TRY TO PACK DIMENSIONS INTO THE TERMS TO MINIMIZE THE CROSS-PRODUCT EXPLOSION
    """
    if len(query.edges) == 2:
        return _es_terms2(es, mvel, query)

    select = listwrap(query.select)
    FromES = build_es_query(query)
    packed_term = compileEdges2Term(mvel, query.edges, wrap([]))
    for s in select:
        FromES.facets[s.name] = {
            "terms": {
                "field": packed_term.field,
                "script_field": packed_term.expression,
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    term2Parts = packed_term.term2parts

    data = es_post(es, FromES, query.limit)

    # GETTING ALL PARTS WILL EXPAND THE EDGES' DOMAINS
    # BUT HOW TO UNPACK IT FROM THE term FASTER IS UNKNOWN
    for k, f in data.facets.items():
        for t in f.terms:
            term2Parts(t.term)

    # NUMBER ALL EDGES FOR jx INDEXING
    for f, e in enumerate(query.edges):
        e.index = f
        if e.domain.type in ["uid", "default"]:
            # e.domain.partitions = jx.sort(e.domain.partitions, "value")
            for p, part in enumerate(e.domain.partitions):
                part.dataIndex = p
            e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    output = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        for term in facet.terms:
            term_coord = term2Parts(term.term).dataIndex
            for s in select:
                try:
                    output[s.name][term_coord] = term[aggregates[s.aggregate]]
                except Exception as e:
                    # USUALLY CAUSED BY output[s.name] NOT BEING BIG ENOUGH TO HANDLE NULL COUNTS
                    pass

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
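# THE compileEdges2Term()/term2parts() PAIR ABOVE PACKS ONE VALUE PER EDGE INTO A SINGLE
# FACET TERM, SO ONE terms FACET COVERS THE WHOLE CROSS-PRODUCT. THE SKETCH BELOW IS A
# MINIMAL ILLUSTRATION OF THAT PACKING ONLY, ASSUMING STRING-VALUED EDGES AND A PLAIN "|"
# SEPARATOR; THE REAL IMPLEMENTATION COMPILES AN MVEL SCRIPT AND HANDLES ESCAPING.
def pack_term(edge_values, sep="|"):
    # JOIN ONE VALUE PER EDGE INTO A SINGLE FACET TERM
    return sep.join(str(v) for v in edge_values)

def make_term2parts(edge_partitions, sep="|"):
    # edge_partitions: ONE LIST OF KNOWN PARTITION VALUES PER EDGE
    index = [{v: i for i, v in enumerate(parts)} for parts in edge_partitions]

    def term2parts(term):
        # SPLIT THE PACKED TERM BACK INTO ONE COORDINATE PER EDGE
        values = term.split(sep)
        return tuple(index[e][v] for e, v in enumerate(values))

    return term2parts

# EXAMPLE: TWO EDGES, EACH WITH KNOWN PARTITIONS
term2parts = make_term2parts([["linux", "win"], ["debug", "opt"]])
assert term2parts(pack_term(["win", "debug"])) == (1, 0)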
def es_terms_stats(esq, mvel, query):
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: this can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_variable_name(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note(
            "{{theory_count}} theoretical combinations, {{real_count}} actual combos found",
            real_count=len(esFacets),
            theory_count=total_facets
        )

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = FlatList()
            constants = FlatList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_variable_name(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_variable_name(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es_post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = FlatList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if stats not in map:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex,) + pre_coord[special_index:]
            else:
                coord = pre_coord

            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if len(select) == 1 and isinstance(select[0].value, LeavesOp):
            FromES = wrap({
                "query": {"bool": {
                    "query": {"match_all": {}},
                    "filter": query.where.to_esfilter()
                }},
                "sort": query.sort,
                "size": 0
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {"bool": {
                    "query": {"match_all": {}},
                    "filter": query.where.to_esfilter()
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }

    data = es_post(es, FromES, query.limit)

    if len(select) == 1 and isinstance(select[0].value, LeavesOp):
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = transpose(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)
def cube_aggs(frum, query):
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")
        if isinstance(e.domain, DefaultDomain):
            # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
            for fe in frum.edges:
                if fe.name == e.value:
                    e.domain = SimpleSetDomain(**fe.domain.__data__())
                    e.value = e.value + "." + fe.domain.key
                    break
        else:
            for fe in frum.edges:
                if fe.name == e.value:
                    e.value = e.value + "." + fe.domain.key
                    break

    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=s.default
        )
        for s in select
    }

    where = jx_expression_to_function(query.where)
    for d in filter(where, frum.values()):
        coord = []  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            expr = jx_expression_to_function(var)
            val = expr(d)
            if agg == "count":
                if var == "." or var == None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
                    continue

                if val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error("select aggregate {{agg}} is not recognized", agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    acc.add(val)

    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    return Cube(select, query.edges, result)
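# THE ACCUMULATORS PULLED FROM windows.name2accumulator ABOVE (AND IN list_aggs) FOLLOW AN
# add()/end() PROTOCOL: VALUES ARE ACCUMULATED PER CELL, THEN end() COLLAPSES EACH
# ACCUMULATOR TO ITS FINAL VALUE. THE CLASS BELOW IS A HYPOTHETICAL MINIMAL ACCUMULATOR
# WITH THAT PROTOCOL, NOT THE windows IMPLEMENTATION.
class Mean(object):
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def add(self, value):
        # IGNORE NULLS, ACCUMULATE EVERYTHING ELSE
        if value is None:
            return
        self.total += value
        self.count += 1

    def end(self):
        # COLLAPSE THE ACCUMULATOR TO ITS FINAL VALUE FOR THE CELL
        return self.total / self.count if self.count else None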