def test_status(self, log):
    self.stats.action.test_status += 1
    if not log.test:
        Log.error("log has blank 'test' property! Do not know how to handle.")

    self.logs[literal_field(log.test)] += [log]
    test = self.tests[literal_field(log.test)]
    test.stats.action.test_status += 1
    if not test:
        self.tests[literal_field(log.test)] = test = Dict(
            test=log.test,
            start_time=log.time,
            missing_test_start=True
        )
    test.last_log_time = log.time
    test.stats[log.status.lower()] += 1

    if log.subtest:
        test.subtests += [{
            "name": log.subtest,
            "subtest": log.subtest,
            "ok": True if log.expected == None or log.expected == log.status else False,
            "status": log.status.lower(),
            "expected": log.expected.lower(),
            "timestamp": log.time,
            "message": log.message,
            "ordering": len(test.subtests)
        }]
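# THE HANDLERS IN THIS COLLECTION KEY THEIR PER-TEST STATE BY literal_field(log.test)
# SO A TEST PATH CONTAINING DOTS (e.g. "dom/test_a.html") IS TREATED AS ONE LITERAL
# KEY INSTEAD OF A PATH INTO NESTED Dicts.  A MINIMAL SKETCH OF THAT CONTRACT,
# ASSUMING mo-dots-STYLE DOT-ESCAPING (THE REAL literal_field MAY DIFFER IN DETAIL):
def _literal_field_sketch(field):
    # ESCAPE DOTS SO THE WHOLE NAME IS ONE KEY, NOT A NESTED PATH
    return field.replace(".", "\\.")

assert _literal_field_sketch("dom/test_a.html") == "dom/test_a\\.html"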
def leaves(self, prefix=None):
    """
    LIKE items() BUT RECURSIVE, AND ONLY FOR THE LEAVES (non dict) VALUES
    """
    prefix = coalesce(prefix, "")
    output = []
    for k, v in self.items():
        if isinstance(v, Mapping):
            output.extend(wrap(v).leaves(prefix=prefix + literal_field(k) + "."))
        else:
            output.append((prefix + literal_field(k), v))
    return output
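# A SELF-CONTAINED SKETCH OF THE leaves() CONTRACT OVER PLAIN DICTS, WITH
# STAND-INS FOR wrap/coalesce/literal_field (NAMES BELOW ARE ILLUSTRATIVE,
# NOT THE pyLibrary IMPLEMENTATION):
def _leaves_sketch(value, prefix=""):
    output = []
    for k, v in value.items():
        key = k.replace(".", "\\.")  # ASSUMED literal_field BEHAVIOR
        if isinstance(v, dict):
            output.extend(_leaves_sketch(v, prefix + key + "."))
        else:
            output.append((prefix + key, v))
    return output

assert sorted(_leaves_sketch({"a": {"b.c": 1}, "d": 2})) == [("a.b\\.c", 1), ("d", 2)]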
def mainthread_transform(r):
    if r == None:
        return None

    output = Dict()

    for i in r.mainthread_readbytes:
        output[literal_field(i[1])].name = i[1]
        output[literal_field(i[1])].readbytes = i[0]
    r.mainthread_readbytes = None

    for i in r.mainthread_writebytes:
        output[literal_field(i[1])].name = i[1]
        output[literal_field(i[1])].writebytes = i[0]
    r.mainthread_writebytes = None

    for i in r.mainthread_readcount:
        output[literal_field(i[1])].name = i[1]
        output[literal_field(i[1])].readcount = i[0]
    r.mainthread_readcount = None

    for i in r.mainthread_writecount:
        output[literal_field(i[1])].name = i[1]
        output[literal_field(i[1])].writecount = i[0]
    r.mainthread_writecount = None

    r.mainthread = output.values()
def _index_values(self, doc, start_index, parent_index=-1, prefix=""): curr_index = doc[INDEX] = start_index doc[PARENT] = parent_index _index = self._index for k, v in doc.items(): k = literal_field(k) _type = _type_map[v.__class__] if _type == "object": self._index_values(v, start_index, prefix=k + ".") v = "." elif _type == "nested": for vv in v: curr_index = self._index_values(vv, curr_index + 1, start_index, prefix=k + ".") _type = "object" v = "." typed_key = k + ".$" + _type i = _index.get(typed_key) if i is None: i = _index[typed_key] = {} j = i.get(v) if j is None: j = i[v] = set() j |= {start_index} return curr_index
def format_list(T, select, source):
    data = []
    for row in T:
        r = Dict()
        for s in select:
            if s.value == ".":
                r[s.name] = row[source]
            else:
                if source == "_source":
                    r[s.name] = unwraplist(row[source][s.value])
                elif isinstance(s.value, basestring):  # fields
                    r[s.name] = unwraplist(row[source][literal_field(s.value)])
                else:
                    r[s.name] = unwraplist(row[source][literal_field(s.name)])
        data.append(r)
    return Dict(
        meta={"format": "list"},
        data=data
    )
def es_setop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"

    for s in select:
        if s.value == "*":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}

    return extract_rows(es, es_query, source, select, query)
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    try:
        if test == None and expected == None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = dot.get_attr(test, literal_field(k))
                else:
                    show_detail = False
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, set) and isinstance(expected, set):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            for a, b in zipall(test, expected):
                assertAlmostEqual(a, b, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception, e:
        Log.error(
            "{{test|json}} does not match expected {{expected|json}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )
def test_end(self, log):
    self.logs[literal_field(log.test)] += [log]
    test = self.tests[literal_field(log.test)]
    if not test:
        self.tests[literal_field(log.test)] = test = Dict(
            test=log.test,
            start_time=log.time,
            missing_test_start=True
        )

    test.ok = True if log.expected == None or log.expected == log.status else False
    if not all(test.subtests.ok):
        test.ok = False
    test.result = log.status  #TODO: REMOVE ME AFTER November 2015
    test.status = log.status
    test.expected = coalesce(log.expected, log.status)
    test.end_time = log.time
    test.duration = coalesce(test.end_time - test.start_time, log.extra.runtime)
    test.extra = log.extra  # WAS A NO-OP self-ASSIGNMENT; CARRY THE LOG'S EXTRA INFO
def leaves(value, prefix=None):
    """
    LIKE items() BUT RECURSIVE, AND ONLY FOR THE LEAVES (non dict) VALUES
    :param value: THE Mapping TO TRAVERSE
    :param prefix: OPTIONAL PREFIX GIVEN TO EACH KEY
    :return: Dict, WITH EACH KEY BEING A PATH INTO THE value TREE
    """
    prefix = coalesce(prefix, "")
    output = []
    for k, v in value.items():
        try:
            if isinstance(v, Mapping):
                output.extend(leaves(v, prefix=prefix + literal_field(k) + "."))
            else:
                output.append((prefix + literal_field(k), unwrap(v)))
        except Exception, e:
            from pyLibrary.debugs.logs import Log
            Log.error("Do not know how to handle", cause=e)
    return output
def log(self, log):
    self.stats.action.log += 1
    if not log.test:
        return

    self.logs[literal_field(log.test)] += [log]
    test = self.tests[literal_field(log.test)]
    test.stats.action.log += 1
    if not test:
        self.tests[literal_field(log.test)] = test = wrap({
            "test": log.test,
            "start_time": log.time,
            "missing_test_start": True,
        })
    test.last_log_time = log.time
    test.stats.action.log += 1
def crash(self, log):
    self.stats.action.crash += 1
    if not log.test:
        return

    self.logs[literal_field(log.test)] += [log]
    test = self.tests[literal_field(log.test)]
    if not test:
        self.tests[literal_field(log.test)] = test = Dict(
            test=log.test,
            start_time=log.time,
            crash=True,
            missing_test_start=True
        )

    test.ok = False
    test.result = log.status  #TODO: REMOVE ME AFTER November 2015
    test.status = log.status
    test.last_log_time = log.time
    test.missing_test_end = True
def _merge_mapping(a, b):
    """
    MERGE TWO MAPPINGS, a TAKES PRECEDENCE
    """
    for name, b_details in b.items():
        a_details = a[literal_field(name)]
        if a_details.properties and not a_details.type:
            a_details.type = "object"
        if b_details.properties and not b_details.type:
            b_details.type = "object"

        if a_details:
            a_details.type = _merge_type[a_details.type][b_details.type]

            if b_details.type in ["object", "nested"]:
                _merge_mapping(a_details.properties, b_details.properties)
        else:
            a[literal_field(name)] = deepcopy(b_details)

    return a
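# WORKED EXAMPLE OF THE MERGE ABOVE, USING PLAIN-DICT STAND-INS AND ASSUMING
# THE (UNSHOWN) _merge_type TABLE PROMOTES long + double -> double:
#
#   a = {"size": {"type": "long"}}
#   b = {"size": {"type": "double"},
#        "author": {"properties": {"name": {"type": "string"}}}}
#
# AFTER _merge_mapping(a, b):
#   a["size"]["type"] == "double"                           # RECONCILED BY _merge_type
#   a["author"]["type"] == "object"                         # INFERRED FROM properties
#   a["author"]["properties"]["name"]["type"] == "string"   # DEEP-COPIED FROM b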
def inners():
    for t in data.hits.hits:
        for i in t.inner_hits[literal_field(query_path)].hits.hits:
            t._inner = i._source
            for k, e in post_expressions.items():
                t[k] = e(t)
            yield t
    if more_filter:
        Thread.join(need_more)
        for t in more[0].hits.hits:
            yield t
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    try:
        if test == None and expected == None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = dot.get_attr(test, literal_field(k))
                else:
                    show_detail = False
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, (set, list)) and isinstance(expected, set):
            test = set(test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting:\n{{expected|json|indent}}",
                    test=test,
                    expected=expected
                )
            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception, _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)
        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            for a, b in zipall(test, expected):
                assertAlmostEqual(a, b, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception, e:
        Log.error(
            "{{test|json}} does not match expected {{expected|json}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )
def extend(self, documents):
    parts = Dict()
    for d in wrap(documents):
        parent_key = etl2key(key2etl(d.id).source)
        d.value._id = d.id
        parts[literal_field(parent_key)] += [d.value]

    for k, docs in parts.items():
        self._extend(k, docs)
    return parts.keys()
def format_table(T, select, source):
    header = [s.name for s in select]
    map = {s.name: i for i, s in enumerate(select)}  # MAP FROM name TO COLUMN INDEX
    data = []
    for row in T:
        r = [None] * len(header)
        for s in select:
            if s.value == ".":
                r[map[s.name]] = row[source]
            else:
                if source == "_source":
                    r[map[s.name]] = unwraplist(row[source][s.value])
                elif isinstance(s.value, basestring):  # fields
                    r[map[s.name]] = unwraplist(row[source][literal_field(s.value)])
                else:
                    r[map[s.name]] = unwraplist(row[source][literal_field(s.name)])
        data.append(r)
    return Dict(
        meta={"format": "table"},
        header=header,
        data=data
    )
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": jx_expression(command.where).to_esfilter()
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if results.hits.hits:
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({"update": {
                    "_id": h._id,
                    "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
                }})
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode("utf-8")
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"}
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)]
            )
def add_to_queue(work_queue, redo, bucket_name):
    now = Date.now()
    for r in redo:
        k = literal_field(r)
        counter[k] += 1
        if counter[k] > 3:
            Log.error("Problem backfilling {{key}}: Tried >=3 times, giving up", key=r)
            continue

        work_queue.add({
            "bucket": bucket_name,
            "key": r,
            "timestamp": now.unix,
            "date/time": now.format()
        })
def __init__(self, *args, **kwargs):
    """
    CALLING Dict(**something) WILL RESULT IN A COPY OF something, WHICH
    IS UNLIKELY TO BE USEFUL.  USE wrap() INSTEAD
    """
    if DEBUG:
        d = _get(self, "_dict")
        for k, v in kwargs.items():
            d[literal_field(k)] = unwrap(v)
    else:
        if args:
            args0 = args[0]
            if isinstance(args0, Mapping):
                _set(self, "_dict", args0)
            else:
                _set(self, "_dict", _get(args0, "__dict__"))
        elif kwargs:
            _set(self, "_dict", unwrap(kwargs))
        else:
            _set(self, "_dict", {})
def es_aggop(es, mvel, query):
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()   # ONLY ONE FACET NEEDED PER
    name2facet = dict()    # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def __setitem__(self, key, value):
    if key == "":
        from pyLibrary.debugs.logs import Log
        Log.error("key is empty string.  Probably a bad idea")
    if key == None:
        return Null
    if key == ".":
        # SOMETHING TERRIBLE HAPPENS WHEN value IS NOT A Mapping;
        # HOPEFULLY THE ONLY OTHER METHOD RUN ON self IS unwrap()
        v = unwrap(value)
        _set(self, "_dict", v)
        return v
    if isinstance(key, str):
        key = key.decode("utf8")

    try:
        d = _get(self, "_dict")
        value = unwrap(value)
        if key.find(".") == -1:
            if value is None:
                d.pop(key, None)
            else:
                d[key] = value
            return self

        seq = _split_field(key)
        for k in seq[:-1]:
            d = _getdefault(d, k)
        if value == None:
            d.pop(seq[-1], None)
        elif d == None:
            d[literal_field(seq[-1])] = value
        else:
            d[seq[-1]] = value
        return self
    except Exception, e:
        raise e
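# A MINIMAL STAND-ALONE SKETCH OF THE PATH-vs-LITERAL CONTRACT __setitem__
# IMPLEMENTS (NOT THE pyLibrary CODE): UNESCAPED DOTS SPLIT INTO NESTED DICTS,
# WHILE "\\." KEEPS A LITERAL DOT INSIDE ONE KEY:
import re

def _set_path_sketch(d, key, value):
    parts = [p.replace("\\.", ".") for p in re.split(r"(?<!\\)\.", key)]
    for p in parts[:-1]:
        d = d.setdefault(p, {})
    d[parts[-1]] = value

doc = {}
_set_path_sketch(doc, "a.b", 1)    # PATH: CREATES NESTED DICTS
_set_path_sketch(doc, "x\\.y", 2)  # LITERAL: ONE KEY CONTAINING A DOT
assert doc == {"a": {"b": 1}, "x.y": 2}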
def es_aggop(es, mvel, query):
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()   # ONLY ONE FACET NEEDED PER
    name2facet = dict()    # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": simplify_esfilter(query.where.to_esfilter())
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def es_terms_stats(esq, mvel, query):
    select = listwrap(query.select)
    facetEdges = []          # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note(
            "{{theory_count}} theoretical combinations, {{real_count}} actual combos found",
            real_count=len(esFacets),
            theory_count=total_facets
        )

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)
            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part
        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p
        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_aggsop(es, frum, query):
    select = listwrap(query.select)

    es_query = Dict()
    new_select = Dict()
    formula = []
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            s.pull = "doc_count"
        elif is_keyword(s.value):
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    # LOOP VARIABLE RENAMED FROM MISSPELLED "litral_field" (SPELLING IT
    # CORRECTLY WOULD SHADOW THE literal_field FUNCTION USED BELOW)
    for key, many in new_select.items():
        if len(many) > 1:
            canonical_name = literal_field(many[0].name)
            es_query.aggs[canonical_name].stats.field = many[0].value
            for s in many:
                if s.aggregate == "count":
                    s.pull = canonical_name + ".count"
                else:
                    s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
        else:
            s = many[0]
            s.pull = literal_field(s.value) + ".value"
            es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value

    for i, s in enumerate(formula):
        new_select[unicode(i)] = s
        s.pull = literal_field(s.name) + ".value"
        es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value)

    decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])]
    start = 0
    for d in decoders:
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if query.where:
        filter = simplify_esfilter(query.where)
        es_query = Dict(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )

    if len(split_field(frum.name)) > 1:
        es_query = wrap({
            "size": 0,
            "aggs": {"_nested": set_default({
                "nested": {
                    "path": join_field(split_field(frum.name)[1::])
                }
            }, es_query)}
        })

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]

        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.seconds
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    column_names = set(
        c.name
        for c in query.frum.get_columns()
        if c.type not in ["object"] and (not c.nested_path or c.abs_name == c.nested_path)
    )
    source = "fields"
    i = 0

    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            source = "_source"

            net_columns = column_names - set(select.name)
            for n in net_columns:
                new_select.append({
                    "name": n,
                    "value": n,
                    "put": {"name": n, "index": i, "child": "."}
                })
                i += 1
        elif s.value == ".":
            es_query.fields = None
            source = "_source"

            new_select.append({
                "name": s.name,
                "value": s.value,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif s.value == "_id":
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({
                        "name": s.name + "." + c[prefix:],
                        "value": c,
                        "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                if es_query.fields is not None:
                    es_query.fields.append(s.value)

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)

                    new_select.append({
                        "name": s.name,
                        "value": n,
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression(s.value).to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def process_output(self, log):
    self.logs[literal_field(log.test)] += [log]
    self.stats.action.process_output += 1
def __init__(self, edge, query):
    self.start = None
    self.edge = edge
    self.name = literal_field(self.edge.name)
def es_aggsop(es, frum, query):
    select = wrap([s.copy() for s in listwrap(query.select)])
    es_column_map = {c.name: unwraplist(c.es_column) for c in frum.schema.all_columns}

    es_query = Dict()
    new_select = Dict()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            s.pull = "doc_count"
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                if frum.typed:
                    # STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO: HANDLE BOTH $value AND $objects TO COUNT
                        Log.error("do not know how to handle")
                    else:
                        s.value.var = "$value"
                        new_select["$value"] += [s]
                else:
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                        Log.error("do not know how to handle")
                    else:
                        Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate)
            elif s.aggregate == "count":
                s.value = s.value.map(es_column_map)
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                s.value = s.value.map(es_column_map)
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value.var == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value.var

        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, basestring) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            elif s.aggregate == "stats":
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = field_name

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + " percentile")
                es_query.aggs[median_name].percentiles.field = field_name
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = {
                    "count": stats_name + ".count",
                    "sum": stats_name + ".sum",
                    "min": stats_name + ".min",
                    "max": stats_name + ".max",
                    "avg": stats_name + ".avg",
                    "sos": stats_name + ".sum_of_squares",
                    "std": stats_name + ".std_deviation",
                    "var": stats_name + ".variance",
                    "median": median_name + ".values.50\.0"
                }
            elif s.aggregate == "union":
                # USE TERMS AGGREGATE TO SIMULATE union
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].terms.field = field_name
                es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT)
                s.pull = stats_name + ".buckets.key"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        abs_value = s.value.map(es_column_map)

        if s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby()

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = abs_value.to_ruby()
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = {
                "count": stats_name + ".count",
                "sum": stats_name + ".sum",
                "min": stats_name + ".min",
                "max": stats_name + ".max",
                "avg": stats_name + ".avg",
                "sos": stats_name + ".sum_of_squares",
                "std": stats_name + ".std_deviation",
                "var": stats_name + ".variance",
                "median": median_name + ".values.50\.0"
            }
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby()
            s.pull = stats_name + ".buckets.key"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = query.where.vars()

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum, map_=es_column_map)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter())
            es_query = Dict(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {
                    "nested": {
                        "path": frum.query_path
                    }
                },
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

        for d in decoders[0]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[0]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
            es_query = Dict(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
def es_deepop(es, query):
    columns = query.frum.get_columns(query.frum.name)
    query_path = query.frum.query_path
    columns = UniqueIndex(
        keys=["name"],
        data=sorted(columns, lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))),
        fail_on_dup=False
    )
    map_to_es_columns = {c.name: c.es_column for c in columns}
    map_to_local = {
        c.name: "_inner" + c.es_column[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.es_column)
        for c in columns
    }
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, query.frum, map_to_es_columns)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.es_column[len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.es_column)

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.relative and c.type not in ["nested", "object"]:
                            if not c.nested_path:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.name,
                                "pull": get_pull(c),
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": literal_field(c.name), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = [c.name for c in columns if c.relative]
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.put.name = n.name.lstrip(".")
                else:
                    column = s.value.term.var + "."  # WAS s.term.value.var, A TRANSPOSITION
                    prefix = len(column)
                    for c in columns:
                        if c.name.startswith(column) and c.type not in ["object", "nested"]:
                            pull = get_pull(c)
                            if len(listwrap(c.nested_path)) == 0:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": s.name + "." + c.name[prefix:],
                                "pull": pull,
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                for c in columns:
                    if c.relative and c.type not in ["nested", "object"]:
                        if not c.nested_path:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": listwrap(c.nested_path)[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                        i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                column = columns[(s.value.var,)]
                parent = column.es_column + "."
                prefix = len(parent)
                net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in ["object", "nested"]]
                if not net_columns:
                    pull = get_pull(column)
                    if not column.nested_path:
                        es_query.fields += [column.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(column.nested_path)[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERENCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)

                        pull = get_pull(n)
                        if not n.nested_path:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": listwrap(n.nested_path)[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix:]}
                        })
                i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.es_column]

            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in ["object", "nested"]:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
            return
        if c.table == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)

        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception, e:
        if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"table": c.table, "es_column": c.es_column}}
            })
            Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
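# THE "TOO MANY PARTS TO ENUMERATE" HEURISTIC FROM _update_cardinality,
# EXTRACTED AS A PURE FUNCTION FOR ILLUSTRATION (THRESHOLDS AS ABOVE;
# FLOAT DIVISION ASSUMED FOR THE RATIO TEST):
def _should_skip_partitions_sketch(count, cardinality):
    return (
        cardinality > 1000 or
        (count >= 30 and cardinality == count) or
        (count >= 1000 and cardinality / float(count) > 0.99)
    )

assert _should_skip_partitions_sketch(30, 30)        # EVERY VALUE UNIQUE
assert not _should_skip_partitions_sketch(1000, 20)  # FEW PARTS: WORTH LISTING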
def get_pull(column):
    if column.nested_path:
        return "_inner" + column.es_column[len(listwrap(column.nested_path)[0]):]
    else:
        return "fields." + literal_field(column.es_column)

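# Hedged example of get_pull behavior; the return values below are the expected
# shapes (worked out from the string arithmetic above), not captured from a run.
#   nested column: es_column "a.b.c" under nested_path ["a"]
#     get_pull(...) -> "_inner" + "a.b.c"[len("a"):] == "_inner.b.c"
#   flat column: es_column "status"
#     get_pull(...) -> "fields.status" (assuming literal_field leaves dot-free names unchanged)
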
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        canonical = select = Dict(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    canonical.default = coalesce(select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(frum, "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if not select.value or select.value == ".":
        output.extend([
            set_default({"name": c.name, "value": jx_expression(c.name)}, canonical)
            for c in frum.get_leaves()
        ])
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            base_name = select.value[:-2]
            canonical.name = coalesce(select.name, base_name, select.aggregate)
            value = jx_expression(select.value[:-2])
            if not isinstance(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "name": base_name,
                            "value": LeavesOp("leaves", value),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical
                    )
                    for c in frum.get_columns()
                    if c.type not in ["object", "nested"]
                ])
            else:
                output.extend([
                    set_default(
                        {
                            "name": base_name + "." + literal_field(c.name[len(base_name) + 1:]),
                            "value": jx_expression(c.name)
                        },
                        canonical
                    )
                    for c in frum.get_leaves()
                    if c.name.startswith(base_name + ".")
                ])
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output

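# Illustrative summary of the three select shapes _normalize_select accepts,
# inferred from the branches above (not from the original source):
#   "a"                      -> one column: {"name": "a", "value": jx_expression("a"), "aggregate": "none", ...}
#   {"value": "a.*"}         -> one column per leaf of frum whose name starts with "a."
#   {"value": "."} or blank  -> one column per leaf of frum
# In every case the result is wrapped, and each output column must carry a name.
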
def __init__(self, edge, query, limit):
    self.start = None
    self.edge = edge
    self.name = literal_field(self.edge.name)
    self.query = query
    self.limit = limit

def _normalize(esfilter):
    """
    TODO: DO NOT USE Dicts, WE ARE SPENDING TOO MUCH TIME WRAPPING/UNWRAPPING
    REALLY, WE JUST COLLAPSE CASCADING `and` AND `or` FILTERS
    """
    if esfilter is TRUE_FILTER or esfilter is FALSE_FILTER or esfilter.isNormal:
        return esfilter

    # Log.note("from: " + convert.value2json(esfilter))

    isDiff = True
    while isDiff:
        isDiff = False

        if esfilter["and"] != None:
            terms = esfilter["and"]

            # MERGE range FILTER WITH SAME FIELD
            for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)):
                if i0 >= i1:
                    continue  # SAME, IGNORE
                try:
                    f0, tt0 = t0.range.items()[0]
                    f1, tt1 = t1.range.items()[0]
                    if f0 == f1:
                        set_default(terms[i0].range[literal_field(f1)], tt1)
                        terms[i1] = True
                except Exception, e:
                    pass

            output = []
            for a in terms:
                if isinstance(a, (list, set)):
                    from pyLibrary.debugs.logs import Log
                    Log.error("and clause is not allowed a list inside a list")
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_
                if a == TRUE_FILTER:
                    isDiff = True
                    continue
                if a == FALSE_FILTER:
                    return FALSE_FILTER
                if a.get("and"):
                    isDiff = True
                    a.isNormal = None
                    output.extend(a.get("and"))
                else:
                    a.isNormal = None
                    output.append(a)

            if not output:
                return TRUE_FILTER
            elif len(output) == 1:
                # output[0].isNormal = True
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap({"and": output})
            continue

        if esfilter["or"] != None:
            output = []
            for a in esfilter["or"]:
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                a = a_

                if a == TRUE_FILTER:
                    return TRUE_FILTER
                if a == FALSE_FILTER:
                    isDiff = True
                    continue
                if a.get("or"):
                    a.isNormal = None
                    isDiff = True
                    output.extend(a["or"])
                else:
                    a.isNormal = None
                    output.append(a)

            if not output:
                return FALSE_FILTER
            elif len(output) == 1:
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap({"or": output})
            continue

        if esfilter.term != None:
            if esfilter.term.keys():
                esfilter.isNormal = True
                return esfilter
            else:
                return TRUE_FILTER

        if esfilter.terms != None:
            for k, v in esfilter.terms.items():
                if len(v) > 0:
                    if OR(vv == None for vv in v):
                        rest = [vv for vv in v if vv != None]
                        if len(rest) > 0:
                            return {
                                "or": [
                                    {"missing": {"field": k}},
                                    {"terms": {k: rest}}
                                ],
                                "isNormal": True
                            }
                        else:
                            return {
                                "missing": {"field": k},
                                "isNormal": True
                            }
                    else:
                        esfilter.isNormal = True
                        return esfilter
            return FALSE_FILTER

        if esfilter["not"] != None:
            _sub = esfilter["not"]
            sub = _normalize(_sub)
            if sub is FALSE_FILTER:
                return TRUE_FILTER
            elif sub is TRUE_FILTER:
                return FALSE_FILTER
            elif sub is not _sub:
                sub.isNormal = None
                return wrap({"not": sub, "isNormal": True})
            else:
                sub.isNormal = None

    esfilter.isNormal = True
    return esfilter

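# A hedged sketch of the collapse _normalize performs; the results are the
# expected shapes, assuming TRUE_FILTER/FALSE_FILTER behave as the sentinels above.
#   {"and": [{"and": [{"term": {"a": 1}}]}, TRUE_FILTER, {"term": {"b": 2}}]}
#     -> {"and": [{"term": {"a": 1}}, {"term": {"b": 2}}]}   # nested "and" flattened, TRUE dropped
#   {"or": [FALSE_FILTER]}
#     -> FALSE_FILTER                                        # nothing survives the "or"
#   {"terms": {"k": [1, None]}}
#     -> {"or": [{"missing": {"field": "k"}}, {"terms": {"k": [1]}}]}
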
def test_start(self, log):
    if isinstance(log.test, list):
        log.test = " ".join(log.test)
    self.tests[literal_field(log.test)] = Dict(
        test=log.test,
        start_time=log.time
    )
    self.last_subtest = log.time

def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    leaf_columns = set(
        c.name
        for c in columns
        if c.type not in STRUCT and (not c.nested_path or c.es_column == c.nested_path)
    )
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            term = s.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": Variable(n),
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    parent = term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)
                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": Variable(c),
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": Variable(n),
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value.var))
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)

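# Note on the two retrieval modes extract_rows sets up (inferred from the code
# above): while source == "fields", values are read from each hit's "fields"
# section, so every pull is "fields." + literal_field(name); once any select
# forces source to "_source", the whole document comes back and every remaining
# pull becomes a "_source.<path>" join instead.
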
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in STRUCT:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[c.es_column]
                    for g, _ in jx.groupby(self.meta.columns, c.es_column)
                    if g[c.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
            return
        if c.table == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[c.es_column]
                    for g, _ in jx.groupby(self.meta.tables, c.es_column)
                    if g[c.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)

        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception, e:
        if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions"
                ],
                "where": {"eq": {"table": c.table, "es_column": c.es_column}}
            })
            Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)

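# Shape of the partition queries _update_cardinality issues, written out for
# reference; <name>, <es_column> and <path> are placeholders, not real fields.
#   plain column:  {"aggs": {<name>: {"terms": {"field": <es_column>, "size": 0}}}, "size": 0}
#   nested column: {"aggs": {<name>: {"nested": {"path": <path>},
#                                     "aggs": {"_nested": {"terms": {"field": <es_column>, "size": 0}}}}},
#                   "size": 0}
# The "_nested" sub-aggregation is why the nested result is read from
# aggs._nested.buckets rather than aggs.buckets.
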
def get_pull(column):
    if len(column.nested_path) != 1:
        return "_inner" + column.es_column[len(column.nested_path[0]):]
    else:
        return "fields." + literal_field(column.es_column)

def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    leaf_columns = set(
        c.name
        for c in columns
        if c.type not in ["object", "nested"] and (not c.nested_path or c.es_column == c.nested_path)
    )
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": n,
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    parent = s.value.term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)
                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": c,
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value.var,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": n,
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)

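# Illustrative record from new_select (inferred from the branches above): "pull"
# is the dot-path into each ES hit, and "put" tells the formatter where the value
# lands in the output row. For a select on "a" whose leaves include "a.b":
#   {"name": "a", "value": "a.b", "pull": "fields.a.b",
#    "put": {"name": "a", "index": 0, "child": "b"}}
# (the exact escaping of dots in "pull" depends on literal_field)
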