def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum
    output = DictList()
    for e in coalesce(query.edges, query.groupby, []):
        if e.value:
            e = e.copy()
            e.value = qb_expression(e.value)
            vars_ = e.value.vars()

            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)

            e.value = e.value.map({schema[v].name: schema[v].abs_name for v in vars_})
        else:
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].abs_name for v in vars_]

        depths = set(len(listwrap(schema[v].nested_path)) for v in vars_)
        if len(depths) > 1:
            Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)

        depth = list(depths)[0]
        while len(output) <= depth:
            output.append([])
        output[depth].append(AggsDecoder(e, query))
    return output
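# A minimal, self-contained sketch (illustration only, not part of the original
# module) of the depth-bucketing idiom used above: grow a list-of-lists on
# demand so output[depth] collects every item found at that depth. Plain lists
# stand in for DictList and AggsDecoder.

def bucket_by_depth(items, depth_of):
    output = []
    for item in items:
        depth = depth_of(item)          # e.g. len(listwrap(column.nested_path))
        while len(output) <= depth:     # grow the outer list until the slot exists
            output.append([])
        output[depth].append(item)
    return output

# bucket_by_depth(["a", "a.b", "a.b.c"], lambda p: p.count("."))
# -> [["a"], ["a.b"], ["a.b.c"]]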
def es_setop(es, query):
    es_query, filters = es14.util.es_query_template(query.frum.name)
    set_default(filters[0], simplify_esfilter(qb_expression(query.where).to_esfilter()))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    return extract_rows(es, es_query, query)
def where(self, where):
    if isinstance(where, Mapping):
        temp = None
        exec("def temp(row):\n return " + qb_expression(where).to_python())
    else:
        temp = where

    return ListContainer("from " + self.name, filter(temp, self.data), self.schema)
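# Hedged, self-contained sketch (not the library's code) of the compile-to-Python
# trick used in where(): build the source of a one-argument predicate, exec it
# into a namespace, then use the resulting function with filter(). The expression
# string below is a made-up stand-in for qb_expression(...).to_python().

def compile_predicate(python_expr):
    namespace = {}
    exec("def temp(row):\n    return " + python_expr, namespace)
    return namespace["temp"]

keep = compile_predicate("row.get('status') == 'ok'")
rows = [{"status": "ok"}, {"status": "bad"}]
print(list(filter(keep, rows)))   # [{'status': 'ok'}]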
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": _normalize_where(qb_expression(command.where).to_esfilter(), self)
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + qb_expression(v).to_ruby()})

    if results.hits.hits:
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"}
        )
        if response.errors:
            Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
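# Hedged sketch (illustration only, with made-up ids and field names): the _bulk
# payload assembled above is newline-delimited JSON, alternating an action line
# with its request body. Using only the standard library in place of
# convert.value2json:

import json

updates = [
    {"update": {"_id": "doc-1", "_routing": "r1"}},   # hypothetical id/routing
    {"script": "ctx._source.status = \"done\""},      # assignment-to-constant script
    {"update": {"_id": "doc-2", "_routing": "r1"}},
    {"script": "ctx._source.status = \"done\""},
]
content = ("\n".join(json.dumps(u) for u in updates) + "\n").encode("utf-8")
# POST content to <index>/_bulk with Content-Type: application/json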
def es_fieldop(es, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {
                "match_all": {}
            },
            "filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = DictList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]
    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name) + [k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([{k: unwrap(t.fields).get(v, None) for k, v in s.value.items()} for t in T])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([tuple(unwrap(t.fields).get(ss, None) for ss in s.value) for t in T])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception, e:
                Log.error("", e)
def es_deepop(es, mvel, query):
    FromES = es09.util.build_es_query(query)

    select = query.edges

    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR Qb INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def update(self, command):
    """
    EXPECTING command == {"set":term, "clear":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS A QB FILTER
    """
    command = wrap(command)
    if command.where == None:
        filter_ = lambda row: True
    else:
        filter_ = _exec("temp = lambda row: " + qb_expression(command.where).to_python())

    for c in self.data:
        if filter_(c):
            for k in listwrap(command["clear"]):
                c[k] = None
            for k, v in command.set.items():
                c[k] = v
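# Self-contained sketch (plain dicts and lists, a callable in place of the qb
# filter) of the update() contract documented above: for every row matching
# "where", null out the "clear" fields and then assign the "set" values.

def simple_update(data, command):
    match = command.get("where", lambda row: True)
    for row in data:
        if match(row):
            for k in command.get("clear", []):
                row[k] = None
            for k, v in command.get("set", {}).items():
                row[k] = v

rows = [{"status": "done", "score": 3}, {"status": "new", "score": 7}]
simple_update(rows, {"set": {"reviewed": True}, "clear": ["score"], "where": lambda r: r["status"] == "done"})
# rows[0] -> {"status": "done", "score": None, "reviewed": True}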
def es_deepop(es, query):
    columns = query.frum.get_columns()
    query_path = query.frum.query_path
    columns = UniqueIndex(keys=["name"], data=sorted(columns, lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))), fail_on_dup=False)
    map_ = {c.name: c.abs_name for c in columns}
    map_to_local = {
        c.name: "_inner" + c.abs_name[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.abs_name)
        for c in columns
    }
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit": {"script_fields": [{"script": ""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(qb_expression(query.where), query.frum, map_)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.abs_name[len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.abs_name)

    i = 0
    for s in listwrap(query.select):
        if s.value == "*":
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields += [c.abs_name]
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": c.name, "index": i, "child": "."}
                    })
                    i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            col_names = [c.name for c in columns if c.relative]
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.name = n.put.name = n.name.lstrip(".")
        elif s.value == ".":
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields += [c.abs_name]
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": ".", "index": i, "child": c.abs_name}
                    })
                    i += 1
        elif s.value == "_id":
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            parent = s.value[:-1]
            prefix = len(parent)
            for c in columns:
                if c.name.startswith(parent) and c.type not in ["object", "nested"]:
                    pull = get_pull(c)
                    if len(listwrap(c.nested_path)) == 0:
                        es_query.fields += [c.abs_name]

                    new_select.append({
                        "name": s.name + "." + c.name[prefix:],
                        "pull": pull,
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in columns if c.name.startswith(parent) and c.type not in ["object", "nested"]]
            if not net_columns:
                c = columns[(s.value,)]
                pull = get_pull(c)
                if not c.nested_path:
                    es_query.fields += [s.value]
                new_select.append({
                    "name": s.name,
                    "pull": pull,
                    "nested_path": listwrap(c.nested_path)[0],
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    pull = get_pull(n)
                    if not n.nested_path:
                        es_query.fields += [n.abs_name]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(n.nested_path)[0],
                        "put": {"name": s.name, "index": i, "child": n.name[prefix:]}
                    })
            i += 1
        else:
            expr = qb_expression(s.value)
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.abs_name]

            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []
    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
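# Simplified, self-contained sketch (not the library's split_expression_by_depth)
# of the idea used above: break a conjunctive where clause into one bucket per
# nesting depth, based on how deep each referenced variable lives. Column names
# and depths below are hypothetical.

def split_conjuncts_by_depth(conjuncts, depth_of_var, max_depth=2):
    buckets = [[] for _ in range(max_depth)]
    for term in conjuncts:
        var, _ = term                      # each term as (variable, condition)
        buckets[depth_of_var[var]].append(term)
    return buckets

depths = {"build.result": 0, "action.duration": 1}
where = [("build.result", "success"), ("action.duration", 10)]
print(split_conjuncts_by_depth(where, depths))
# [[('build.result', 'success')], [('action.duration', 10)]]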
def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(map(is_keyword, select.value)):
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(qb_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif all(map(is_keyword, select[0].value)):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = zip(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Dict(
        meta={"esquery": FromES},
        data=cube
    )
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    column_names = set(c.name for c in query.frum.get_columns() if c.type not in ["object"] and (not c.nested_path or c.abs_name == c.nested_path or not c.nested_path))
    source = "fields"

    i = 0
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            source = "_source"

            net_columns = column_names - set(select.name)
            for n in net_columns:
                new_select.append({
                    "name": n,
                    "value": n,
                    "put": {"name": n, "index": i, "child": "."}
                })
                i += 1
        elif s.value == ".":
            es_query.fields = None
            source = "_source"

            new_select.append({
                "name": s.name,
                "value": s.value,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif s.value == "_id":
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({
                        "name": s.name + "." + c[prefix:],
                        "value": c,
                        "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                if es_query.fields is not None:
                    es_query.fields.append(s.value)
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)
                    new_select.append({
                        "name": s.name,
                        "value": n,
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression(s.value).to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def es_aggsop(es, frum, query):
    select = wrap([s.copy() for s in listwrap(query.select)])

    es_query = Dict()
    new_select = Dict()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            s.pull = "doc_count"
        elif s.value == ".":
            if frum.typed:
                # STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                if s.aggregate in NON_STATISTICAL_AGGS:
                    # TODO: HANDLE BOTH $value AND $objects TO COUNT
                    Log.error("do not know how to handle")
                else:
                    s.value = "$value"
                    new_select["$value"] += [s]
            else:
                if s.aggregate in NON_STATISTICAL_AGGS:
                    # TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                    Log.error("do not know how to handle")
                else:
                    Log.error('Not expecting ES to have a value at "." to which {{agg}} can be applied', agg=s.aggregate)
        elif is_keyword(s.value) and s.aggregate == "count":
            s.value = coalesce(frum[s.value].abs_name, s.value)
            new_select["count_" + literal_field(s.value)] += [s]
        elif is_keyword(s.value):
            s.value = coalesce(frum[s.value].abs_name, s.value)
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value

        # canonical_name=literal_field(many[0].name)
        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                key = literal_field(canonical_name + " percentile")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        abs_value = qb_expression(s.value).map({c.name: c.abs_name for c in frum._columns})

        if s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[canonical_name].stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = qb_expression(query.where).vars()
    map_ = {v: frum[v].abs_name for v in vars_}

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(qb_expression(query.where), schema=frum, map_=map_)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2:]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter())
            es_query = Dict(aggs={"_filter": set_default({"filter": filter_}, es_query)})

        es_query = wrap({"aggs": {"_nested": set_default({"nested": {"path": frum.query_path}}, es_query)}})
    else:
        if any(split_where[1:]):
            Log.error("Where clause is too deep")

    for d in decoders[0]:
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if split_where[0]:
        # TODO: INCLUDE FILTERS ON EDGES
        filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
        es_query = Dict(aggs={"_filter": set_default({"filter": filter}, es_query)})
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        decoders = [d for ds in decoders for d in ds]
        result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
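# Illustrative sketch (hypothetical field name, not generated by the code above)
# of the request fragment the "median" branch builds: percentiles is a separate
# ES agg from stats, which is why the result is later pulled from
# "<name> percentile.values.50\.0", and size is set to 0 so no hits are returned.

es_aggs_fragment = {
    "aggs": {
        "duration percentile": {
            "percentiles": {
                "field": "action.duration",   # hypothetical field
                "percents": [50]
            }
        }
    },
    "size": 0
}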