def post(sql):
    """
    RUN sql, TURN EACH default DOMAIN INTO A set DOMAIN BASED ON THE VALUES
    RETURNED, THEN LOAD EVERY SELECTED COLUMN INTO ITS OWN Matrix

    sql - STATEMENT PASSED TO self.db.column_query
    RETURNS THE LIST OF CUBES WHEN query.select IS A list, OTHERWISE THE FIRST CUBE
    """
    columns = self.db.column_query(sql)
    edge_count = len(edges)

    # DISCOVER THE PARTITIONS OF EACH default DOMAIN FROM THE RESULT COLUMNS
    for position, an_edge in enumerate(edges):
        dom = an_edge.domain
        if dom.type != "default":
            Log.error("Do not know what to do here, yet")
            continue
        dom.type = "set"
        distinct = set(columns[position])
        dom.partitions = [{"index": n, "value": val} for n, val in enumerate(distinct)]
        dom.map = {val: n for n, val in enumerate(distinct)}

    # PAIR EACH EDGE'S VALUE->INDEX MAP WITH ITS RESULT COLUMN
    maps = []
    for position, an_edge in enumerate(edges):
        maps.append((unwrap(an_edge.domain.map), columns[position]))

    # ONE CUBE PER SELECT COLUMN
    cubes = FlatList()
    for offset, _ in enumerate(select):
        dims = [
            len(an_edge.domain.partitions) + (1 if an_edge.allow_nulls else 0)
            for an_edge in edges
        ]
        cube = Matrix(*dims)
        for row_index, cell in enumerate(columns[offset + edge_count]):
            cube[[lookup[col[row_index]] for lookup, col in maps]] = cell
        cubes.append(cube)

    return cubes if isinstance(query.select, list) else cubes[0]
class _Stats(WindowFunction):
    """
    COLLECT SAMPLES AND REPORT STATS OVER THE MIDDLE PORTION ONLY (OUTLIERS DROPPED)
    """

    def __init__(self, middle=None, *args, **kwargs):
        """
        middle - FRACTION OF THE SORTED SAMPLES (CENTERED) THAT end() SUMMARIZES
        """
        object.__init__(self)
        self.samples = FlatList()
        self.middle = middle

    def add(self, value):
        """ADD value TO THE WINDOW, NULLS ARE SKIPPED"""
        if not (value == None):
            self.samples.append(value)

    def sub(self, value):
        """REMOVE value FROM THE WINDOW, NULLS ARE SKIPPED"""
        if not (value == None):
            self.samples.remove(value)

    def merge(self, agg):
        """MERGING IS NOT SUPPORTED"""
        Log.error("Do not know how to handle")

    def end(self):
        """
        RETURN stats.Stats OF THE SORTED SAMPLES WITH `ignore` ITEMS TRIMMED FROM
        EACH END; AN EMPTY stats.Stats() WHEN THE TRIM WOULD LEAVE NOTHING.
        THE RETURNED OBJECT'S samples ATTRIBUTE IS A COPY OF ALL SAMPLES
        """
        total = len(self.samples)
        ignore = Math.ceiling(total * (1 - self.middle) / 2)
        if ignore * 2 >= total:
            return stats.Stats()

        ordered = sorted(self.samples)
        output = stats.Stats(samples=ordered[ignore:total - ignore:])
        output.samples = list(self.samples)
        return output
def more():
    """
    PULL UP TO size ITEMS FROM THE ENCLOSING iterator

    RETURNS A FlatList WITH THE ITEMS READ. WHEN THE ITERATOR IS EXHAUSTED,
    True IS APPENDED TO THE ENCLOSING done LIST AND THE (POSSIBLY SHORT)
    LIST IS RETURNED
    """
    output = FlatList()
    for _ in range(size):
        try:
            # BUILTIN next() WORKS ON PYTHON 2.6+ AND PYTHON 3; THE .next()
            # METHOD ONLY EXISTS ON PYTHON 2 ITERATORS
            output.append(next(iterator))
        except StopIteration:
            done.append(True)
            break
    return output
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES

    query - MUST HAVE AN aggregate (FOUND IN aggregates) ON EVERY SELECT COLUMN

    RETURNS (sql, post) WHERE post(sql) RUNS THE STATEMENT:
    * query.select IS A list -> post RETURNS self.db.column(sql)[0]
    * OTHERWISE              -> post RETURNS THE SINGLE VALUE result[0][0]
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = FlatList()
        for s in query.select:
            selects.append(sql_alias(
                aggregates[s.aggregate].replace("{{code}}", s.value),
                quote_column(s.name)
            ))

        # THE SINGLE-VALUE BRANCH BELOW FILTERS ON query.where; DO THE SAME HERE
        # SO THE WHERE CLAUSE IS NOT SILENTLY DROPPED. query.filter IS KEPT AS A
        # FALLBACK FOR CALLERS THAT ONLY PROVIDE THAT NAME
        where = query.where
        if where == None:
            where = query.filter

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(where)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)

        select = sql_alias(
            aggregates[s0.aggregate].replace("{{code}}", s0.value),
            quote_column(s0.name)
        )

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            # ONLY ONE COLUMN AND ONE ROW ARE EXPECTED
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
class DefaultDomain(Domain):
    """
    DOMAIN WHOSE PARTS ARE OBJECTS WITH A value PROPERTY; PARTS ARE ADDED AS
    NEW KEYS ARE REQUESTED
    """

    __slots__ = ["NULL", "partitions", "map", "limit", "sort"]

    def __init__(self, **desc):
        """desc - DOMAIN DESCRIPTION, ONLY 'limit' IS READ HERE"""
        Domain.__init__(self, **desc)

        self.NULL = Null
        self.partitions = FlatList()
        self.map = {None: self.NULL}
        self.limit = desc.get('limit')
        self.sort = 1

    def compare(self, a, b):
        """ORDER TWO PARTS BY THEIR value"""
        return value_compare(a.value, b.value)

    def getCanonicalPart(self, part):
        """RETURN THE REGISTERED PART HAVING THE SAME value AS part"""
        return self.getPartByKey(part.value)

    def getPartByKey(self, key):
        """RETURN THE PART FOR key, REGISTERING A NEW ONE WHEN NO TRUTHY ENTRY IS MAPPED"""
        found = self.map.get(key)
        if not found:
            found = Data(name=key, value=key)
            self.partitions.append(found)
            self.map[key] = found
        return found

    # def getIndexByKey(self, key):
    #     return self.map.get(key).dataIndex;

    def getKey(self, part):
        """KEY OF A PART IS ITS value"""
        return part.value

    def getEnd(self, part):
        """END OF A PART IS ITS value"""
        return part.value

    def getLabel(self, part):
        """LABEL OF A PART IS ITS value"""
        return part.value

    def __data__(self):
        """EXTEND Domain.__data__ WITH partitions AND limit"""
        described = Domain.__data__(self)
        described.partitions = self.partitions
        described.limit = self.limit
        return described
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS

    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    where - THE FILTER TO CONVERT; ONLY Mapping VALUES ARE INSPECTED
    schema - schema.edges IS USED TO LOOK UP EACH NAME

    RETURNS THE CONVERTED FILTER, OR where ITSELF WHEN IT IS NOT A Mapping OR
    HAS NONE OF term/terms/or/and/not
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NAME IS NOT IN THE SCHEMA, PASS IT THROUGH UNCHANGED
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, text_type):
                        # DIRECT FIELD REFERENCE
                        # NOTE(review): THIS RETURNS IMMEDIATELY, SO ANY FILTERS ALREADY
                        # COLLECTED IN output (AND ANY REMAINING KEYS) ARE DROPPED - CONFIRM INTENT
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # EACH VALUE IS AN OBJECT: AND ITS NON-NULL PROPERTIES, THEN OR THE VALUES
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_variable_name(fields[0]):
                        # SINGLE SIMPLE FIELD: REUSE THE terms FILTER ON THE ES FIELD NAME
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # USE THE esfilter OF EACH MATCHING PART
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            # RECURSE INTO EACH SUB-FILTER
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
def _getAllEdges(facetEdges, edgeDepth): """ RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES """ if edgeDepth == len(facetEdges): return [()] edge = facetEdges[edgeDepth] deeper = _getAllEdges(facetEdges, edgeDepth + 1) output = FlatList() partitions = edge.domain.partitions for part in partitions: for deep in deeper: output.append((part,) + deep) return output
def update(self, command):
    """
    APPLY THE SAME ASSIGNMENTS TO EVERY DOCUMENT MATCHING A FILTER

    command - EXPECTED TO BE {"set": term, "where": where}
        set   - DICT MAPPING SIMPLE NAMES TO VALUES (OR TO {"doc": ...})
        where - FILTER SELECTING THE DOCUMENTS TO CHANGE
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS (AND ROUTING) OF THE MATCHING DOCUMENTS
    routing_fields = listwrap(schema._routing.path)
    es_filter = jx_expression(command.where).to_esfilter()
    results = self._es.search({
        "fields": routing_fields,
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": es_filter
        }},
        "size": 200000
    })

    # ONE ACTION PER ASSIGNMENT, SHARED BY ALL DOCUMENTS (CONSTANTS ONLY)
    scripts = FlatList()
    for name, new_value in command.set.items():
        if not is_variable_name(name):
            Log.error("Only support simple paths for now")
        if isinstance(new_value, Mapping) and new_value.doc:
            action = {"doc": new_value.doc}
        else:
            action = {"script": "ctx._source." + name + " = " + jx_expression(new_value).to_ruby()}
        scripts.append(action)

    if not results.hits.hits:
        return

    # BULK FORMAT: A HEADER LINE FOLLOWED BY THE ACTION LINE
    updates = []
    for hit in results.hits.hits:
        for action in scripts:
            updates.extend([
                {"update": {"_id": hit._id, "_routing": unwraplist(hit.fields[literal_field(schema._routing.path)])}},
                action
            ])

    lines = "\n".join(convert.value2json(u) for u in updates)
    content = (lines + "\n").encode('utf-8')
    response = self._es.cluster.post(
        self._es.path + "/_bulk",
        data=content,
        headers={"Content-Type": "application/json"},
        timeout=self.settings.timeout,
        params={"consistency": self.settings.consistency}
    )
    if response.errors:
        failures = [
            detail.error
            for item in response["items"]
            for detail in item.values()
            if detail.status not in (200, 201)
        ]
        Log.error("could not update: {{error}}", error=failures)
def __init__(self, **desc):
    """
    START WITH NO PARTITIONS; ONLY None IS MAPPED (TO THE NULL PART)

    desc - DOMAIN DESCRIPTION, PASSED TO Domain.__init__; 'limit' IS READ HERE
    """
    Domain.__init__(self, **desc)

    self.NULL = Null
    self.partitions = FlatList()
    self.map = {None: self.NULL}
    self.limit = desc.get('limit')
    self.sort = 1
def _iter():
    """
    YIELD (group_number, batch) PAIRS, EACH BATCH HOLDING UP TO max_size ITEMS
    OF THE ENCLOSING data. ON FAILURE THE PARTIAL BATCH IS YIELDED BEFORE THE
    ERROR IS REPORTED
    """
    group_num = 0
    batch = FlatList()
    try:
        for count, item in enumerate(data, 1):
            batch.append(item)
            if count % max_size == 0:
                # BATCH IS FULL
                yield group_num, batch
                group_num += 1
                batch = FlatList()
        if batch:
            # LEFTOVERS
            yield group_num, batch
    except Exception as e:
        e = Except.wrap(e)
        if batch:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield group_num, batch
        Log.error("Problem inside jx.groupby", e)
def __init__(self, **desc):
    """
    BUILD A "set" DOMAIN FROM desc

    desc.partitions - THE PARTS; PRIMITIVES (int, float, basestring) ARE WRAPPED
        AS {"name", "value", "dataIndex"} OBJECTS
    desc.key - PROPERTY (OR LIST OF PROPERTIES) THAT IDENTIFIES A PART
    desc.dimension.fields - WHEN MORE THAN ONE, USED AS THE KEYS OF A UniqueIndex
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}  # PART KEY -> POSITION; None IS PLACED AFTER ALL PARTS
    self.NULL = Null
    self.partitions = FlatList()

    if isinstance(self.key, set):
        Log.error("problem")

    if isinstance(desc.partitions[0], (int, float, basestring)):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        # NOTE(review): self.map IS NOT ASSIGNED IN THIS BRANCH; PRESUMABLY
        # Domain.__init__ PROVIDES IT - CONFIRM
        self.key = "value"
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOUND KEY TAKEN FROM THE DIMENSION'S FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        # SIMPLE KEY: INDEX THE GIVEN PARTS BY THEIR KEY VALUE
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        # NOTE(review): THIS TESTS self.partitions, WHICH WAS SET TO AN EMPTY FlatList
        # ABOVE - CONFIRM desc.partitions WAS NOT INTENDED
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not hanldle")

    self.label = coalesce(self.label, "name")
def es_deepop(es, mvel, query):
    """
    RUN query AS A SINGLE SCRIPTED terms FACET, THEN SUM THE RETURNED ROWS INTO A Cube

    es - CLUSTER HANDLE PASSED TO es_post
    mvel - COMPILER USED TO GENERATE THE script_field
    query - ITS edges BECOME THE SELECT OF THE TEMPORARY QUERY SENT TO mvel
    """
    FromES = es09.util.build_es_query(query)

    # COMPILE A COPY OF THE QUERY THAT SELECTS THE EDGES AND HAS NO EDGES OF ITS OWN
    select = query.edges
    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = FlatList()

    terms_facet = {
        "script_field": mvel.code(temp_query),
        "size": query.limit
    }
    FromES.facets.mvel = {
        "terms": terms_facet,
        "facet_filter": jx_expression(query.where).to_esfilter()
    }

    data = es_post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for edge_num, edge in enumerate(edges):
        # REGISTER EVERY TERM SEEN FOR THIS EDGE
        for seen in terms[edge_num]:
            edge.domain.getPartByKey(seen)

        edge.index = edge_num
        for position, part in enumerate(edge.domain.partitions):
            part.dataIndex = position
        edge.domain.NULL.dataIndex = len(edge.domain.partitions)

    # MAKE CUBE
    output = Matrix(*[len(edge.domain.partitions) for edge in query.edges])

    # FILL CUBE: LAST ITEM OF EACH ROW IS ACCUMULATED AT THE ROW'S COORDINATES
    for row in rows:
        term_coord = []
        for edge_num, edge in enumerate(edges):
            term_coord.append(edge.domain.getPartByKey(row[edge_num]).dataIndex)
        output[term_coord] = SUM(output[term_coord], row[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def wrap(v):
    """
    RETURN THE WRAPPED FORM OF v, CHOSEN BY ITS EXACT CLASS

    dict -> Data, None -> Null, list -> FlatList,
    generator -> GENERATOR OF WRAPPED ITEMS, ANYTHING ELSE IS RETURNED AS-IS
    """
    cls = _get(v, "__class__")

    if cls is dict:
        return Data(v)
        # m = object.__new__(Data)
        # object.__setattr__(m, "_dict", v)
        # return m
    if cls is NoneType:
        return Null
    if cls is list:
        return FlatList(v)
    if cls is GeneratorType:
        return (wrap(item) for item in v)
    return v
def addParts(parentPart, childPath, count, index):
    """
    BUILD A hierarchy BY REPEATEDLY CALLING THIS FUNCTION WITH VARIOUS childPaths

    parentPart - PART THAT RECEIVES childPath[index] AS A CHILD
    childPath - SEQUENCE OF PARTS, FROM SHALLOW TO DEEP
    count - THE NUMBER FOUND FOR THIS PATH, ADDED TO EVERY PARENT VISITED
    index - POSITION IN childPath TO PLACE (None MEANS 0)
    """
    if index == None:
        index = 0
    if index == len(childPath):
        return

    child = childPath[index]
    parentPart.count = coalesce(parentPart.count, 0) + count

    if parentPart.partitions == None:
        parentPart.partitions = FlatList()

    # DESCEND INTO THE CHILD OF THE SAME NAME WHEN ONE EXISTS, ELSE ADD THIS ONE
    for existing in parentPart.partitions:
        if existing.name == child.name:
            target = existing
            break
    else:
        parentPart.partitions.append(child)
        target = child

    addParts(target, childPath, count, index + 1)
def listwrap(value):
    """
    CAST THE "NULL, SINGLE VALUE, OR LIST" CASES TO THE SINGLE CASE OF A LIST

        None  -> []
        value -> [value]
        [...] -> [...]  (SAME ITEMS)

    ## MOTIVATION ##
    PARAMETERS ARE OFTEN ALLOWED TO BE A VALUE, A list-OF-VALUES, OR NULL.
    INSTEAD OF TESTING WHICH ONE THE CALLER USED

        def do_it(a):
            if a is None:
                return
            if not isinstance(a, list):
                a=[a]
            for x in a:
                # do something

    THE CALLEE CAN SIMPLY ITERATE

        def do_it(a):
            for x in listwrap(a):
                # do something
    """
    if value == None:
        return FlatList()

    if is_list(value):
        # PLAIN list IS CONVERTED, OTHER LIST TYPES ARE RETURNED AS-IS
        return list_to_data(value) if isinstance(value, list) else value

    if is_many(value):
        return list_to_data(list(value))

    return list_to_data([from_data(value)])
def update(self, command):
    """
    APPLY THE SAME ASSIGNMENTS TO EVERY DOCUMENT MATCHING A FILTER

    command - EXPECTED TO BE {"set": term, "where": where}
        set   - DICT MAPPING SIMPLE NAMES TO VALUES (OR TO {"doc": ...})
        where - FILTER SELECTING THE DOCUMENTS TO CHANGE
    """
    command = wrap(command)
    schema = self._es.get_properties()

    # GET IDS (AND ROUTING) OF THE MATCHING DOCUMENTS
    routing_fields = listwrap(schema._routing.path)
    es_filter = jx_expression(command.where).to_esfilter(Null)
    results = self._es.search({
        "stored_fields": routing_fields,
        "query": {"bool": {
            "filter": es_filter
        }},
        "size": 10000
    })

    # ONE ACTION PER ASSIGNMENT, SHARED BY ALL DOCUMENTS (CONSTANTS ONLY)
    scripts = FlatList()
    for name, new_value in command.set.items():
        if not is_variable_name(name):
            Log.error("Only support simple paths for now")
        if isinstance(new_value, Mapping) and new_value.doc:
            scripts.append({"doc": new_value.doc})
        else:
            new_value = scrub(new_value)
            expression = jx_expression(new_value).to_painless(schema).script(schema)
            scripts.append({"script": "ctx._source." + name + " = " + expression})

    if not results.hits.hits:
        return

    # BULK FORMAT: A HEADER LINE FOLLOWED BY THE ACTION LINE
    updates = []
    for hit in results.hits.hits:
        for action in scripts:
            updates.extend([
                {"update": {"_id": hit._id, "_routing": unwraplist(hit.fields[literal_field(schema._routing.path)])}},
                action
            ])

    lines = "\n".join(convert.value2json(u) for u in updates)
    content = (lines + "\n").encode('utf-8')
    response = self._es.cluster.post(
        self._es.path + "/_bulk",
        data=content,
        headers={"Content-Type": "application/json"},
        timeout=self.settings.timeout,
        params={"wait_for_active_shards": self.settings.wait_for_active_shards}
    )
    if response.errors:
        failures = [
            detail.error
            for item in response["items"]
            for detail in item.values()
            if detail.status not in (200, 201)
        ]
        Log.error("could not update: {{error}}", error=failures)
def test_wrap_3():
    """
    TIME slow_wrap, wrap AND baseline OVER A RANDOM MIX OF INPUTS; THE FIRST
    GENERATOR (STRINGS) IS THE MOST LIKELY, EACH LATER ONE HALF AS LIKELY
    """
    makers = [
        lambda: Random.string(20),
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    last = len(makers) - 1
    inputs = []
    for _ in range(NUM_INPUT):
        choice = min(last, int(floor(-log(Random.float(), 2))))
        inputs.append(makers[choice]())

    def measure(label, func):
        # COLLECT GARBAGE FIRST SO IT IS NOT CHARGED TO THE PROFILED SECTION
        results = []
        gc.collect()
        with Profiler(label):
            for v in inputs:
                results.append(func(v))

    for i in range(NUM_REPEAT):
        measure("more string: slow_wrap", slow_wrap)
        measure("more string: wrap", wrap)
        measure("more string: baseline", baseline)

        Log.note("Done {{i}} of {{num}}", {"i": i, "num": NUM_REPEAT})
def _iter():
    """
    YIELD (group_number, batch) PAIRS, EACH BATCH HOLDING UP TO max_size ITEMS
    OF THE ENCLOSING data

    IF READING data FAILS, THE PARTIAL BATCH (IF ANY) IS YIELDED BEFORE THE
    PROBLEM IS REPORTED THROUGH Log.error
    """
    g = 0
    out = FlatList()
    try:
        for i, d in enumerate(data):
            out.append(d)
            if (i + 1) % max_size == 0:
                # BATCH IS FULL, EMIT IT AND START THE NEXT GROUP
                yield g, out
                g += 1
                out = FlatList()
        if out:
            # EMIT THE LEFTOVERS
            yield g, out
    except Exception as e:  # `as` FORM PARSES ON PYTHON 2.6+ AND PYTHON 3
        e = Except.wrap(e)
        if out:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield g, out
        Log.error("Problem inside jx.groupby", e)
def __init__(self, middle=None, *args, **kwargs):
    """
    START WITH AN EMPTY SAMPLE LIST

    middle - STORED AS self.middle
    args, kwargs - ACCEPTED BUT NOT USED
    """
    object.__init__(self)
    self.samples = FlatList()
    self.middle = middle
def __init__(self, **desc):
    """
    BUILD A "set" DOMAIN FROM desc

    desc.partitions - THE PARTS; WITH NO desc.key, PRIMITIVES (basestring, Number,
        tuple) ARE WRAPPED AS {"name", "value", "dataIndex"} OBJECTS
    desc.key - PROPERTY (OR LIST OF PROPERTIES) THAT IDENTIFIES A PART
    desc.dimension.fields - WHEN MORE THAN ONE, USED AS THE KEYS OF A UniqueIndex

    self.primitive IS LEFT True FOR PRIMITIVE VALUE SETS AND SET TO False WHEN
    THE GIVEN PART OBJECTS ARE INDEXED BY A KEY
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}  # PART KEY -> POSITION; None IS PLACED AFTER ALL PARTS
    self.NULL = Null
    self.partitions = FlatList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    if isinstance(self.key, set):
        Log.error("problem")

    if not desc.key and (len(desc.partitions) == 0 or isinstance(desc.partitions[0], (basestring, Number, tuple))):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        # self.partitions IS ALREADY FILLED, SKIP THE COPY AT THE END
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOUND KEY TAKEN FROM THE DIMENSION'S FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        # NOTE(review): `and` BINDS TIGHTER THAN `or`, SO THIS READS AS
        # (partitions and all(where)) or all(esfilter) - CONFIRM THAT IS INTENDED
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            # PARTS ARE DEFINED BY FILTERS, THEY ARE IDENTIFIED BY name
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            from pyLibrary.queries.expressions import jx_expression

            self.key = "name"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.partitions.append({
                    "where": jx_expression(coalesce(p.where, p.esfilter)),
                    "name": p.name,
                    "dataIndex": i
                })
                # NOTE(review): THE MAP STORES THE ORIGINAL p, NOT THE PART APPENDED ABOVE
                self.map[p.name] = p
                self.order[p.name] = i
            return
        elif desc.partitions and len(set(desc.partitions.value) - {None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        # SIMPLE KEY: INDEX THE GIVEN PARTS BY THEIR KEY VALUE
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    else:
        Log.error("Can not hanldle")

    self.label = coalesce(self.label, "name")

    # BRANCHES THAT DID NOT RETURN KEEP THE CALLER'S PART OBJECTS
    if hasattr(desc.partitions, "__iter__"):
        self.partitions = wrap(list(desc.partitions))
    else:
        Log.error("expecting a list of partitions")
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION

    RETURNS (sql, post) WHERE post(sql) RUNS THE STATEMENT:
    * query.select IS A list -> LIST OF ROWS; Mapping SELECTS ARE REASSEMBLED
      INTO OBJECTS AND list SELECTS INTO TUPLES
    * query.select.value == "." -> ALL COLUMNS, ROWS RETURNED AFTER expand_json
    * OTHERWISE -> LIST OF THE SINGLE SELECTED VALUE
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = FlatList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                # ONE COLUMN PER PROPERTY, NAMED "<name>.<property>"
                for k, v in s.value.items():
                    selects.append(v + " AS " + self.db.quote_column(s.name + "." + k))
            elif isinstance(s.value, list):
                # ONE COLUMN PER ITEM, NAMED "<name>,<index>"
                for i, ss in enumerate(s.value):
                    selects.append(ss + " AS " + self.db.quote_column(s.name + "," + str(i)))
            else:
                selects.append(s.value + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
            """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        def post_process(sql):
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    # REWRITE AS OBJECT, THEN BLANK THE FLAT COLUMNS
                    for r in result:
                        r[s.name] = {}
                        for k in s.value.keys():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None
                elif isinstance(s.value, list):
                    # REWRITE AS TUPLE, THEN BLANK THE FLAT COLUMNS
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None

            expand_json(result)
            return result

        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = query.select.value + " AS " + self.db.quote_column(name)

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
            """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result

            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
def _grouped(self, query, stacked=False):
    """
    BUILD A GROUP BY STATEMENT WITH ONE GROUP PER EDGE AND ONE AGGREGATE PER SELECT

    query - EVERY SELECT NEEDS AN aggregate, EVERY EDGE A default DOMAIN
    stacked - True TO GET THE PLAIN DATABASE RESULT SET INSTEAD OF CUBES

    RETURNS (sql, post) WHERE post(sql) RUNS THE STATEMENT AND FORMATS THE RESULT
    """
    select = listwrap(query.select)

    # EVERY COLUMN MUST BE AN AGGREGATE
    for column in select:
        if column.aggregate not in aggregates:
            Log.error(
                "Expecting all columns to have an aggregate: {{select}}",
                select=column
            )

    selects = FlatList()
    groups = FlatList()
    edges = query.edges

    # EDGES ARE BOTH GROUPED ON AND SELECTED
    for edge in edges:
        if e_type_unsupported(edge):
            Log.error("domain of type {{type}} not supported, yet", type=edge.domain.type)
        groups.append(edge.value)
        selects.append(edge.value + " AS " + self.db.quote_column(edge.name))

    for column in select:
        code = aggregates[column.aggregate].replace("{{code}}", column.value)
        selects.append(code + " AS " + self.db.quote_column(column.name))

    sql = expand_template("""
        SELECT
            {{selects}}
        FROM
            {{table}}
        {{where}}
        GROUP BY
            {{groups}}
        """, {
        "selects": SQL(",\n".join(selects)),
        "groups": SQL(",\n".join(groups)),
        "table": self._subquery(query["from"])[0],
        "where": self._where2sql(query.where)
    })

    def post_stacked(sql):
        # RETURN IN THE USUAL DATABASE RESULT SET FORMAT
        return self.db.query(sql)

    def post(sql):
        # FIND OUT THE default DOMAIN SIZES
        columns = self.db.column_query(sql)
        edge_count = len(edges)
        for position, an_edge in enumerate(edges):
            dom = an_edge.domain
            if dom.type != "default":
                Log.error("Do not know what to do here, yet")
                continue
            dom.type = "set"
            distinct = set(columns[position])
            dom.partitions = [{"index": n, "value": val} for n, val in enumerate(distinct)]
            dom.map = {val: n for n, val in enumerate(distinct)}

        # FILL THE DATA CUBE
        maps = []
        for position, an_edge in enumerate(edges):
            maps.append((unwrap(an_edge.domain.map), columns[position]))

        cubes = FlatList()
        for offset, _ in enumerate(select):
            dims = [
                len(an_edge.domain.partitions) + (1 if an_edge.allow_nulls else 0)
                for an_edge in edges
            ]
            cube = Matrix(*dims)
            for row_index, cell in enumerate(columns[offset + edge_count]):
                cube[[lookup[col[row_index]] for lookup, col in maps]] = cell
            cubes.append(cube)

        return cubes if isinstance(query.select, list) else cubes[0]

    handler = post_stacked if stacked else post
    return sql, handler


def e_type_unsupported(edge):
    # ONLY default DOMAINS CAN BE GROUPED
    return edge.domain.type != "default"
def temp(term):
    """RETURN A ONE-ITEM FlatList HOLDING THE edge0 DOMAIN PART FOR term"""
    part = edge0.domain.getPartByKey(term)
    return FlatList([part])
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    RETURNS A LIST OF {"value": <expression>, "sort": <direction>} ITEMS,
    OR FlatList.EMPTY WHEN NO SORT IS GIVEN
    """
    if sort==None:
        return FlatList.EMPTY

    output = FlatList()
    for item in listwrap(sort):
        if isinstance(item, text_type):
            # NAME OF A COLUMN (OR EXPRESSION TEXT), ASCENDING
            output.append({"value": jx_expression(item), "sort": 1})
            continue
        if isinstance(item, Expression):
            output.append({"value": item, "sort": 1})
            continue
        if Math.is_integer(item):
            # A NUMBER REFERS TO A POSITION
            output.append({"value": OffsetOp("offset", item), "sort": 1})
            continue

        unnamed = not item.sort and not item.value
        if unnamed and all(d in sort_direction for d in item.values()):
            # SHORT FORM: {expression: direction, ...}
            for expr, direction in item.items():
                output.append({"value": jx_expression(expr), "sort": sort_direction[direction]})
        elif unnamed:
            Log.error("`sort` clause must have a `value` property")
        else:
            output.append({
                "value": jx_expression(coalesce(item.value, item.field)),
                "sort": coalesce(sort_direction[item.sort], 1)
            })
    return output
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    master - PASSED THROUGH UNCHANGED TO THE RECURSIVE CALLS
    path - LIST OF NAMES LEADING TO term (EXTENDED ON RECURSION)
    term - MAPPING OF NAME TO THE VALUE IT MUST EQUAL
    schema_edges - LOOKUP FROM NAME TO Dimension (OR NESTED LOOKUP)

    RETURNS {"and": [...]} WITH ONE OR MORE ES FILTERS PER NAME IN term
    """
    output = FlatList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_variable_name(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_variable_name(f) for f in dimension.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(v, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue
            # NOTE(review): SAME TEST AS THE "SIMPLE SINGLE-VALUED FIELD" CASE ABOVE,
            # WHICH ALREADY `continue`S - THIS BRANCH LOOKS UNREACHABLE, CONFIRM
            if len(dimension.fields) == 1 and is_variable_name(dimension.fields[0]):
                if domain.getPartByKey(v) is domain.NULL:
                    output.append({"missing": {"field": dimension.fields[0]}})
                else:
                    output.append({"term": {dimension.fields[0]: v}})
                continue
            if domain.partitions:
                # FALL BACK TO THE esfilter OF THE MATCHING PART
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # NESTED TERM: RECURSE WITH THE LONGER PATH AND THE CHILD LOOKUP
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # NOT A KNOWN DIMENSION, PASS THE TERM THROUGH UNCHANGED
        output.append({"term": {k: v}})
    return {"and": output}
def es_setop(es, query):
    """
    RUN A NO-AGGREGATE (SET) QUERY AGAINST ES AND FORMAT THE HITS

    es - HANDLE PASSED TO es_post
    query - select, where, sort, limit AND format ARE READ; query.frum.schema
        PROVIDES THE COLUMNS

    EACH REQUESTED SELECT IS EXPANDED INTO new_select ENTRIES HAVING
        name  - OUTPUT NAME
        value - EXPRESSION BEING SELECTED
        put   - {"name", "index", "child"} DESCRIBING WHERE THE VALUE GOES
        pull  - FUNCTION THAT EXTRACTS THE VALUE FROM A HIT
    RETURNS THE OUTPUT OF THE FORMATTER CHOSEN BY query.format, WITH TIMING,
    CONTENT TYPE AND THE ES QUERY RECORDED IN output.meta
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None  # LIST OF FILTERS, CREATED ON FIRST DEEP-NESTED COLUMN
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0  # POSITION OF THE NEXT OUTPUT COLUMN
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    # NESTED ARRAYS ARE PULLED FROM _source
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}  # nested_path -> THE nested FILTER COLLECTING ITS inner_hits
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                # FIRST DEEP COLUMN: REPLACE THE TOP FILTER WITH
                                # (ORIGINAL FILTER) AND (ANY OF THE NESTED FILTERS)
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN ON THIS PATH: ADD A nested QUERY WITH inner_hits
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # PATH ALREADY SELECTED, JUST REQUEST ONE MORE FIELD
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                # NO SUCH COLUMN: KEEP THE OUTPUT SLOT, FILLED FROM A DUMMY VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    # ASSIGN A pull FUNCTION TO EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _grouped(self, query, stacked=False):
    """
    BUILD A GROUP BY STATEMENT WITH ONE GROUP PER EDGE AND ONE AGGREGATE PER SELECT

    query - EVERY SELECT NEEDS AN aggregate, EVERY EDGE A default DOMAIN
    stacked - True TO GET THE PLAIN DATABASE RESULT SET INSTEAD OF CUBES

    RETURNS (sql, post) WHERE post(sql) RUNS THE STATEMENT AND FORMATS THE RESULT
    """
    select = listwrap(query.select)

    # EVERY COLUMN MUST BE AN AGGREGATE
    for column in select:
        if column.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=column)

    selects = FlatList()
    groups = FlatList()
    edges = query.edges

    # EDGES ARE BOTH GROUPED ON AND SELECTED
    for edge in edges:
        if edge.domain.type != "default":
            Log.error("domain of type {{type}} not supported, yet", type=edge.domain.type)
        groups.append(edge.value)
        selects.append(sql_alias(edge.value, quote_column(edge.name)))

    for column in select:
        code = aggregates[column.aggregate].replace("{{code}}", column.value)
        selects.append(sql_alias(code, quote_column(column.name)))

    sql = expand_template("""
        SELECT
            {{selects}}
        FROM
            {{table}}
        {{where}}
        GROUP BY
            {{groups}}
        """, {
        "selects": SQL(",\n".join(selects)),
        "groups": SQL(",\n".join(groups)),
        "table": self._subquery(query["from"])[0],
        "where": self._where2sql(query.where)
    })

    def post_stacked(sql):
        # RETURN IN THE USUAL DATABASE RESULT SET FORMAT
        return self.db.query(sql)

    def post(sql):
        # FIND OUT THE default DOMAIN SIZES
        columns = self.db.column_query(sql)
        edge_count = len(edges)
        for position, an_edge in enumerate(edges):
            dom = an_edge.domain
            if dom.type != "default":
                Log.error("Do not know what to do here, yet")
                continue
            dom.type = "set"
            distinct = set(columns[position])
            dom.partitions = [{"index": n, "value": val} for n, val in enumerate(distinct)]
            dom.map = {val: n for n, val in enumerate(distinct)}

        # FILL THE DATA CUBE
        maps = []
        for position, an_edge in enumerate(edges):
            maps.append((unwrap(an_edge.domain.map), columns[position]))

        cubes = FlatList()
        for offset, _ in enumerate(select):
            dims = [
                len(an_edge.domain.partitions) + (1 if an_edge.allow_nulls else 0)
                for an_edge in edges
            ]
            cube = Matrix(*dims)
            for row_index, cell in enumerate(columns[offset + edge_count]):
                cube[[lookup[col[row_index]] for lookup, col in maps]] = cell
            cubes.append(cube)

        return cubes if isinstance(query.select, list) else cubes[0]

    handler = post_stacked if stacked else post
    return sql, handler
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION

    RETURNS (sql, post) WHERE post(sql) RUNS THE STATEMENT:
    * query.select IS A list -> LIST OF ROWS; Mapping SELECTS ARE REASSEMBLED
      INTO OBJECTS AND list SELECTS INTO TUPLES
    * query.select.value == "." -> ALL COLUMNS, ROWS RETURNED AFTER expand_json
    * OTHERWISE -> LIST OF THE SINGLE SELECTED VALUE
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = FlatList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                # ONE COLUMN PER PROPERTY, NAMED "<name>.<property>"
                for k, v in s.value.items():
                    selects.append(sql_alias(v, quote_column(s.name + "." + k)))
            elif isinstance(s.value, list):
                # ONE COLUMN PER ITEM, NAMED "<name>,<index>"
                for i, ss in enumerate(s.value):
                    selects.append(sql_alias(ss, quote_column(s.name + "," + str(i))))
            else:
                selects.append(sql_alias(s.value, quote_column(s.name)))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
            """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        def post_process(sql):
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    # REWRITE AS OBJECT, THEN BLANK THE FLAT COLUMNS
                    for r in result:
                        r[s.name] = {}
                        for k in s.value.keys():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None
                elif isinstance(s.value, list):
                    # REWRITE AS TUPLE, THEN BLANK THE FLAT COLUMNS
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None

            expand_json(result)
            return result

        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = sql_alias(query.select.value, quote_column(name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
            """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result

            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value

    ONE terms FACET IS REQUESTED FOR EACH (select, FIRST-EDGE VALUE) PAIR; THE
    TERMS RETURNED BECOME THE PARTITIONS OF THE SECOND EDGE

    :return: A Cube WITH ONE Matrix PER SELECT, INDEXED BY (FIRST EDGE, SECOND EDGE)
    """
    # REQUEST VALUES IN FIRST DIMENSION, USING A ONE-EDGE COPY OF THE QUERY
    first_query = query.copy()
    first_query.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, first_query).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for index1, value1 in enumerate(values1):
            # FACET NAME ENCODES THE SELECT AND THE FIRST-EDGE COORDINATE
            facet_name = s.name + "," + str(index1)
            FromES.facets[facet_name] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: value1}}
                ]})
            }

    data = es_post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for _, facet in data.facets.items():
        values2.update(facet.terms.term)
    values2 = jx.sort(values2)
    term2index = {value2: index2 for index2, value2 in enumerate(values2)}
    query.edges[1].domain.partitions = FlatList([{"name": value2, "value": value2} for value2 in values2])

    # MAKE CUBE
    dims = [len(values1), len(values2)]
    output = {}
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facet_name, facet in data.facets.items():
        name_parts = facet_name.split(",")
        matching = [candidate for candidate in select if candidate.name == name_parts[0]]
        s = matching[0]
        index1 = int(name_parts[1])
        for term in facet.terms:
            index2 = term2index[term.term]
            output[s.name][(index1, index2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
def parse_properties(parent_index_name, parent_name, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    :param parent_index_name: INDEX NAME RECORDED ON EVERY COLUMN
    :param parent_name: DOTTED NAME OF THE ENCLOSING PROPERTY; EACH PROPERTY
                        NAME IS APPENDED TO IT WITH concat_field()
    :param esProperties: MAPPING OF PROPERTY NAME TO PROPERTY DEFINITION
    :return: FlatList OF Column, INCLUDING THE COLUMNS OF CHILD PROPERTIES
    """
    from pyLibrary.queries.meta import Column

    columns = FlatList()
    for name, prop in esProperties.items():
        index_name = parent_index_name
        column_name = concat_field(parent_name, name)

        if prop.type == "nested" and prop.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, column_name, prop.properties)
            for c in self_columns:
                c.nested_path = [column_name] + c.nested_path
            columns.extend(self_columns)
            columns.append(Column(
                es_index=index_name,
                es_column=column_name,
                names={".": column_name},
                type="nested",
                nested_path=ROOT_PATH
            ))

            continue

        if prop.properties:
            # PLAIN OBJECT: RECORD THE CHILDREN, THEN THE OBJECT ITSELF
            child_columns = parse_properties(index_name, column_name, prop.properties)
            columns.extend(child_columns)
            columns.append(Column(
                names={".": column_name},
                es_index=index_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if prop.enabled == False else "object"
            ))

        if prop.dynamic:
            continue
        if not prop.type:
            continue
        if prop.type == "multi_field":
            prop.type = prop.fields[name].type  # PULL DEFAULT TYPE
            # NOTE(review): THIS BRANCH PASSES table= AND name= TO Column WHILE
            # THE OTHER BRANCHES PASS names=; CONFIRM AGAINST Column'S SIGNATURE
            for n, p in prop.fields.items():
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name,
                        name=column_name,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
                else:
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name + "\\." + n,
                        name=column_name + "\\." + n,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
            continue

        if prop.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type=prop.type
            ))
            if prop.index_name and name != prop.index_name:
                # NOTE(review): THIS COLUMN HAS THE SAME ARGUMENTS AS THE ONE
                # APPENDED JUST ABOVE; CONFIRM WHETHER index_name WAS MEANT TO BE USED
                columns.append(Column(
                    es_index=index_name,
                    es_column=column_name,
                    names={".": column_name},
                    nested_path=ROOT_PATH,
                    type=prop.type
                ))
        elif prop.enabled == None or prop.enabled == False:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if prop.enabled == False else "object"
            ))
        else:
            # REPORT THE FULL DOTTED NAME OF THE PROPERTY WITH THE UNKNOWN TYPE
            Log.warning("unknown type {{type}} for property {{path}}", type=prop.type, path=column_name)

    return columns
def __init__(self, dim, parent, jx):
    """
    BUILD A DIMENSION FROM ITS DEFINITION; WHEN THE DEFINITION NAMES FIELDS
    BUT NO PARTITIONS, QUERY THE INDEX FOR THE PARTS

    :param dim: THE DIMENSION DEFINITION; name, type, limit, index, where,
                edges, partitions, field/fields, path AND value ARE READ
    :param parent: THE ENCLOSING Dimension (ITS full_name AND index ARE READ), OR None
    :param jx: USED FOR jx.settings.index, jx.get_columns() AND jx.query()
    """
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(
        split_field(self.parent.full_name) + [self.name])
    self.edges = None  # FOR NOW
    # presumably FILLS THE ATTRIBUTES NOT YET SET ON self FROM dim -- TODO confirm
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    # FIRST INDEX FOUND WINS: THE DEFINITION, THEN THE PARENT, THEN THE SETTINGS
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        # ONE EDGE PER (name, es field) PAIR
        self.fields = wrap(fields)
        edges = wrap([{
            "name": k,
            "value": v,
            "allowNulls": False
        } for k, v in self.fields.items()])
    else:
        # ONE EDGE PER FIELD, NAMED AFTER THE FIELD
        self.fields = listwrap(fields)
        edges = wrap([{
            "name": f,
            "value": f,
            "index": i,
            "allowNulls": False
        } for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    jx.get_columns()
    # COUNT THE DOCUMENTS FOR EVERY COMBINATION OF EDGE VALUES
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {
                "name": "count",
                "aggregate": "count"
            },
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            # dim.path IS CALLED ONCE TO VALIDATE, AND AGAIN FOR addParts()
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # ONE PARTITION FOR EACH GROUP WITH A NON-ZERO COUNT
        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {
                        "and": [{
                            "term": {
                                e.value: g[e.name]
                            }
                        } for e in edges]
                    },
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": count
            } for i, count in enumerate(parts)
        ])
        # POSITION OF EACH PART, KEYED BY THE PART'S value
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values(
        )[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # NOTE(review): THE Mapping CASE IS HANDLED BY AN EARLIER elif, SO
            # THE FIRST ARM LOOKS UNREACHABLE FROM HERE -- CONFIRM
            if isinstance(fields, Mapping):
                output = Data()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        # OUTER PARTS COME FROM THE FIRST EDGE, NESTED PARTS FROM THE SECOND
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {
                            "and": [{
                                "term": {
                                    edges[0].value: d.partitions[i].value
                                }
                            }, {
                                "term": {
                                    edges[1].value: d2.partitions[j].value
                                }
                            }]
                        },
                        "count": count2
                    } for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            } for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def extract_rows(es, es_query, query):
    """
    EXPAND THE SELECT CLAUSE INTO CONCRETE COLUMNS, POST THE QUERY TO ES, AND
    FORMAT THE HITS

    :param es: PASSED THROUGH TO es09.util.post()
    :param es_query: THE ES REQUEST; fields AND script_fields ARE MODIFIED HERE
    :param query: THE QUERY; select, frum.schema, limit AND format ARE READ
    :return: THE OUTPUT OF THE FORMATTER CHOSEN BY query.format, WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    is_list = isinstance(query.select, list)  # NOTE(review): NOT READ AGAIN IN THIS FUNCTION
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    leaf_columns = set(
        c.names["."] for c in columns if c.type not in STRUCT and (
            c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    i = 0  # NEXT "put" INDEX TO HAND OUT
    source = "fields"  # SWITCHED TO "_source" WHEN WHOLE DOCUMENTS MUST BE PULLED
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # ALL LEAVES OF THE DOCUMENT (EXCEPT _id), ONE OUTPUT COLUMN EACH
                    es_query.fields = None
                    source = "_source"

                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(
                                    cname)
                                new_select.append({
                                    "name": new_name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": new_name,
                                        "index": i,
                                        "child": "."
                                    }
                                })
                                i += 1
                else:
                    # ALL LEAVES UNDER term.var, ONE OUTPUT COLUMN EACH
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(
                                        suffix)
                                    new_select.append({
                                        "name": new_name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": new_name,
                                            "index": i,
                                            "child": "."
                                        }
                                    })
                                    i += 1
        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                # THE WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            elif select.value.var == "_id":
                # _id IS PULLED DIRECTLY FROM THE HIT
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            elif select.value.var in nested_columns or [
                    c for c in nested_columns
                    if c.startswith(select.value.var + ".")
            ]:
                # NESTED COLUMN (OR A PARENT OF ONE): PULL FROM _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(select.value.var)
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": i,
                            "child": "."
                        }
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    # ALL LEAVES SHARE ONE "put" INDEX, AND DIFFER BY "child"
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": i,
                                            "child": cname[prefix_length:]
                                        }
                                    })
                    i += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT FIELD
            es_query.script_fields[literal_field(select.name)] = {
                "script": select.value.to_ruby()
            }
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {
                    "name": select.name,
                    "index": i,
                    "child": "."
                }
            })
            i += 1

    # DECIDE WHERE IN EACH HIT THE VALUE WILL BE FOUND
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    :param master: PASSED UNCHANGED TO THE RECURSIVE CALLS
    :param path: LIST OF KEYS LEADING TO term; EXTENDED ON EACH RECURSION
    :param term: MAPPING OF NAME TO VALUE (THE BODY OF A "term" FILTER)
    :param schema_edges: LOOKUP OF NAME TO Dimension (OR TO A NESTED LOOKUP)
    :return: {"and": [...]} WITH ONE OR MORE FILTERS FOR EACH ITEM OF term
    """
    output = FlatList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_variable_name(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_variable_name(f) for f in dimension.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(v, tuple):
                        Log.error("expecting {{name}}={{value}} to be a tuple", name=k, value=v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue

            # THE SINGLE-FIELD CASE WAS REPEATED HERE; IT COULD ONLY MATCH WHEN
            # dimension.fields IS NON-EMPTY, WHICH THE BLOCK ABOVE ALREADY HANDLES
            if domain.partitions:
                # USE THE FILTER ATTACHED TO THE MATCHING PART
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # NESTED TERM: RECURSE WITH THE MATCHING PART OF THE SCHEMA
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # NOT IN THE SCHEMA: KEEP AS A PLAIN TERM
        output.append({"term": {k: v}})
    return {"and": output}
def es_terms_stats(esq, mvel, query):
    """
    ANSWER AN AGGREGATE QUERY WITH terms_stats FACETS: ONE FACET FOR EACH
    (select, COMBINATION OF FACET-EDGE PARTS), KEYED BY THE TERM EDGES

    :param esq: USED FOR esq.query() (COUNTING REAL COMBINATIONS) AND esq.es
    :param mvel: USED TO COMPILE EDGES, CONDITIONS AND VALUE EXPRESSIONS
    :param query: THE QUERY; select, edges, where, limit AND frum ARE READ
    :return: A Cube WITH ONE Matrix PER SELECT
    """
    select = listwrap(query.select)
    facetEdges = []    # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM

            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_variable_name(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        # NOTE(review): WHEN NO EDGE QUALIFIES, special_index IS STILL -1, SO
        # THE LAST FACET EDGE IS POPPED AND None IS APPENDED -- CONFIRM INTENT
        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges)*len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT HAVE A NON-ZERO COUNT
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count= len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube

    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = FlatList()
            constants = FlatList()  # NOTE(review): FILLED BELOW, BUT NOT READ AGAIN IN THIS FUNCTION
            # FACET NAME IS THE SELECT NAME FOLLOWED BY THE dataIndex OF EACH PART
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_variable_name(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_variable_name(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es_post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        # NOTE(review): map IS A PLAIN dict (AND SHADOWS THE BUILTIN), SO
        # map[stats] LOOKS LIKE IT RAISES KeyError FOR AN UNSEEN KEY, AND part IS
        # A PLAIN dict, SO part.dataIndex = p LOOKS LIKE IT RAISES
        # AttributeError -- CONFIRM WHETHER THIS BRANCH IS EVER EXERCISED
        partitions = FlatList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        # THE FACET NAME DECODES TO THE SELECT NAME AND THE FACET-EDGE COORDINATES
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S COORDINATE INTO ITS POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index]+(special.dataIndex, )+pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    :param sort: A SINGLE SORT TERM, OR A LIST OF THEM; EACH TERM CAN BE A
                 STRING, AN Expression, AN INTEGER OFFSET, A {field: direction}
                 MAPPING, OR A {"value"/"field": ..., "sort": ...} OBJECT
    :return: FlatList OF {"value": <expression>, "sort": <direction>}
    """
    if sort == None:
        return FlatList.EMPTY

    normalized = FlatList()
    for item in listwrap(sort):
        if isinstance(item, basestring):
            # BARE FIELD NAME (OR EXPRESSION TEXT), ASCENDING
            normalized.append({"value": jx_expression(item), "sort": 1})
            continue
        if isinstance(item, Expression):
            normalized.append({"value": item, "sort": 1})
            continue
        if Math.is_integer(item):
            # SORT BY POSITION
            normalized.append({"value": OffsetOp("offset", item), "sort": 1})
            continue

        is_shorthand = (
            all(direction in sort_direction for direction in item.values())
            and not item.sort
            and not item.value
        )
        if is_shorthand:
            # {field: direction} FORM, POSSIBLY WITH MANY FIELDS
            for field, direction in item.items():
                normalized.append({
                    "value": jx_expression(field),
                    "sort": sort_direction[direction]
                })
        else:
            # {"value": ..., "sort": ...} FORM; "field" IS ACCEPTED FOR "value"
            expression = jx_expression(coalesce(item.value, item.field))
            normalized.append({
                "value": expression,
                "sort": coalesce(sort_direction[item.sort], 1)
            })
    return normalized
def extract_rows(es, es_query, query):
    """
    EXPAND THE SELECT CLAUSE INTO CONCRETE COLUMNS, POST THE QUERY TO ES, AND
    FORMAT THE HITS

    :param es: PASSED THROUGH TO es09.util.post()
    :param es_query: THE ES REQUEST; fields AND script_fields ARE MODIFIED HERE
    :param query: THE QUERY; select, frum.get_columns(), limit AND format ARE READ
    :return: THE OUTPUT OF THE FORMATTER CHOSEN BY query.format, WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    columns = query.frum.get_columns()
    # NOTE(review): c.es_column IS COMPARED TO THE WHOLE c.nested_path HERE,
    # WHILE nested_path IS INDEXED AS A SEQUENCE ELSEWHERE -- CONFIRM WHETHER
    # c.nested_path[0] WAS INTENDED
    leaf_columns = set(c.name for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if len(c.nested_path) != 1)

    i = 0  # NEXT "put" INDEX TO HAND OUT
    source = "fields"  # SWITCHED TO "_source" WHEN WHOLE DOCUMENTS MUST BE PULLED
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            term = select.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # ALL LEAVES NOT ALREADY SELECTED BY NAME, AND NOT _id
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(selects.name) - {"_id"}
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": Variable(n),
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    # ALL LEAVES UNDER term.var, ONE OUTPUT COLUMN EACH
                    parent = term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": select.name + "." + c[prefix:],
                                "value": Variable(c),
                                "put": {"name": select.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                # THE WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                # _id IS PULLED DIRECTLY FROM THE HIT
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var + ".")]:
                # NESTED COLUMN (OR A PARENT OF ONE): PULL FROM _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = select.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(select.value.var)
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    # ALL LEAVES SHARE ONE "put" INDEX, AND DIFFER BY "child"
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": select.name,
                            "value": Variable(n),
                            "put": {"name": select.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT FIELD
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    # DECIDE WHERE IN EACH HIT THE VALUE WILL BE FOUND
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value.var))
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        # "as" FORM IS VALID ON PYTHON 2.6+ AND PYTHON 3; THE COMMA FORM IS PYTHON 2 ONLY
        Log.error("problem formatting", e)
def extract_rows(es, es_query, query):
    """
    EXPAND THE SELECT CLAUSE INTO CONCRETE COLUMNS, POST THE QUERY TO ES, AND
    FORMAT THE HITS

    :param es: PASSED THROUGH TO es09.util.post()
    :param es_query: THE ES REQUEST; fields AND script_fields ARE MODIFIED HERE
    :param query: THE QUERY; select, frum.schema, limit AND format ARE READ
    :return: THE OUTPUT OF THE FORMATTER CHOSEN BY query.format, WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    is_list = isinstance(query.select, list)  # NOTE(review): NOT READ AGAIN IN THIS FUNCTION
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    leaf_columns = set(c.names["."] for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    i = 0  # NEXT "put" INDEX TO HAND OUT
    source = "fields"  # SWITCHED TO "_source" WHEN WHOLE DOCUMENTS MUST BE PULLED
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # ALL LEAVES OF THE DOCUMENT (EXCEPT _id), ONE OUTPUT COLUMN EACH
                    es_query.fields = None
                    source = "_source"

                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(cname)
                                new_select.append({
                                    "name": new_name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": new_name, "index": i, "child": "."}
                                })
                                i += 1
                else:
                    # ALL LEAVES UNDER term.var, ONE OUTPUT COLUMN EACH
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(suffix)
                                    new_select.append({
                                        "name": new_name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": new_name, "index": i, "child": "."}
                                    })
                                    i += 1

        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                # THE WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                # _id IS PULLED DIRECTLY FROM THE HIT
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var+".")]:
                # NESTED COLUMN (OR A PARENT OF ONE): PULL FROM _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(encode_property(select.value.var))
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    # ALL LEAVES SHARE ONE "put" INDEX, AND DIFFER BY "child"
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": select.name, "index": i, "child": cname[prefix_length:]}
                                    })
                    i += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT FIELD
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    # DECIDE WHERE IN EACH HIT THE VALUE WILL BE FOUND
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = concat_field("fields", literal_field(encode_property(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def temp(term):
    """
    SPLIT A "|"-DELIMITED TERM INTO ITS PIECES, AND CONVERT EACH PIECE WITH THE
    FUNCTION AT THE SAME POSITION IN fromTerm2Part

    :return: FlatList OF THE CONVERTED PIECES (zip() STOPS AT THE SHORTER INPUT)
    """
    pieces = term.split('|')
    return FlatList([to_part(piece) for piece, to_part in zip(pieces, fromTerm2Part)])
def __init__(self, dim, parent, jx):
    """
    BUILD A DIMENSION FROM ITS DEFINITION; WHEN THE DEFINITION NAMES FIELDS
    BUT NO PARTITIONS, QUERY THE INDEX FOR THE PARTS

    :param dim: THE DIMENSION DEFINITION; name, type, limit, index, where,
                edges, partitions, field/fields, path AND value ARE READ
    :param parent: THE ENCLOSING Dimension (ITS full_name AND index ARE READ), OR None
    :param jx: USED FOR jx.settings.index, jx.get_columns() AND jx.query()
    """
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
    self.edges = None  # FOR NOW
    # presumably FILLS THE ATTRIBUTES NOT YET SET ON self FROM dim -- TODO confirm
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    # FIRST INDEX FOUND WINS: THE DEFINITION, THEN THE PARENT, THEN THE SETTINGS
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        # ONE EDGE PER (name, es field) PAIR
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        # ONE EDGE PER FIELD, NAMED AFTER THE FIELD
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    jx.get_columns()
    # COUNT THE DOCUMENTS FOR EVERY COMBINATION OF EDGE VALUES
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name= self.name, num= len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            # dim.path IS CALLED ONCE TO VALIDATE, AND AGAIN FOR addParts()
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # ONE PARTITION FOR EACH GROUP WITH A NON-ZERO COUNT
        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        # POSITION OF EACH PART, KEYED BY THE PART'S value
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # NOTE(review): THE Mapping CASE IS HANDLED BY AN EARLIER elif, SO
            # THE FIRST ARM LOOKS UNREACHABLE FROM HERE -- CONFIRM
            if isinstance(fields, Mapping):
                output = Data()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        # OUTER PARTS COME FROM THE FIRST EDGE, NESTED PARTS FROM THE SECOND
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES

    :param query: THE QUERY; select, where AND "from" ARE READ
    :return: (sql, post_processor) PAIR; THE post_processor ACCEPTS THE sql AND
             RUNS IT AGAINST self.db:
             * select IS A list: RETURNS self.db.column(sql)[0]
             * OTHERWISE: RETURNS THE SINGLE AGGREGATE VALUE
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = FlatList()
        for s in query.select:
            selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name))

        sql = expand_template(""" SELECT {{selects}} FROM {{table}} {{where}} """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            # USE THE where CLAUSE, SAME AS THE SINGLE-VALUE BRANCH BELOW
            # (THIS BRANCH USED TO READ query.filter)
            "where": self._where2sql(query.where)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)

        select = aggregates[s0.aggregate].replace("{{code}}", s0.value) + " AS " + self.db.quote_column(s0.name)

        sql = expand_template(""" SELECT {{selects}} FROM {{table}} {{where}} """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            # COLUMN-ORIENTED RESULT: FIRST COLUMN, FIRST ROW
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
def es_setop(es, query):
    """
    BUILD THE ES REQUEST FOR query.select, SEND IT, AND FORMAT THE HITS

    es - PASSED THROUGH TO es_post()
    query - READS frum.schema, where, limit, select, sort AND format

    RETURNS THE OUTPUT OF THE FORMATTER REGISTERED IN format_dispatch FOR
    query.format, WITH meta.timing.es, meta.content_type AND meta.es_query SET
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None  # BECOMES A LIST ONCE A NESTED COLUMN IS SELECTED (SEE BELOW)
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()  # ONE ENTRY PER OUTPUT VALUE: name, value, put (WHERE IT GOES), pull (HOW TO GET IT)
    schema = query.frum.schema  # SAME ATTRIBUTE AS READ AT THE TOP OF THE FUNCTION
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0  # NEXT FREE "index" SLOT FOR THE put DESCRIPTORS
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(
                select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name,
                    relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    # NESTED LEAF: SWITCH THE WHOLE REQUEST TO _source
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    # NO pull HERE; IT IS ASSIGNED BY THE LOOP OVER new_select BELOW
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}  # nested_path -> THE nested FILTER ALREADY ADDED FOR IT
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRT LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": relative_field(jx_name, s_column)
                                },
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": relative_field(jx_name, s_column)
                                    },
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": relative_field(jx_name, s_column)
                                    }
                                })
                        else:
                            if not nested_filter:
                                # FIRST NESTED COLUMN SEEN: MOVE THE CURRENT FILTER INTO
                                # where, BLANK EVERY KEY OF filters[0], THEN REFILL IT WITH
                                # where AND (OR OF nested_filter).  nested_filter IS
                                # APPENDED TO BELOW, AFTER THIS EXPRESSION IS BUILT
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)]))

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN UNDER THIS NESTED PATH: ADD A match_all
                                # nested FILTER WHOSE inner_hits CARRY THE STORED FIELDS
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [
                                    c.es_column
                                ]

                                child = relative_field(
                                    untype_path(c.names[schema.query_path[0]]),
                                    s_column)
                                pull = accumulate_nested_doc(
                                    nested_path,
                                    Variable(
                                        relative_field(
                                            s_column,
                                            unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # PATH ALREADY HAS A FILTER: JUST ASK FOR ONE MORE FIELD
                                nested_selects[
                                    nested_path].nested.inner_hits.stored_fields += [
                                        c.es_column
                                    ]
            else:
                # THE SCHEMA HAS NO LEAVES FOR THIS VARIABLE: SELECT A PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            # A Variable SELECT USES ONE put INDEX, HOWEVER MANY LEAVES IT HAS
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD AND READ BACK FROM "fields"
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(
                painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # ASSIGN A pull FUNCTION TO EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                # NOTE(review): n.value IS A Variable HERE; THIS RELIES ON Variable
                # COMPARING EQUAL TO A STRING - CONFIRM
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        # groupby_formatter IS UNPACKED BUT NOT USED IN THIS FUNCTION
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
class SimpleSetDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY
    """
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        desc.partitions - PRIMITIVES (TEXT, NUMBERS, TUPLES) OR PART OBJECTS
        desc.key - PROPERTY, OR PROPERTIES, THAT IDENTIFY EACH PART OBJECT

        FILLS self.map (KEY -> PART) AND self.order (KEY -> POSITION)
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        def index_by_key():
            # MAP EACH GIVEN PART, AND ITS POSITION, BY ITS self.key VALUE
            self.map = {None: self.NULL}
            self.order[None] = len(desc.partitions)
            for position, given in enumerate(desc.partitions):
                self.map[given[self.key]] = given
                self.order[given[self.key]] = position
            self.primitive = False

        if isinstance(self.key, set):
            Log.error("problem")

        parts_are_primitive = not desc.key and (
            len(desc.partitions) == 0 or
            isinstance(desc.partitions[0], (text_type, Number, tuple))
        )
        if parts_are_primitive:
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for position, value in enumerate(desc.partitions):
                part = {"name": value, "value": value, "dataIndex": position}
                self.partitions.append(part)
                self.map[value] = part
                self.order[value] = position
                if isinstance(value, (int, float)):
                    # ES CAN NOT HANDLE NUMERIC PARTS
                    as_text = text_type(float(value))
                    self.map[as_text] = part
                    self.order[as_text] = position
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
                # PARTS ARE DEFINED BY FILTERS, SO THEY ARE KEYED BY NAME
                if not all(desc.partitions.name):
                    Log.error("Expecting all partitions to have a name")
                self.key = "name"
                self.map = {None: self.NULL}
                self.order[None] = len(desc.partitions)
                for position, given in enumerate(desc.partitions):
                    self.partitions.append({
                        "where": jx_expression(coalesce(given.where, given.esfilter)),
                        "name": given.name,
                        "dataIndex": position
                    })
                    self.map[given.name] = given
                    self.order[given.name] = position
                return
            elif desc.partitions and len(set(desc.partitions.value) - {None}) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value". IT APPEARS UNIQUE
                self.key = "value"
                index_by_key()
            else:
                Log.error("Domains must have keys, or partitions")
        elif self.key:
            self.key = desc.key
            index_by_key()
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if not hasattr(desc.partitions, "__iter__"):
            Log.error("expecting a list of partitions")
        else:
            self.partitions = wrap(list(desc.partitions))

    def compare(self, a, b):
        # ORDER TWO PARTS BY THEIR KEYS
        key_a = self.getKey(a)
        key_b = self.getKey(b)
        return value_compare(key_a, key_b)

    def getCanonicalPart(self, part):
        # THE PART THIS DOMAIN HOLDS FOR part.value
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # POSITION OF key; UNKNOWN KEYS GET THE INDEX ONE PAST THE LAST PART
        try:
            position = self.order.get(key)
            return len(self.partitions) if position is None else position
        except Exception as e:
            Log.error("problem", e)

    def getPartByKey(self, key):
        # PART FOR key; self.NULL WHEN NOTHING (OR SOMETHING FALSEY) IS MAPPED
        try:
            found = self.map.get(key)
            return found if found else self.NULL
        except Exception as e:
            Log.error("problem", e)

    def getPartByIndex(self, index):
        return self.partitions[index]

    def getKeyByIndex(self, index):
        # KEY OF THE PART AT index; None WHEN index IS OUT OF RANGE
        out_of_range = index < 0 or index >= len(self.partitions)
        if out_of_range:
            return None
        return self.partitions[index][self.key]

    def getKey(self, part):
        return part[self.key]

    def getEnd(self, part):
        # self.value NAMES THE PROPERTY TO RETURN; WITHOUT IT, THE WHOLE PART
        return part[self.value] if self.value else part

    def getLabel(self, part):
        return part[self.label]

    def __data__(self):
        # Domain'S SERIALIZATION, PLUS THE PARTITIONS
        output = Domain.__data__(self)
        output.partitions = self.partitions
        return output
def getDomain(self, **kwargs):
    """
    RETURN A Domain BUILT FROM THIS OBJECT'S PARTITIONS (OR ITS EDGES)

    kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS; WHEN NOT GIVEN IT IS
    len(self.fields) - 1 IF self.fields IS A LIST, OTHERWISE NOT SET
    AT MOST coalesce(self.limit, DEFAULT_QUERY_LIMIT) TOP-LEVEL PARTS ARE USED
    """
    kwargs = wrap(kwargs)
    if isinstance(self.fields, list):
        default_depth = len(self.fields) - 1
    else:
        default_depth = None
    kwargs.depth = coalesce(kwargs.depth, default_depth)
    max_parts = coalesce(self.limit, DEFAULT_QUERY_LIMIT)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = []
        for position, edge in enumerate(self.edges):
            if position < max_parts and edge.where:
                partitions.append({
                    "name": edge.name,
                    "value": edge.name,
                    "where": edge.where,
                    "style": edge.style,
                    "weight": edge.weight  # YO! WHAT DO WE *NOT* COPY?
                })
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for position, part in enumerate(self.partitions):
            if position >= max_parts:
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = []
        for position, part in enumerate(self.partitions):
            if position < max_parts:
                partitions.append({
                    "name": part.name,
                    "value": part.value,
                    "where": part.where,
                    "style": part.style,
                    "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
                })
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL: EVERY SUB-PARTITION OF THE FIRST max_parts PARTS
        partitions = FlatList()
        for position, part in enumerate(self.partitions):
            if position >= max_parts:
                continue
            try:
                for subpart in part.partitions:
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception as e:
                Log.error("", e)
    else:
        Log.error("deeper than 2 is not supported yet")

    return Domain(
        type=self.type,
        name=self.name,
        partitions=wrap(partitions),
        min=self.min,
        max=self.max,
        interval=self.interval,
        # THE COMPLICATION IS THAT SOMETIMES WE WANT SIMPLE PARTITIONS, LIKE
        # STRINGS, DATES, OR NUMBERS.  OTHER TIMES WE WANT PARTITION OBJECTS
        # WITH NAME, VALUE, AND OTHER MARKUP.
        # USUALLY A "set" IS MEANT TO BE SIMPLE, BUT THE end() FUNCTION
        # OVERRIDES EVERYTHING AND IS EXPLICIT.  - NOT A GOOD SOLUTION BECAUSE
        # end() IS USED BOTH TO INDICATE THE QUERY PARTITIONS *AND* DISPLAY
        # COORDINATES ON CHARTS

        # PLEASE SPLIT end() INTO value() (replacing the string value) AND
        # label() (for presentation)
        value="name" if not self.value and self.partitions else self.value,
        key="value",
        label=coalesce(self.label, (self.type == "set" and self.name)),
        end=coalesce(self.end, (self.type == "set" and self.name)),
        isFacet=self.isFacet,
        dimension=self
    )
class SetDomain(Domain):
    """
    DOMAIN MADE OF AN EXPLICIT SET OF PARTITIONS

    PARTITIONS ARE EITHER PRIMITIVES (NUMBERS, STRINGS), WHICH ARE CONVERTED
    TO PART OBJECTS, OR OBJECTS THAT ARE INDEXED BY desc.key
    """
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        desc.partitions - THE PARTS OF THE DOMAIN
        desc.key - PROPERTY, OR PROPERTIES, THAT IDENTIFY EACH PART OBJECT

        FILLS self.order (KEY -> POSITION) AND, WHERE THE BRANCH ALLOWS,
        self.map (KEY -> PART)
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()

        if isinstance(self.key, set):
            Log.error("problem")

        if isinstance(desc.partitions[0], (int, float, basestring)):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            # THE LOOP BELOW WRITES INTO self.map, SO IT MUST EXIST FIRST
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
        elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif desc.key == None:
            Log.error("Domains must have keys")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

    def compare(self, a, b):
        # ORDER TWO PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        # THE PART THIS DOMAIN HOLDS FOR part.value
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # POSITION OF key; UNKNOWN KEYS GET THE INDEX ONE PAST THE LAST PART
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception as e:  # "as" FORM IS VALID ON BOTH PYTHON 2.6+ AND 3
            Log.error("problem", e)
def getDomain(self, **kwargs):
    """
    COLLECT THE PARTITIONS FOR A DOMAIN, FROM self.partitions OR self.edges

    kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS; WHEN NOT GIVEN IT IS
    len(self.fields) - 1 IF self.fields IS A LIST, OTHERWISE NOT SET
    AT MOST coalesce(self.limit, DEFAULT_QUERY_LIMIT) TOP-LEVEL PARTS ARE USED

    NOTE(review): AS IT STANDS IN THIS FILE THE DEFINITION ENDS AFTER THE
    depth == 1 BRANCH AND NEVER RETURNS partitions - CONFIRM WHETHER THE
    TAIL OF THE FUNCTION WAS LOST
    """
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(
        kwargs.depth,
        len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL: EVERY SUB-PARTITION OF THE PARTS WITHIN THE LIMIT
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            try:
                for subpart in part.partitions:
                    partitions.append({
                        "name": join_field(
                            split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception as e:  # "as" FORM IS VALID ON BOTH PYTHON 2.6+ AND 3
                Log.error("", e)
def es_terms_stats(esq, mvel, query):
    """
    ANSWER query WITH ONE terms_stats FACET PER (SELECT, COMBINATION OF FACET PARTS)

    esq - USED FOR esq.query() (COUNTING REAL COMBINATIONS) AND esq.es (POSTING)
    mvel - USED TO COMPILE EDGE TERMS, CONDITIONS AND VALUE SCRIPTS
    query - READS select, edges, where, limit AND frum

    RETURNS A Cube OVER query.edges, WITH ONE Matrix PER SELECT NAME
    """
    select = listwrap(query.select)
    facetEdges = []    # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM

            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        # NOTE(review): WHEN NO EDGE QUALIFIES, special_index IS STILL -1, SO THIS
        # POPS THE LAST FACET EDGE AND APPENDS None TO termsEdges - CONFIRM INTENT
        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges)*len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS WITH A TRUTHY COUNT
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count= len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube

    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = FlatList()
            constants = FlatList()  # FILLED BELOW, BUT NOT READ AGAIN IN THIS FUNCTION
            # FACET NAME IS "<select name>,<dataIndex>,<dataIndex>,...", DECODED AGAIN IN "FILL CUBE"
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    # A KEYWORD VALUE IS READ AS A FIELD, ANYTHING ELSE IS COMPILED TO A SCRIPT
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = FlatList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # NOTE(review): map IS A PLAIN dict, SO map[stats] RAISES KeyError FOR A
                # NEW KEY; THE KEY IS ALSO THE WHOLE stats RECORD, WHILE "FILL CUBE"
                # BELOW READS stats.term - CONFIRM WHAT IS INTENDED
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        # DECODE THE FACET NAME BACK INTO THE SELECT NAME AND THE FACET COORDINATES
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S dataIndex INTO THE COORDINATE AT special_index
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index]+(special.dataIndex, )+pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
class SimpleSetDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY
    """
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        desc.partitions - PRIMITIVES (STRINGS, NUMBERS, TUPLES) OR PART OBJECTS
        desc.key - PROPERTY, OR PROPERTIES, THAT IDENTIFY EACH PART OBJECT

        FILLS self.map (KEY -> PART) AND self.order (KEY -> POSITION)
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        def index_by_key():
            # MAP EACH GIVEN PART, AND ITS POSITION, BY ITS self.key VALUE
            self.map = {None: self.NULL}
            self.order[None] = len(desc.partitions)
            for position, given in enumerate(desc.partitions):
                self.map[given[self.key]] = given
                self.order[given[self.key]] = position
            self.primitive = False

        if isinstance(self.key, set):
            Log.error("problem")

        parts_are_primitive = not desc.key and (
            len(desc.partitions) == 0 or
            isinstance(desc.partitions[0], (basestring, Number, tuple))
        )
        if parts_are_primitive:
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for position, value in enumerate(desc.partitions):
                part = {"name": value, "value": value, "dataIndex": position}
                self.partitions.append(part)
                self.map[value] = part
                self.order[value] = position
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
                # PARTS ARE DEFINED BY FILTERS, SO THEY ARE KEYED BY NAME
                if not all(desc.partitions.name):
                    Log.error("Expecting all partitions to have a name")
                from pyLibrary.queries.expressions import jx_expression

                self.key = "name"
                self.map = {None: self.NULL}
                self.order[None] = len(desc.partitions)
                for position, given in enumerate(desc.partitions):
                    self.partitions.append({
                        "where": jx_expression(coalesce(given.where, given.esfilter)),
                        "name": given.name,
                        "dataIndex": position
                    })
                    self.map[given.name] = given
                    self.order[given.name] = position
                return
            elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value". IT APPEARS UNIQUE
                self.key = "value"
                index_by_key()
            else:
                Log.error("Domains must have keys, or partitions")
        elif self.key:
            self.key = desc.key
            index_by_key()
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if not hasattr(desc.partitions, "__iter__"):
            Log.error("expecting a list of partitions")
        else:
            self.partitions = wrap(list(desc.partitions))

    def compare(self, a, b):
        # ORDER TWO PARTS BY THEIR KEYS
        key_a = self.getKey(a)
        key_b = self.getKey(b)
        return value_compare(key_a, key_b)

    def getCanonicalPart(self, part):
        # THE PART THIS DOMAIN HOLDS FOR part.value
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # POSITION OF key; UNKNOWN KEYS GET THE INDEX ONE PAST THE LAST PART
        try:
            position = self.order.get(key)
            return len(self.partitions) if position is None else position
        except Exception as e:
            Log.error("problem", e)

    def getPartByKey(self, key):
        # PART FOR key; self.NULL WHEN NOTHING (OR SOMETHING FALSEY) IS MAPPED
        try:
            found = self.map.get(key)
            return found if found else self.NULL
        except Exception as e:
            Log.error("problem", e)

    def getPartByIndex(self, index):
        return self.partitions[index]

    def getKeyByIndex(self, index):
        # KEY OF THE PART AT index; None WHEN index IS OUT OF RANGE
        out_of_range = index < 0 or index >= len(self.partitions)
        if out_of_range:
            return None
        return self.partitions[index][self.key]

    def getKey(self, part):
        return part[self.key]

    def getEnd(self, part):
        # self.value NAMES THE PROPERTY TO RETURN; WITHOUT IT, THE WHOLE PART
        return part[self.value] if self.value else part

    def getLabel(self, part):
        return part[self.label]

    def __data__(self):
        # Domain'S SERIALIZATION, PLUS THE PARTITIONS
        output = Domain.__data__(self)
        output.partitions = self.partitions
        return output
def parse_properties(parent_index_name, parent_name, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    parent_index_name - INDEX NAME RECORDED ON EVERY COLUMN
    parent_name - PATH OF THE OBJECT THAT HOLDS THESE PROPERTIES
    esProperties - MAPPING OF PROPERTY NAME TO ITS ES PROPERTY DEFINITION

    RETURNS A FlatList OF Column, INCLUDING THE COLUMNS OF CHILD PROPERTIES
    """
    from pyLibrary.queries.meta import Column

    columns = FlatList()
    for name, property in esProperties.items():
        index_name = parent_index_name
        column_name = join_field(split_field(parent_name) + [name])

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, column_name, property.properties)
            for c in self_columns:
                c.nested_path = [column_name] + c.nested_path
            columns.extend(self_columns)
            columns.append(Column(
                es_index=index_name,
                names={index_name: column_name},
                es_column=column_name,
                type="nested",
                nested_path=ROOT_PATH
            ))

            continue

        if property.properties:
            # PLAIN OBJECT: ADD THE CHILD COLUMNS, THEN A COLUMN FOR THE OBJECT ITSELF
            child_columns = parse_properties(index_name, column_name, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                names={index_name: column_name},
                es_index=index_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for n, p in property.fields.items():
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=column_name,
                        es_column=column_name,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
                else:
                    # SUB-FIELD: NAMED <column>\.<field>
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=column_name + "\\." + n,
                        es_column=column_name + "\\." + n,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                es_index=index_name,
                names={index_name: column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                columns.append(Column(
                    table=index_name,
                    es_index=index_name,
                    es_column=column_name,
                    name=column_name,
                    nested_path=ROOT_PATH,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            columns.append(Column(
                es_index=index_name,
                names={index_name: column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled==False else "object"
            ))
        else:
            # REPORT THE PATH OF THE OFFENDING PROPERTY; column_name IS THE LOCAL
            # THAT HOLDS IT (NO query_path IS DEFINED IN THIS FUNCTION)
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=column_name)

    return columns
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    where - THE FILTER TO CONVERT; "term", "terms", "or", "and" AND "not"
            ARE HANDLED, ANYTHING ELSE IS RETURNED UNCHANGED
    schema - schema.edges MAPS DIMENSION NAMES TO EDGES (OR FIELD NAMES)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")

                edge = schema.edges[k]
                if not edge:
                    # NOT A KNOWN DIMENSION, PASS THE FILTER THROUGH
                    output.append({"terms": {k: v}})
                    continue

                if isinstance(edge, basestring):
                    # DIRECT FIELD REFERENCE
                    # COLLECT IT WITH THE OTHERS; RETURNING HERE WOULD DROP
                    # EVERY OTHER TERMS FILTER IN THIS CLAUSE
                    output.append({"terms": {edge: v}})
                    continue

                try:
                    domain = edge.getDomain()
                except Exception as e:
                    Log.error("programmer error", e)

                fields = domain.dimension.fields
                if isinstance(fields, Mapping):
                    # EACH VALUE IS AN OBJECT: MATCH ALL ITS NON-NULL FIELDS
                    or_agg = []
                    for vv in v:
                        and_agg = []
                        for local_field, es_field in fields.items():
                            vvv = vv[local_field]
                            if vvv != None:
                                and_agg.append({"term": {es_field: vvv}})
                        or_agg.append({"and": and_agg})
                    output.append({"or": or_agg})
                elif isinstance(fields, list) and len(fields) == 1 and is_variable_name(fields[0]):
                    output.append({"terms": {fields[0]: v}})
                elif domain.partitions:
                    output.append({
                        "or": [domain.getPartByKey(vv).esfilter for vv in v]
                    })
            return {"and": output}
        elif where["or"]:
            return {
                "or": [
                    unwrap(_where_terms(master, vv, schema))
                    for vv in where["or"]
                ]
            }
        elif where["and"]:
            return {
                "and": [
                    unwrap(_where_terms(master, vv, schema))
                    for vv in where["and"]
                ]
            }
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
def __init__(self, **desc):
    """
    desc.partitions - PRIMITIVES (STRINGS, NUMBERS, TUPLES) OR PART OBJECTS
    desc.key - PROPERTY, OR PROPERTIES, THAT IDENTIFY EACH PART OBJECT

    FILLS self.map (KEY -> PART) AND self.order (KEY -> POSITION)
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = FlatList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    def index_by_key():
        # MAP EACH GIVEN PART, AND ITS POSITION, BY ITS self.key VALUE
        self.map = {None: self.NULL}
        self.order[None] = len(desc.partitions)
        for position, given in enumerate(desc.partitions):
            self.map[given[self.key]] = given
            self.order[given[self.key]] = position
        self.primitive = False

    if isinstance(self.key, set):
        Log.error("problem")

    parts_are_primitive = not desc.key and (
        len(desc.partitions) == 0 or
        isinstance(desc.partitions[0], (basestring, Number, tuple))
    )
    if parts_are_primitive:
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for position, value in enumerate(desc.partitions):
            part = {"name": value, "value": value, "dataIndex": position}
            self.partitions.append(part)
            self.map[value] = part
            self.order[value] = position
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            # PARTS ARE DEFINED BY FILTERS, SO THEY ARE KEYED BY NAME
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            from pyLibrary.queries.expressions import jx_expression

            self.key = "name"
            self.map = {None: self.NULL}
            self.order[None] = len(desc.partitions)
            for position, given in enumerate(desc.partitions):
                self.partitions.append({
                    "where": jx_expression(coalesce(given.where, given.esfilter)),
                    "name": given.name,
                    "dataIndex": position
                })
                self.map[given.name] = given
                self.order[given.name] = position
            return
        elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value". IT APPEARS UNIQUE
            self.key = "value"
            index_by_key()
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        self.key = desc.key
        index_by_key()
    else:
        Log.error("Can not hanldle")

    self.label = coalesce(self.label, "name")

    if not hasattr(desc.partitions, "__iter__"):
        Log.error("expecting a list of partitions")
    else:
        self.partitions = wrap(list(desc.partitions))
class SetDomain(Domain):
    """
    DOMAIN DEFINED BY AN EXPLICIT LIST OF PARTITIONS

    map   - LOOKUP FROM KEY TO PART (dict, OR UniqueIndex FOR COMPOUND KEYS)
    order - LOOKUP FROM KEY TO POSITION
    NULL  - THE PART RETURNED WHEN A KEY IS NOT FOUND
    """
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        desc IS READ FOR key, partitions AND dimension.fields
        PROBLEMS (set-TYPED key, MISSING key) ARE REPORTED THROUGH Log.error
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()

        if isinstance(self.key, set):
            Log.error("problem")

        if isinstance(desc.partitions[0], (int, float, basestring)):
            # ASSUME PARTS ARE PRIMITIVES, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            # FIX: THE MAP WAS FILLED BELOW WITHOUT EVER BEING CREATED ON THIS PATH
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
        elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif desc.key == None:
            # `== None` (NOT `is None`) IS KEPT ON PURPOSE
            # NOTE(review): PRESUMABLY A MISSING PROPERTY OF A WRAPPED OBJECT IS A Null THAT EQUALS None - CONFIRM
            Log.error("Domains must have keys")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            # NOTE(review): self.partitions IS STILL EMPTY ON THIS PATH (ONLY THE
            # PRIMITIVE BRANCH FILLS IT), SO all() IS VACUOUSLY True AND THE LOOP
            # DOES NOTHING - CONFIRM WHETHER desc.partitions WAS INTENDED
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

    def compare(self, a, b):
        """
        ORDER TWO PARTS BY THEIR KEYS
        """
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        """
        RETURN THE DOMAIN'S OWN PART FOR part.value (OR self.NULL)
        """
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        """
        RETURN THE POSITION RECORDED FOR key
        UNKNOWN KEYS GET len(self.partitions)
        """
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception as e:
            Log.error("problem", e)

    def getPartByKey(self, key):
        """
        RETURN THE PART RECORDED FOR key, OR self.NULL WHEN MISSING (OR FALSEY)
        """
        try:
            canonical = self.map.get(key, None)
            if not canonical:
                return self.NULL
            return canonical
        except Exception as e:
            Log.error("problem", e)

    def getKey(self, part):
        """
        RETURN THE VALUE OF part UNDER THE DOMAIN'S KEY PROPERTY
        """
        return part[self.key]

    def getKeyByIndex(self, index):
        """
        RETURN THE KEY OF THE PART AT POSITION index
        """
        return self.partitions[index][self.key]

    def getEnd(self, part):
        """
        RETURN part[self.value] WHEN self.value IS SET, OTHERWISE part ITSELF
        """
        if self.value:
            return part[self.value]
        else:
            return part

    def getLabel(self, part):
        """
        RETURN THE VALUE OF part UNDER THE DOMAIN'S LABEL PROPERTY
        """
        return part[self.label]

    def __data__(self):
        """
        RETURN Domain.__data__() WITH partitions ADDED
        """
        output = Domain.__data__(self)
        output.partitions = self.partitions
        return output
def es_setop(es, query):
    """
    RUN A SET OPERATION (NO AGGREGATION) AGAINST ES AND FORMAT THE HITS

    :param es: PASSED THROUGH TO es_post()
    :param query: READ FOR frum.schema, select, where, limit, sort AND format
    :return: RESULT OF THE FORMATTER CHOSEN BY query.format, WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN

    EACH ENTRY OF new_select DESCRIBES ONE OUTPUT COLUMN:
        name  - OUTPUT NAME
        value - EXPRESSION BEING SELECTED (NOT SET FOR SCRIPTED SELECTS)
        put   - WHERE THE VALUE GOES: name, index AND child PATH
        pull  - FUNCTION THAT EXTRACTS THE VALUE FROM A HIT

    Log.error IS CALLED WHEN A SELECT HAS NO pull AND IS NOT A Variable,
    AND WHEN FORMATTING FAILS
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER NESTED PATH, "." ALWAYS EXISTS
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # RETURN THE ESSelect FOR path, CREATING IT ON FIRST USE
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                # EVERY LEAF GETS ITS OWN OUTPUT COLUMN (AND ITS OWN put_index)
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    # NO pull HERE; IT IS ASSIGNED BY THE LOOP OVER new_select BELOW
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                # NOTE(review): put_index IS NOT ADVANCED ON THIS PATH, SO A FOLLOWING
                # SELECT WOULD REUSE THE SAME INDEX - CONFIRM THIS IS INTENDED
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                # ALL LEAVES OF ONE SELECT SHARE A put_index, DISTINGUISHED BY child
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                # _id IS READ DIRECTLY OFF THE ROW
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                # NO pull HERE; IT IS ASSIGNED BY THE LOOP OVER new_select BELOW
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # COLUMN LIVES UNDER A NESTED PATH
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO KNOWN COLUMNS: STILL EMIT AN OUTPUT COLUMN, BACKED BY A PLACEHOLDER VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION BECOMES AN ES SCRIPT, ONE PER PATH
            split_scripts = split_expression_by_path(select.value, schema)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": script[0].partial_eval().to_es_script(schema).script(schema)}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # ASSIGN A pull TO EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            # NOTE(review): n.value IS A Variable HERE, COMPARED WITH A STRING - CONFIRM Variable EQUALITY SUPPORTS THIS
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    # BUILD THE ES QUERY FROM THE PER-PATH SELECTS AND FILTERS
    split_wheres = split_expression_by_path(query.where, schema)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        # groupby_formatter IS UNPACKED BUT NOT USED HERE
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)