def _select(template, data, fields, depth):
    """
    RECURSIVELY SELECT fields FROM EACH RECORD IN data, MERGING RESULTS
    INTO COPIES OF template.  WHEN A FIELD REACHES INTO A NESTED LIST
    (index RETURNED BY _select_deep), RECURSE ONE LEVEL DEEPER WITH ONLY
    THE FIELDS THAT NEED IT.
    NOTE: near-duplicate of the other _select in this file — keep in sync.
    """
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            # EXPECTING PLAIN dicts; Dict INSTANCES WOULD BREAK DEEP SELECTION
            Log.error("programmer error, _select can not handle Dict")
        record = template.copy()
        children = None
        for f in fields:
            # index IS HOW FAR INTO f.value WE GOT BEFORE HITTING A NESTED LIST
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN(len(deep_path), len(path))
                if path[:short:] != deep_path[:short:]:
                    # ALL DEEP FIELDS MUST DESCEND THE SAME BRANCH
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))
    return output
def post(sql):
    """
    RUN sql AND SHAPE THE COLUMN RESULT INTO ONE Matrix PER SELECT.
    NOTE(review): closure — self, edges, select and query come from the
    enclosing scope; result columns are ordered edges-first, then selects.
    """
    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            # MATERIALIZE THE DOMAIN FROM THE DISTINCT VALUES SEEN IN COLUMN e
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        # ONE EXTRA SLOT PER EDGE WHEN NULLS ARE ALLOWED
        data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
        for rownum, value in enumerate(result[c + num_edges]):
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def post(sql):
    """
    RUN sql AND SHAPE THE COLUMN RESULT INTO ONE Matrix PER SELECT.
    NOTE(review): duplicate of the other post() in this file — keep in sync.
    Closure — self, edges, select and query come from the enclosing scope.
    """
    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            # MATERIALIZE THE DOMAIN FROM THE DISTINCT VALUES SEEN IN COLUMN e
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
        for rownum, value in enumerate(result[c + num_edges]):
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES, ONE PER RECORD IN data, HOLDING THE VALUES
    NAMED BY field_name (a string path, a list of paths, or a {"value": ...}
    mapping).
    NOTE: deliberately shadows the builtin `tuple` — callers rely on this name.
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if isinstance(field_name, Mapping) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        if len(split_field(field_name)) == 1:
            # SHALLOW FIELD: 1-TUPLES, ONE PER RECORD
            return [(d[field_name], ) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif isinstance(field_name, list):
        paths = [_select_a_field(f) for f in field_name]
        output = DictList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = DictList()
        _tuple((), data, paths, 0, output)
        return output
class _Stats(WindowFunction):
    """
    TRACK STATS, BUT IGNORE OUTLIERS

    middle IS THE FRACTION OF SAMPLES TO KEEP (CENTERED); THE REST ARE
    TRIMMED, HALF FROM EACH END, BEFORE COMPUTING STATS IN end().
    """

    def __init__(self, middle=None, *args, **kwargs):
        # NOTE(review): calls object.__init__ directly, skipping
        # WindowFunction.__init__ — presumably intentional; confirm
        object.__init__(self)
        self.middle = middle
        self.samples = DictList()

    def add(self, value):
        # IGNORE NULLS
        if value == None:
            return
        self.samples.append(value)

    def sub(self, value):
        # IGNORE NULLS; REMOVES ONE OCCURRENCE FROM THE WINDOW
        if value == None:
            return
        self.samples.remove(value)

    def merge(self, agg):
        Log.error("Do not know how to handle")

    def end(self):
        # TRIM (1 - middle)/2 FROM EACH TAIL OF THE SORTED SAMPLES
        ignore = Math.ceiling(len(self.samples) * (1 - self.middle) / 2)
        if ignore * 2 >= len(self.samples):
            return stats.Stats()
        output = stats.Stats(samples=sorted(self.samples)[ignore:len(self.samples) - ignore:])
        # EXPOSE THE UNTRIMMED SAMPLES ON THE RESULT
        output.samples = list(self.samples)
        return output
def __init__(self, **desc):
    """Initialize an empty domain: no partitions yet, None resolves to NULL."""
    Domain.__init__(self, **desc)
    self.NULL = Null
    self.partitions = DictList()
    # SEED THE LOOKUP SO A MISSING KEY (None) MAPS TO THE NULL PART
    self.map = {None: self.NULL}
def __init__(self, **desc):
    """
    BUILD A set DOMAIN FROM desc.partitions; THE SHAPE OF desc.key AND
    OF THE PARTS DECIDES HOW self.map (key -> part) IS BUILT.
    NOTE(review): near-duplicate of the other set-domain __init__ in this
    file — keep in sync.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)
    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = DictList()
    if isinstance(self.key, set):
        Log.error("problem")
    if isinstance(desc.partitions[0], basestring):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        # NOTE(review): self.map is used here before being assigned in this
        # method — presumably Domain.__init__ provides it; confirm
        self.key = ("value", )
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not hanldle")
    self.label = coalesce(self.label, "name")

    # NOTE(review): this overwrites self.partitions, discarding the part
    # objects built in the basestring branch above — verify intended
    if isinstance(desc.partitions, list):
        self.partitions = desc.partitions.copy()
    else:
        Log.error("expecting a list of partitions")
def select(self, fields):
    """
    SELECT fields FROM THIS FlatList.
    fields MAY BE A {"value": ...} MAPPING (simplified to its value),
    A STRING PATH, OR A LIST OF {"name", "value"} SELECTS.
    self.data ROWS ARE (shallow-record, deep-value) PAIRS; self.path IS
    THE NESTED PATH THIS LIST WAS FLATTENED ALONG.
    """
    if isinstance(fields, Mapping):
        fields=fields.value

    if isinstance(fields, basestring):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                # THE FIELD IS THE DEEP VALUE ITSELF
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = DictList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = DictList()

        # BUILD (name, extractor) PAIRS; CALLABLE VALUES ARE USED AS-IS
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Dict()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)

        return output

    # meta = []
    # for f in fields:
    #     keys = split_field(f.value)
    #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
    #     short_key = join_field(keys[depth:])
    #
    #     meta.append((f.name, depth, short_key))
    #
    # for row in self._data:
    #     agg = Dict()
    #     for name, depth, short_key in meta:
    #         if short_key:
    #             agg[name] = row[depth][short_key]
    #         else:
    #             agg[name] = row[depth]
    #     output.append(agg)
    # return output

    Log.error("multiselect over FlatList not supported")
def more():
    """
    PULL UP TO size ITEMS FROM iterator INTO A DictList.
    NOTE(review): closure — size, iterator and done come from the enclosing
    scope; appending to done SIGNALS EXHAUSTION TO THE CALLER.
    """
    output = DictList()
    for i in range(size):
        try:
            output.append(iterator.next())
        except StopIteration:
            done.append(True)
            break
    return output
def right(self, num=None):
    """
    WITH SLICES BEING FLAT, WE NEED A SIMPLE WAY TO SLICE FROM THE RIGHT

    RETURN THE LAST num ELEMENTS AS A DictList; num == None RETURNS JUST
    THE LAST ELEMENT, num <= 0 RETURNS Null.
    """
    self._convert()
    if num == None:
        return DictList([self.list[-1]])
    if num <= 0:
        return Null
    # BUG FIX: was self.list[-num], which picks a SINGLE element (and would
    # pass a non-list to DictList); a slice is required, mirroring not_right
    return DictList(self.list[-num:])
def not_right(self, num):
    """
    WITH SLICES BEING FLAT, WE NEED A SIMPLE WAY TO SLICE FROM THE LEFT [:-num:]

    RETURN EVERYTHING EXCEPT THE LAST num ELEMENTS; num <= 0 RETURNS Null.
    """
    self._convert()
    if num == None:
        # NOTE(review): this wraps the sliced list in ANOTHER list, yielding a
        # one-element DictList whose element is a list — verify intended
        # (compare right(), which returns DictList([last_element])
        return DictList([self.list[:-1:]])
    if num <= 0:
        return Null
    return DictList(self.list[:-num:])
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES

    RETURNS (sql, post_processor) — THE CALLER RUNS sql AND FEEDS IT TO
    THE POST PROCESSOR.
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = DictList()
        for s in query.select:
            selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
        SELECT
            {{selects}}
        FROM
            {{table}}
        {{where}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            # NOTE(review): this branch uses query.filter while the branch
            # below uses query.where — confirm both are correct
            "where": self._where2sql(query.filter)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)
        select = aggregates[s0.aggregate].replace("{{code}}", s0.value) + " AS " + self.db.quote_column(s0.name)

        sql = expand_template("""
        SELECT
            {{selects}}
        FROM
            {{table}}
        {{where}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    NOTE(review): near-duplicate of the other _where_terms in this file —
    keep in sync.
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A KNOWN DIMENSION; PASS THROUGH AS A PLAIN terms FILTER
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOSITE KEY: EACH VALUE IS A MAPPING OF LOCAL FIELD -> VALUE
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # FALL BACK TO THE PARTITIONS' OWN esfilter DEFINITIONS
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS
    results = self._es.search(
        {
            "fields": listwrap(schema._routing.path),
            "query": {
                "filtered": {"query": {"match_all": {}}, "filter": jx_expression(command.where).to_esfilter()}
            },
            # NOTE(review): hard cap on matched documents — updates beyond
            # 200000 hits are silently skipped
            "size": 200000,
        }
    )

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if results.hits.hits:
        # BUILD THE BULK BODY: ALTERNATING action/payload LINES, ONE PAIR
        # PER (document, script) COMBINATION
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append(
                    {
                        "update": {
                            "_id": h._id,
                            "_routing": unwraplist(h.fields[literal_field(schema._routing.path)]),
                        }
                    }
                )
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode("utf-8")
        response = self._es.cluster.post(
            self._es.path + "/_bulk", data=content, headers={"Content-Type": "application/json"}
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)],
            )
class DefaultDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY

    PARTS ARE DISCOVERED LAZILY: getPartByKey CREATES A NEW PART FOR ANY
    KEY IT HAS NOT SEEN BEFORE.
    """

    __slots__ = ["NULL", "partitions", "map", "limit"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)

        self.NULL = Null
        self.partitions = DictList()
        self.map = dict()
        self.map[None] = self.NULL
        self.limit = desc.get('limit')

    def compare(self, a, b):
        return value_compare(a.value, b.value)

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getPartByKey(self, key):
        # RETURN THE EXISTING PART, OR MINT A NEW ONE FOR AN UNSEEN KEY
        # NOTE(review): self.limit is not enforced here — confirm intended
        canonical = self.map.get(key)
        if canonical:
            return canonical

        canonical = Dict(name=key, value=key)

        self.partitions.append(canonical)
        self.map[key] = canonical
        return canonical

    # def getIndexByKey(self, key):
    #     return self.map.get(key).dataIndex;

    def getKey(self, part):
        return part.value

    def getEnd(self, part):
        return part.value

    def getLabel(self, part):
        return part.value

    def as_dict(self):
        output = Domain.as_dict(self)
        output.partitions = self.partitions
        output.limit = self.limit
        return output
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    NOTE(review): when contiguous is False (and no size is given) this
    function falls off the end and returns None — confirm whether the
    general grouping path was meant to follow
    """
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        # SIZE-BASED GROUPING DELEGATES ENTIRELY
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Cube):
        return data.groupby(keys)

    keys = listwrap(keys)

    def get_keys(d):
        # PROJECT d DOWN TO JUST THE GROUPING KEYS
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output

    if contiguous:
        try:
            if not data:
                return wrap([])

            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    # SELECTOR CHANGED: CLOSE THE CURRENT GROUP, START A NEW ONE
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY

    normalized = DictList()
    for term in listwrap(sort):
        is_simple = isinstance(term, basestring) or Math.is_integer(term)
        if is_simple:
            # A BARE FIELD NAME (OR COLUMN NUMBER) MEANS ASCENDING SORT
            normalized.append({"field": term, "sort": 1})
            continue
        normalized.append({
            "field": coalesce(term.field, term.value),
            "sort": coalesce(sort_direction[term.sort], 1)
        })
    return wrap(normalized)
def _getAllEdges(facetEdges, edgeDepth):
    """
    RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES
    """
    # BASE CASE: PAST THE LAST EDGE THE ONLY COMBINATION IS THE EMPTY TUPLE
    if edgeDepth == len(facetEdges):
        return [()]

    suffixes = _getAllEdges(facetEdges, edgeDepth + 1)
    result = DictList()
    for part in facetEdges[edgeDepth].domain.partitions:
        for tail in suffixes:
            result.append((part,) + tail)
    return result
def groupby_size(data, size):
    """
    LAZILY YIELD (group_number, DictList_of_up_to_size_items) PAIRS FROM data.
    data MAY BE AN ITERATOR OR ANY ITERABLE.
    """
    if hasattr(data, "next"):
        iterator = data
    elif hasattr(data, "__iter__"):
        iterator = data.__iter__()
    else:
        Log.error("do not know how to handle this type")

    # done ACTS AS A MUTABLE FLAG SHARED WITH more()
    done = DictList()

    def more():
        # PULL UP TO size ITEMS; APPEND TO done WHEN THE SOURCE IS EXHAUSTED
        output = DictList()
        for i in range(size):
            try:
                output.append(iterator.next())
            except StopIteration:
                done.append(True)
                break
        return output

    # THIS IS LAZY
    # NOTE(review): the final yielded group may be empty when len(data)
    # is an exact multiple of size — confirm callers tolerate this
    i = 0
    while True:
        output = more()
        yield (i, output)
        if len(done) > 0:
            break
        i += 1
def es_setop(es, query):
    """
    BUILD AND RUN AN ES SET-OPERATION (ROW RETRIEVAL) QUERY.
    DECIDES PER SELECT WHETHER TO PULL stored fields, _source, OR A
    SCRIPT FIELD.
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"
    for s in select:
        if s.value == "*":
            # WHOLE DOCUMENT REQUESTED: SWITCH TO _source RETRIEVAL
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            # EXPRESSION SELECT: COMPILE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}
    return extract_rows(es, es_query, source, select, query)
def es_fieldop(es, query):
    """
    BUILD AND RUN A SIMPLE FIELD-RETRIEVAL ES QUERY (NO AGGREGATION).
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    source = "fields"
    # NOTE(review): iterates select.value (the selects' value attribute)
    # rather than select — confirm this matches the wrapped-list semantics
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): overwrites the qb_sort_to_es_sort result above
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def _getAllEdges(facetEdges, edgeDepth):
    """
    RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES
    """
    if edgeDepth == len(facetEdges):
        # NO EDGES LEFT: ONE EMPTY COMBINATION
        return [()]

    current_edge = facetEdges[edgeDepth]
    deeper_combos = _getAllEdges(facetEdges, edgeDepth + 1)

    combos = DictList()
    for partition in current_edge.domain.partitions:
        for combo in deeper_combos:
            combos.append((partition,) + combo)
    return combos
def es_setop(es, query):
    """
    BUILD AND RUN A SIMPLE ES SET-OPERATION (ROW RETRIEVAL) QUERY.
    """
    template, filters = es14.util.es_query_template(query.frum.name)
    # PUSH THE where CLAUSE INTO THE TEMPLATE'S FIRST FILTER SLOT
    set_default(filters[0], simplify_esfilter(query.where.to_esfilter()))
    template.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    template.sort = jx_sort_to_es_sort(query.sort)
    template.fields = DictList()
    return extract_rows(es, template, query)
def _iter():
    """
    YIELD (group_number, DictList) PAIRS OF AT MOST max_size ITEMS.
    NOTE(review): closure — data and max_size come from the enclosing scope.
    """
    g = 0
    out = DictList()
    try:
        for i, d in enumerate(data):
            out.append(d)
            if (i + 1) % max_size == 0:
                # GROUP FULL: EMIT AND START A FRESH ONE
                yield g, out
                g += 1
                out = DictList()
        if out:
            # EMIT THE FINAL PARTIAL GROUP
            yield g, out
    except Exception, e:
        if out:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield g, out
        Log.error("Problem inside qb.groupby", e)
def __init__(self, **desc):
    """
    BUILD A set DOMAIN FROM desc.partitions; THE SHAPE OF desc.key AND OF
    THE PARTS DECIDES HOW self.map (key -> part) IS BUILT.
    NOTE(review): near-duplicate of the other set-domain __init__ in this
    file — keep in sync.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)
    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = DictList()
    if isinstance(self.key, set):
        Log.error("problem")
    if isinstance(desc.partitions[0], basestring):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        # NOTE(review): self.map is used here before being assigned in this
        # method — presumably Domain.__init__ provides it; confirm
        self.key = ("value", )
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not hanldle")
    self.label = coalesce(self.label, "name")

    # NOTE(review): this overwrites self.partitions, discarding the part
    # objects built in the basestring branch above — verify intended
    if isinstance(desc.partitions, list):
        self.partitions = desc.partitions.copy()
    else:
        Log.error("expecting a list of partitions")
def _tuple(template, data, fields, depth, output):
    """
    RECURSIVELY BUILD TUPLES FROM data, APPENDING RESULTS TO output.
    MIRRORS _select, BUT ACCUMULATES TUPLES INSTEAD OF RECORDS.
    """
    deep_path = None
    deep_fields = DictList()
    for d in data:
        # NOTE(review): template is NOT copied here (unlike _select, which
        # calls template.copy()) — tuples are immutable so this is presumably
        # safe; confirm
        record = template
        for f in fields:
            index, children, record = _tuple_deep(d, f, depth, record)
            if index:
                path = f.value[0:index:]
                deep_fields.append(f)
                if deep_path and path != deep_path:
                    # ALL DEEP FIELDS MUST DESCEND THE SAME BRANCH
                    Log.error("Dangerous to select into more than one branch at time")
        if not children:
            output.append(record)
        else:
            _tuple(record, children, deep_fields, depth + 1, output)

    return output
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": [],
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": _normalize_where(command.where, self)
            }
        },
        # NOTE(review): hard cap on matched documents
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        scripts.append("ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v) + ";\n")
    script = "".join(scripts)

    if results.hits.hits:
        # BULK BODY: action LINE FOLLOWED BY script LINE, PER DOCUMENT
        command = []
        for id in results.hits.hits._id:
            command.append({"update": {"_id": id}})
            command.append({"script": script})
        content = ("\n".join(convert.value2json(c) for c in command) + "\n").encode('utf-8')
        self._es.cluster._post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"})
def es_fieldop(es, query):
    """
    RUN A FIELD-RETRIEVAL QUERY (es09 STYLE) AND SHAPE HITS INTO ONE
    Matrix PER SELECT.
    NOTE(review): builds `matricies` but never returns it — confirm
    whether a return (or further processing) was lost
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = DictList()
    # NOTE(review): iterates select.value rather than select — confirm
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name)+[k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([{k: unwrap(t.fields).get(v, None) for k, v in s.value.items()} for t in T])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([tuple(unwrap(t.fields).get(ss, None) for ss in s.value) for t in T])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception, e:
                Log.error("", e)
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS

    RETURNS AN ITERABLE OF (coordinate_terms, sub_cube_or_value) PAIRS.
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # TURN A COORDINATE TUPLE INTO {edge_name: partition_key} LEAVES
        output = wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): calls group_by here but groupby above — confirm
            # both exist and agree
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = ((coord2term(coord), v) for coord, v in self.data[self.select.name].groupby(selector))
    else:
        output = ((coord2term(coord), Cube(self.select, remainder, v)) for coord, v in self.data[self.select.name].groupby(selector))

    return output
def select(data, field_name):
    """
    return list with values from field_name

    DISPATCHES ON THE CONTAINER TYPE (Cube, FlatList, UniqueIndex,
    Mapping, iterable) AND ON THE SHAPE OF field_name (mapping, string
    path, list of paths).
    """
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))

    if isinstance(data, FlatList):
        return data.select(field_name)

    if isinstance(data, UniqueIndex):
        data = data._data.values()  # THE SELECT ROUTINE REQUIRES dicts, NOT Dict WHILE ITERATING

    if isinstance(data, Mapping):
        return select_one(data, field_name)

    if isinstance(field_name, Mapping):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            return data

        if field_name.value:
            # SIMPLIFY {"value":value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        path = split_field(field_name)
        if len(path) == 1:
            return DictList([d[field_name] for d in data])
        else:
            output = DictList()
            flat_list._select1(data, path, 0, output)
            return output
    elif isinstance(field_name, list):
        keys = [_select_a_field(wrap(f)) for f in field_name]
        return _select(Dict(), unwrap(data), keys, 0)
    else:
        keys = [_select_a_field(field_name)]
        return _select(Dict(), unwrap(data), keys, 0)
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    NOTE(review): near-duplicate of the other _where_terms in this file —
    keep in sync.
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A KNOWN DIMENSION; PASS THROUGH AS A PLAIN terms FILTER
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOSITE KEY: EACH VALUE IS A MAPPING OF LOCAL FIELD -> VALUE
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # FALL BACK TO THE PARTITIONS' OWN esfilter DEFINITIONS
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def _select(template, data, fields, depth):
    """
    RECURSIVELY SELECT fields FROM EACH RECORD IN data, MERGING RESULTS
    INTO COPIES OF template.  WHEN A FIELD REACHES INTO A NESTED LIST
    (index RETURNED BY _select_deep), RECURSE ONE LEVEL DEEPER WITH ONLY
    THE FIELDS THAT NEED IT.
    NOTE: near-duplicate of the other _select in this file — keep in sync.
    """
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            # EXPECTING PLAIN dicts; Dict INSTANCES WOULD BREAK DEEP SELECTION
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    # ALL DEEP FIELDS MUST DESCEND THE SAME BRANCH
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def getDomain(self, **kwargs):
    """
    BUILD THE LIST OF PARTITIONS FOR THIS DIMENSION.
    kwargs.depth CONTROLS HOW FAR INTO SUB-PARTITIONS TO DESCEND.
    NOTE(review): no return statement is visible in this block — the
    partitions lists built here appear unused; confirm whether the tail
    of this method (wrapping partitions into a Domain) was lost
    """
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields)-1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = DictList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name":part.name,
                "value":part.value,
                "where":part.where,
                "style":coalesce(part.style, part.parent.style),
                "weight":part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name":v.name,
                "value":v.value,
                "where":v.where,
                "style":v.style,
                "weight":v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)]
    elif kwargs.depth == 1:
        partitions = DictList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    # FLATTEN ONE LEVEL: parent.child NAMING
                    partitions.append({
                        "name":join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value":subpart.value,
                        "where":subpart.where,
                        "style":coalesce(subpart.style, subpart.parent.style),
                        "weight":subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[getKey[i](p) for p in e.domain.partitions+([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        # NOTE(review): coord2term is not defined in this method (unlike the
        # sibling groupby) — presumably a module-level helper; confirm
        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by here vs groupby above — confirm
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (
                coord2term(coord),
                v
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (
                coord2term(coord),
                Cube(self.select, remainder, v)
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field":field_name, "sort":direction}

    RETURNS A DictList OF unwrapped ROWS IN SORTED ORDER.
    USES PYTHON 2 cmp-STYLE SORTING (sorted(..., cmp=...)).
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            # DEFAULT: SORT BY NATURAL VALUE ORDER
            return wrap(sorted(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            # APPLY EACH SORT TERM IN ORDER; FIRST NON-TIE DECIDES
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception, e:
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            output = DictList([unwrap(d) for d in sorted(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            output = DictList([unwrap(d) for d in sorted(list(data), cmp=comparer)])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
def wrap(v):
    """
    CONVERT PLAIN PYTHON VALUES TO THEIR WRAPPED EQUIVALENTS:
    dict -> Dict, None -> Null, list -> DictList, generator -> generator
    of wrapped items; ANYTHING ELSE IS RETURNED UNTOUCHED.
    """
    kind = _get(v, "__class__")
    if kind is dict:
        return Dict(v)
    if kind is NoneType:
        return Null
    if kind is list:
        return DictList(v)
    if kind is GeneratorType:
        # LAZILY WRAP EACH ITEM AS IT IS CONSUMED
        return (wrap(item) for item in v)
    return v
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    ACCEPTS A FIELD NAME, A COLUMN NUMBER, A {field_or_expr: direction}
    MAPPING, OR A {"value": ..., "sort": ...} STRUCT (OR A LIST OF ANY OF
    THESE); RETURNS A DictList OF {"value": expression, "sort": +/-1}.
    """
    if sort==None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            # BARE FIELD NAME: ASCENDING
            output.append({"value": jx_expression(s), "sort": 1})
        elif Math.is_integer(s):
            # COLUMN NUMBER: ASCENDING ON THAT OFFSET
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # {field: direction} MAPPING
            for v, d in s.items():
                # BUG FIX: was hardcoded "sort": -1, discarding the declared
                # direction d even though the branch condition validated it
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({"value": jx_expression(coalesce(s.value, s.field)), "sort": coalesce(sort_direction[s.sort], 1)})
    return output
def es_deepop(es, mvel, query):
    """
    RUN query AGAINST es USING AN MVEL script_field TERMS FACET (ES09 STYLE),
    THEN RE-ASSEMBLE THE FACET ROWS INTO A Cube OVER query.edges
    """
    FromES = es09.util.build_es_query(query)
    select = query.edges
    # TEMPORARY QUERY WITH EDGES MOVED INTO THE SELECT SO mvel CAN SCRIPT THEM
    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }
    data = es09.util.post(es, FromES, query.limit)
    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)  # COLUMN-MAJOR VIEW OF THE FACET ROWS
    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            # REGISTER EVERY OBSERVED KEY SO THE DOMAIN KNOWS ITS PARTS
            e.domain.getPartByKey(r)
        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        # NULL SLOT SITS AFTER THE REAL PARTITIONS
        e.domain.NULL.dataIndex = len(e.domain.partitions)
    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)
    # FILL CUBE
    for r in rows:
        # LAST ELEMENT OF EACH ROW IS THE AGGREGATE VALUE; THE REST ARE KEYS
        term_coord = [
            e.domain.getPartByKey(r[i]).dataIndex
            for i, e in enumerate(edges)
        ]
        output[term_coord] = SUM(output[term_coord], r[-1])
    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    RETURN LIST OF (keys, values) PAIRS, WHERE
    keys IS THE SET OF GROUPING VALUES
    values IS LIST OF ALL data THAT HAS THOSE keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN
    THE SELECTOR CHANGES
    """
    # SIZE-BASED GROUPING IS DELEGATED ENTIRELY
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)
    if isinstance(data, Cube):
        return data.groupby(keys)
    if not isinstance(keys, (tuple, list)):
        keys = (keys,)
    def get_keys(d):
        # PROJECT d DOWN TO JUST THE GROUPING KEYS
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output
    if contiguous:
        try:
            if not data:
                return wrap([])
            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    # KEY CHANGED: CLOSE THE CURRENT RUN AND START A NEW ONE
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            # CLOSE THE FINAL RUN
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
    # NOTE(review): when contiguous is False the function falls off the end and
    # returns None — the non-contiguous implementation appears truncated in
    # this copy; confirm against the original file
def wrap(v):
    """
    Promote a builtin value to its dynamic wrapper type.

    dict becomes Dict, None becomes Null, list becomes DictList, and a
    generator is wrapped element-by-element (lazily); everything else is
    returned as-is.
    """
    cls = _get(v, "__class__")
    if cls is dict:
        wrapped = Dict(v)
        return wrapped
    elif cls is list:
        return DictList(v)
    elif cls is NoneType:
        return Null
    elif cls is GeneratorType:
        return (wrap(element) for element in v)
    return v
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY
    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            # BARE FIELD NAME OR COLUMN OFFSET: ASCENDING
            output.append({"value": s, "sort": 1})
        elif list(set(s.values()))[0] == "desc" and not s.sort and not s.value:
            # {field: "desc"} FORM: EVERY ENTRY SORTS DESCENDING
            # NOTE(review): `list(set(...))[0]` raises IndexError on an empty
            # mapping and picks an arbitrary member when directions are mixed —
            # confirm inputs are single-direction mappings
            for v, d in s.items():
                output.append({"value": v, "sort": -1})
        else:
            # EXPLICIT STRUCT FORM; MISSING DIRECTION DEFAULTS TO ASCENDING
            output.append({"value": coalesce(s.value, s.field), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    ACCEPTS A FIELD NAME, A COLUMN OFFSET (INTEGER), A {field: direction}
    MAPPING, OR AN EXPLICIT {"value"|"field": ..., "sort": ...} STRUCT.
    RETURNS A DictList OF {"value": <expression>, "sort": <direction>} STRUCTS.
    """
    if not sort:
        return DictList.EMPTY
    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            # BARE FIELD NAME OR COLUMN OFFSET: ASCENDING
            output.append({"value": jx_expression(s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # {field: direction} FORM
            # BUGFIX: was hardcoded "sort": -1, ignoring the requested
            # direction d — an "asc" request was treated as descending
            # (compare the normalize_sort variants that use sort_direction[v])
            for v, d in s.items():
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            # EXPLICIT STRUCT FORM; MISSING DIRECTION DEFAULTS TO ASCENDING
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
def normalize_sort(sort=None):
    """
    Normalize any accepted sort specification into a wrapped list of
    {"value": field, "sort": direction} structs.
    """
    if not sort:
        return DictList.EMPTY
    normalized = DictList()
    for spec in listwrap(sort):
        if isinstance(spec, basestring) or Math.is_integer(spec):
            # bare field name (or column offset): default ascending
            normalized.append({"value": spec, "sort": 1})
            continue
        if not spec.field and not spec.value and spec.sort == None:
            #ASSUME {name: sort} FORM
            for field_name, direction in spec.items():
                normalized.append({"value": field_name, "sort": sort_direction[direction]})
            continue
        # explicit struct form; missing direction defaults to ascending
        normalized.append({
            "value": coalesce(spec.field, spec.value),
            "sort": coalesce(sort_direction[spec.sort], 1)
        })
    return wrap(normalized)
def normalize_sort(sort=None):
    """
    Convert a sort parameter (field name, column offset, {name: direction}
    mapping, or explicit struct) into a wrapped list of
    {"value": ..., "sort": ...} structs.
    """
    if not sort:
        return DictList.EMPTY
    result = DictList()
    for item in listwrap(sort):
        is_simple = isinstance(item, basestring) or Math.is_integer(item)
        if is_simple:
            # plain name or offset sorts ascending
            result.append({"value": item, "sort": 1})
        elif not item.field and not item.value and item.sort == None:
            #ASSUME {name: sort} FORM
            for name, direction in item.items():
                result.append({"value": name, "sort": sort_direction[direction]})
        else:
            # explicit struct; direction falls back to ascending
            result.append({"value": coalesce(item.field, item.value), "sort": coalesce(sort_direction[item.sort], 1)})
    return wrap(result)
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN

    RETURNS True/False WHEN FULLY DECIDED, OTHERWISE A RESIDUAL FILTER
    COVERING THE PARTS THAT COULD NOT BE RESOLVED AT THIS depth
    """
    if filter is TRUE_FILTER:
        return True
    if filter is FALSE_FILTER:
        return False
    filter = wrap(filter)
    if filter["and"]:
        # AND: DROP RESOLVED-TRUE CLAUSES, KEEP RESIDUALS, REMEMBER ANY False
        result = True
        output = DictList()
        for a in filter["and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        # OR: ANY True SHORT-CIRCUITS; OTHERWISE KEEP RESIDUALS
        output = DictList()
        for o in filter["or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        # EQUALITY PER COLUMN; UNRESOLVED COLUMNS BECOME A RESIDUAL term FILTER
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        # COMPARE TWO FIELDS; PARTIAL RESOLUTION DEGRADES TO A term FILTER
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]
        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        # MEMBERSHIP PER COLUMN
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        # RANGE BOUNDS PER COLUMN
        # NOTE(review): only "gt" accepts the symbolic alias ">" — the other
        # bounds have no symbol aliases here; confirm intended
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        # ACCEPTS BOTH {"missing": field} AND {"missing": {"field": ...}}
        if isinstance(filter.missing, basestring):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        # ACCEPTS BOTH {"exists": field} AND {"exists": {"field": ...}}
        if isinstance(filter["exists"], basestring):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})
def _grouped(self, query, stacked=False): select = listwrap(query.select) # RETURN SINGLE OBJECT WITH AGGREGATES for s in select: if s.aggregate not in aggregates: Log.error("Expecting all columns to have an aggregate: {{select}}", select=s) selects = DictList() groups = DictList() edges = query.edges for e in edges: if e.domain.type != "default": Log.error("domain of type {{type}} not supported, yet", type=e.domain.type) groups.append(e.value) selects.append(e.value + " AS " + self.db.quote_column(e.name)) for s in select: selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name)) sql = expand_template(""" SELECT {{selects}} FROM {{table}} {{where}} GROUP BY {{groups}} """, { "selects": SQL(",\n".join(selects)), "groups": SQL(",\n".join(groups)), "table": self._subquery(query["from"])[0], "where": self._where2sql(query.where) }) def post_stacked(sql): # RETURN IN THE USUAL DATABASE RESULT SET FORMAT return self.db.query(sql) def post(sql): # FIND OUT THE default DOMAIN SIZES result = self.db.column_query(sql) num_edges = len(edges) for e, edge in enumerate(edges): domain = edge.domain if domain.type == "default": domain.type = "set" parts = set(result[e]) domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)] domain.map = {p: i for i, p in enumerate(parts)} else: Log.error("Do not know what to do here, yet") # FILL THE DATA CUBE maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)] cubes = DictList() for c, s in enumerate(select): data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges]) for rownum, value in enumerate(result[c + num_edges]): coord = [m[r[rownum]] for m, r in maps] data[coord] = value cubes.append(data) if isinstance(query.select, list): return cubes else: return cubes[0] return sql, post if not stacked else post_stacked
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION

    TRANSLATE query INTO A PLAIN SELECT STATEMENT; RETURNS A
    (sql, post_processor) PAIR. THE POST PROCESSOR RESHAPES THE FLAT
    DATABASE COLUMNS BACK INTO THE Mapping/TUPLE SHAPES THE SELECT ASKED FOR.
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = DictList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                # EACH LEAF BECOMES A "<name>.<key>" COLUMN
                # BUGFIX: was `for k, v in s.value.items:` (missing call —
                # iterated the bound method, not the pairs); also this branch
                # previously fell through to the final `else` and attempted
                # Mapping + str concatenation
                for k, v in s.value.items():
                    selects.append(v + " AS " + self.db.quote_column(s.name + "." + k))
            elif isinstance(s.value, list):
                # TUPLE SELECT: ONE "<name>,<i>" COLUMN PER MEMBER
                # BUGFIX: was appending the whole list `s.value` instead of
                # the member expression `ss`
                for i, ss in enumerate(s.value):
                    selects.append(ss + " AS " + self.db.quote_column(s.name + "," + str(i)))
            else:
                selects.append(s.value + " AS " + self.db.quote_column(s.name))
        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })
        def post_process(sql):
            # REASSEMBLE Mapping AND TUPLE SELECTS FROM THEIR FLAT COLUMNS
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    # BUGFIX: was `for k, v in s.value:` which iterates keys
                    # only and fails to unpack
                    for r in result:
                        r[s.name] = {}
                        for k, v in s.value.items():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None
                elif isinstance(s.value, list):
                    # REWRITE AS TUPLE
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None
            expand_json(result)
            return result
        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = query.select.value + " AS " + self.db.quote_column(name)
        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })
        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result
            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
class SetDomain(Domain):
    # DOMAIN OVER AN EXPLICIT SET OF PARTITIONS; map TRANSLATES KEY -> PART,
    # order TRANSLATES KEY -> dataIndex
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        desc = wrap(desc)
        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = DictList()
        if isinstance(self.key, set):
            Log.error("problem")
        if isinstance(desc.partitions[0], (int, float, basestring)):
            # ASSUME PARTS ARE PRIMITIVES, CONVERT TO REAL PART OBJECTS
            # NOTE(review): self.map is never initialized before the
            # `self.map[p] = part` below in this branch — confirm Domain
            # (or a subclass) supplies it, otherwise this raises
            self.key = "value"
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
        elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            # MULTI-FIELD DIMENSION: INDEX PARTS BY THE FIELD TUPLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif desc.key == None:
            Log.error("Domains must have keys")
        elif self.key:
            # SINGLE-VALUED KEY: PLAIN DICT LOOKUP, NULL SLOT AT THE END
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")
        self.label = coalesce(self.label, "name")

    def compare(self, a, b):
        # COMPARE PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # RETURN dataIndex FOR key; UNKNOWN KEYS MAP TO THE NULL SLOT
        # (len(self.partitions))
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception, e:
            Log.error("problem", e)
def __init__(self, **desc):
    """
    BUILD A set-TYPE DOMAIN FROM desc; SUPPORTS PRIMITIVE PARTS, MULTI-FIELD
    DIMENSIONS, COMPOSITE KEYS, where/esfilter-DEFINED PARTS, AND AN
    IMPLICIT "value" KEY
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)
    self.type = "set"
    self.order = {}          # KEY -> dataIndex (None KEY IS THE NULL SLOT)
    self.NULL = Null
    self.partitions = DictList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET
    if isinstance(self.key, set):
        Log.error("problem")
    if not desc.key and (len(desc.partitions)==0 or isinstance(desc.partitions[0], (basestring, Number, tuple))):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return
    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # MULTI-FIELD DIMENSION: INDEX PARTS BY THE FIELD TUPLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        # NOTE(review): `and` binds tighter than `or`, so an empty partitions
        # list with a vacuously-true all(esfilter) enters this branch —
        # confirm whether `desc.partitions and (all(...) or all(...))` was meant
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            # EVERY PART IS DEFINED BY A where/esfilter CLAUSE; KEY ON name
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            from pyLibrary.queries.expressions import jx_expression
            self.key = "name"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.partitions.append({
                    "where": jx_expression(coalesce(p.where, p.esfilter)),
                    "name": p.name,
                    "dataIndex": i
                })
                self.map[p.name] = p
                self.order[p.name] = i
            return
        elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        # EXPLICIT SINGLE-VALUED KEY
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    else:
        Log.error("Can not hanldle")
    self.label = coalesce(self.label, "name")
    if hasattr(desc.partitions, "__iter__"):
        self.partitions = wrap(list(desc.partitions))
    else:
        Log.error("expecting a list of partitions")
def __init__(self, middle=None, *args, **kwargs):
    """
    Initialize with an optional middle value and an empty sample list.
    Extra positional/keyword arguments are accepted but not used here.
    """
    object.__init__(self)
    self.samples = DictList()  # collected samples, starts empty
    self.middle = middle
def parse_properties(parent_index_name, parent_query_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    RECURSES INTO nested AND object PROPERTIES; query_path IS THE DOTTED
    PATH FROM THE PARENT
    """
    from pyLibrary.queries.meta import Column
    columns = DictList()
    for name, property in esProperties.items():
        # EXTEND THE PARENT PATH WITH THIS PROPERTY'S NAME
        if parent_query_path:
            index_name, query_path = parent_index_name, join_field(split_field(parent_query_path) + [name])
        else:
            index_name, query_path = parent_index_name, name
        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, query_path, property.properties)
            for c in self_columns:
                c.nested_path = unwraplist([query_path] + listwrap(c.nested_path))
            columns.extend(self_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="nested",
                nested_path=query_path
            ))
            continue
        if property.properties:
            # PLAIN object PROPERTY: RECURSE, THEN ADD A COLUMN FOR THE OBJECT ITSELF
            child_columns = parse_properties(index_name, query_path, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled == False else "object"
            ))
        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT SUB-FIELD KEEPS THE PLAIN PATH
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path,
                        es_column=query_path,
                        type=p.type
                    ))
                else:
                    # OTHER SUB-FIELDS GET AN ESCAPED-DOT SUFFIX
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path + "\\." + n,
                        es_column=query_path + "\\." + n,
                        type=p.type
                    ))
            continue
        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                # ALIASED INDEX NAME: REGISTER A SECOND COLUMN ENTRY
                columns.append(Column(
                    table=index_name,
                    es_index=index_name,
                    es_column=query_path,
                    name=query_path,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            # DISABLED/UNTYPED PROPERTY: TREAT AS OPAQUE source/object
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled==False else "object"
            ))
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)
    return columns
def decode(json):
    """
    THIS IS CURRENTLY 50% SLOWER THAN PyPy DEFAULT IMPLEMENTATION

    THE INTENT IS TO NEVER ACTUALLY PARSE ARRAYS OF PRIMITIVE VALUES, RATHER
    FIND THE START AND END OF THOSE ARRAYS AND SIMPLY STRING COPY THEM TO THE
    INEVITABLE JSON OUTPUT

    STATE MACHINE WITH THREE MODES: ARRAY (inside a list), OBJECT (expecting
    a key), VALUE (after a colon, expecting the value)
    """
    var = ""              # MOST RECENTLY PARSED OBJECT KEY
    curr = DictList()     # CONTAINER CURRENTLY BEING FILLED
    mode = ARRAY
    stack = DictList()    # ENCLOSING CONTAINERS
    # FIRST PASS SIMPLY GETS STRUCTURE
    i = 0
    while i < len(json):
        c = json[i]
        i += 1
        if mode == ARRAY:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == "]":
                # CLOSE ARRAY: RESUME FILLING THE PARENT CONTAINER
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                # TRY TO SKIP A PRIMITIVE ARRAY WHOLESALE; arr IS None WHEN
                # THE ARRAY IS NOT PRIMITIVE AND MUST BE DESCENDED INTO
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr.append(arr)
                    curr = arr
                    mode = ARRAY
                else:
                    curr.append(arr)
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr.append(obj)
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                # NOTE(review): `curr.children.append(val)` — every other
                # branch appends to `curr` directly; confirm `children` is a
                # real attribute here and not a typo for `curr.append(val)`
                curr.children.append(val)
            else:
                # NOTE(review): the parsed constant is never appended to the
                # array — confirm this omission is intentional
                i, val = parse_const(i, json)
        elif mode == OBJECT:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == ":":
                mode = VALUE
            elif c == "}":
                # CLOSE OBJECT: RESUME FILLING THE PARENT CONTAINER
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "\"":
                # PARSE THE KEY; VALUE FOLLOWS AFTER ":"
                i, var = fast_parse_string(i, json)
        elif mode == VALUE:
            if c in [" ", "\t", "\n", "\r"]:
                pass
            elif c == "}":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr[var] = arr
                    curr = arr
                    mode = ARRAY
                else:
                    curr[var] = arr
                    mode = OBJECT
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr[var] = obj
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                curr[var] = val
                mode = OBJECT
            else:
                i, val = parse_const(i, json)
                curr[var] = val
                mode = OBJECT
    # THE TOP-LEVEL DictList HOLDS EXACTLY ONE PARSED DOCUMENT
    return curr[0]
def es_terms_stats(esq, mvel, query):
    """
    RUN query USING ES09 terms_stats FACETS: ONE FACET PER (select, facet-edge
    part combination), WITH AT MOST ONE OPEN-ENDED ("special") EDGE ENCODED
    INTO THE FACET KEY FIELD; RE-ASSEMBLE RESULTS INTO A Cube
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1
    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT
    # QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p
            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)
    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i
        # NOTE(review): if no candidate matched, special_index stays -1 and
        # pop(-1) removes the LAST facet edge while specialEdge is still None
        # — confirm this case cannot occur
        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)
    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges)*len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })
        esFacets = []
        def add_facet(value, parts, cube):
            # KEEP ONLY THE PART COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)
        counts["count"].forall(add_facet)
        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count= len(esFacets), theory_count=total_facets)
        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)
    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts
    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")
        # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass
    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            # FACET NAME ENCODES THE SELECT NAME AND EACH FACET-EDGE dataIndex
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)
            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    # STATS OVER A RAW FIELD WHEN POSSIBLE, OTHERWISE A SCRIPT
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})
    data = es09.util.post(esq.es, FromES, query.limit)
    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # NOTE(review): `map[stats]` on a plain dict raises KeyError
                # for unseen keys — confirm this relies on a tolerant dict
                # type, otherwise `stats not in map` was intended
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part
        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p
        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions
    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)
    name2agg = {s.name: aggregates[s.aggregate] for s in select}
    # FILL CUBE
    for edgeName, parts in data.facets.items():
        # FACET NAME DECODES BACK INTO (select name, facet-edge coordinates)
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]
        for stats in parts.terms:
            if specialEdge:
                # INSERT THE SPECIAL EDGE'S COORDINATE AT ITS ORIGINAL POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index]+(special.dataIndex, )+pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]
    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    RETURNS AN {"and": [...]} ES FILTER COVERING EVERY term ENTRY
    """
    output = DictList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # {local_field: es_field} MAPPING: MATCH EACH LEAF,
                    # None LEAVES BECOME missing FILTERS
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue
                if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue
                if AND(is_keyword(f) for f in dimension.fields):
                    # EXPECTING A TUPLE, ONE VALUE PER FIELD
                    if not isinstance(v, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue
            # NOTE(review): this single-field check repeats the one above, but
            # the earlier one always `continue`s when it applies — this copy is
            # only reachable when dimension.fields is falsy, where
            # dimension.fields[0] would fail; confirm it is dead code
            if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                if domain.getPartByKey(v) is domain.NULL:
                    output.append({"missing": {"field": dimension.fields[0]}})
                else:
                    output.append({"term": {dimension.fields[0]: v}})
                continue
            if domain.partitions:
                # FALL BACK TO THE PARTITION'S OWN esfilter
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # NESTED TERM: RECURSE WITH THE EXTENDED PATH
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue
        # NOT A SCHEMA DIMENSION: PASS THROUGH AS A PLAIN term FILTER
        output.append({"term": {k: v}})
    return {"and": output}