def _normalize_range(range):
    """Normalize a range clause into Data with expression min/max (mode is passed through)."""
    if range == None:
        return None
    lo = None if range.min == None else jx_expression(range.min)
    hi = None if range.max == None else jx_expression(range.max)
    return Data(min=lo, max=hi, mode=range.mode)
def _normalize_edge(edge, schema=None):
    """
    Normalize an edge clause to Data(name, value, allowNulls, domain).

    :param edge: edge clause — a string variable name (resolved against schema when given)
    :param schema: optional schema used to resolve the edge to a column/dimension
    """
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")
    elif isinstance(edge, basestring):
        if schema:
            try:
                e = schema[edge]
            except Exception:  # FIX: was `except Exception, _` — a SyntaxError on Python 3
                e = None
            e = unwrap(unwraplist(e))
            if e and not isinstance(e, (_Column, set, list)):
                # NOTE(review): this isinstance(e, _Column) branch is unreachable;
                # the guard above already excludes _Column instances
                if isinstance(e, _Column):
                    return Data(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(domain=e, schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Data(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Data(name=e.name, allowNulls=True, domain=e.getDomain())
        return Data(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if sort == None:
        return FlatList.EMPTY

    output = FlatList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            # an integer refers to a column by position
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # a mapping of {field: direction} pairs
            for v, d in s.items():
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
def _normalize_group(edge, schema=None):
    """Normalize a groupby clause; only the "default" domain is permitted."""
    if isinstance(edge, basestring):
        return wrap({
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "domain": {"type": "default"}
        })

    edge = wrap(edge)
    if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
        Log.error("groupby does not accept complicated domains")
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound edges: {{edge}}", edge=edge)

    return wrap({
        "name": coalesce(edge.name, edge.value),
        "value": jx_expression(edge.value),
        "allowNulls": True,
        "domain": {"type": "default"}
    })
def _normalize_group(edge, dim_index, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, basestring):
        if edge.endswith(".*"):
            prefix = edge[:-1]
            if schema:
                # expand the prefix to one groupby per matching leaf column
                output = wrap([
                    {
                        "name": literal_field(k),
                        "value": jx_expression(k),
                        "allowNulls": True,
                        "dim": dim_index,  # FIX: was missing; every other branch records dim
                        "domain": {"type": "default"}
                    }
                    for k, cs in schema.items()
                    if k.startswith(prefix)
                    for c in cs
                    if c.type not in STRUCT
                ])
                return output
            else:
                return wrap([{
                    "name": edge[:-2],
                    "value": jx_expression(edge[:-2]),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)
        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def _normalize_range(range):
    """Normalize a range clause; min/max become expressions (or None)."""
    if range == None:
        return None
    minimum = None if range.min == None else jx_expression(range.min)
    maximum = None if range.max == None else jx_expression(range.max)
    return Dict(min=minimum, max=maximum)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": jx_expression(command.where).to_esfilter()
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for field, term in command.set.items():
        if not is_keyword(field):
            Log.error("Only support simple paths for now")
        if isinstance(term, Mapping) and term.doc:
            scripts.append({"doc": term.doc})
        else:
            scripts.append({"script": "ctx._source." + field + " = " + jx_expression(term).to_ruby()})

    if results.hits.hits:
        updates = []
        for hit in results.hits.hits:
            for script in scripts:
                updates.append({"update": {
                    "_id": hit._id,
                    "_routing": unwraplist(hit.fields[literal_field(schema._routing.path)])
                }})
                updates.append(script)
        content = ("\n".join(convert.value2json(u) for u in updates) + "\n").encode("utf-8")
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"consistency": self.settings.consistency}
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)]
            )
def _normalize_group(edge, dim_index, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, basestring):
        if edge.endswith(".*"):
            prefix = edge[:-1]
            if schema:
                # one groupby per leaf column matching the prefix
                output = wrap([
                    {
                        "name": literal_field(k),
                        "value": jx_expression(k),
                        "allowNulls": True,
                        "dim": dim_index,  # FIX: was missing here; all other branches record dim
                        "domain": {"type": "default"}
                    }
                    for k, cs in schema.items()
                    if k.startswith(prefix)
                    for c in cs
                    if c.type not in STRUCT
                ])
                return output
            else:
                return wrap([{
                    "name": edge[:-2],
                    "value": jx_expression(edge[:-2]),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)
        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        select = Data(value=select)
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".")
        else:
            return output
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            # leaves of a prefix
            output.name = coalesce(select.name, select.value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(select.value[:-2]))
        elif select.value == ".":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = jx_expression(select.value)
        elif select.value == "*":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = LeavesOp("leaves", Variable("."))
        else:
            output.name = coalesce(select.name, select.value, select.aggregate)
            output.value = jx_expression(select.value)
    elif isinstance(select.value, (int, float)):
        if not output.name:
            output.name = unicode(select.value)
        output.value = jx_expression(select.value)
    else:
        output.value = jx_expression(select.value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def _normalize_edge(edge, schema=None):
    """Normalize an edge clause to Dict(name, value, allowNulls, domain)."""
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(name=e.name, allowNulls=True, domain=e.getDomain())
        return Dict(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=jx_expression(edge.value),
            range=_normalize_range(edge.range),
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS
    """
    if not _Column:
        _late_import()

    select = Data(value=select) if isinstance(select, basestring) else wrap(select)
    output = select.copy()

    value = select.value
    if not value:
        output.name = coalesce(select.name, select.aggregate)
        if not output.name:
            return output
        output.value = jx_expression(".")
    elif isinstance(value, basestring):
        if value.endswith(".*"):
            output.name = coalesce(select.name, value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(value[:-2]))
        elif value == ".":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = jx_expression(value)
        elif value == "*":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = LeavesOp("leaves", Variable("."))
        else:
            output.name = coalesce(select.name, value, select.aggregate)
            output.value = jx_expression(value)
    elif isinstance(value, (int, float)):
        if not output.name:
            output.name = unicode(value)
        output.value = jx_expression(value)
    else:
        output.value = jx_expression(value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def edges_get_all_vars(e):
    """Collect every variable name referenced by edge ``e`` (value, domain, range, partitions)."""
    found = set()
    if isinstance(e.value, basestring):
        found.add(e.value)
    if e.domain.key:
        found.add(e.domain.key)
    if e.domain.where:
        found |= jx_expression(e.domain.where).vars()
    if e.range:
        found |= jx_expression(e.range.min).vars()
        found |= jx_expression(e.range.max).vars()
    if e.domain.partitions:
        for p in e.domain.partitions:
            if p.where:
                found |= p.where.vars()
    return found
def where(self, filter):
    """
    WILL NOT PULL WHOLE OBJECT, JUST TOP-LEVEL PROPERTIES
    :param filter: jx_expression filter
    :return: list of objects that match
    """
    select = []
    column_names = []
    for cname, cs in self.columns.items():
        cs = [c for c in cs if c.type not in ["nested", "object"] and not c.nested_path]
        if len(cs) == 0:
            continue
        column_names.append(cname)
        # FIX: original read `c` after the list comprehension, relying on the
        # Python 2 comprehension scope leak (NameError on Python 3); the leaked
        # value was the last element of the filtered list — bind it explicitly.
        c = cs[-1]
        if len(cs) == 1:
            select.append(quote_table(c.es_column) + " " + quote_table(c.name))
        else:
            select.append(
                "coalesce(" + ",".join(quote_table(col.es_column) for col in cs) + ") " + quote_table(c.name)
            )
    result = self.db.query(
        " SELECT " + "\n,".join(select) +
        " FROM " + quote_table(self.name) +
        " WHERE " + jx_expression(filter).to_sql()
    )
    return wrap([{c: v for c, v in zip(column_names, r)} for r in result.data])
def test_eq2(self):
    """An `eq` over two fields becomes two `term` filters joined by and/must."""
    where = {"eq": {"a": 1, "b": 2}}
    result = simplify_esfilter(jx_expression(where).to_esfilter())
    expected_terms = [{"term": {"a": 1}}, {"term": {"b": 2}}]
    if USE_BOOL_MUST:
        self.assertEqual(result, {"bool": {"must": expected_terms}})
    else:
        self.assertEqual(result, {"and": expected_terms})
def _groupby_op(self, query):
    """Build a GROUP BY SQL statement for `query`."""
    selects = []
    for s in listwrap(query.select):
        if s.value == "." and s.aggregate == "count":
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate] + "(" + jx_expression(s.value).to_sql() + ") AS " + quote_table(s.name))
    for w in query.window:
        selects.append(self._window_op(self, query, w))

    agg = " FROM " + quote_table(self.name) + " a\n"

    # assemble GROUP BY clause from the edges (empty when there are none)
    groupby = ""
    groupby_prefix = " GROUP BY "
    for i, e in enumerate(query.edges):
        groupby += groupby_prefix + e.to_sql()
        groupby_prefix = ",\n"

    where = "\nWHERE " + query.where.to_sql()
    return "SELECT " + (",\n".join(selects)) + agg + where + groupby
def where(self, filter):
    """
    WILL NOT PULL WHOLE OBJECT, JUST TOP-LEVEL PROPERTIES
    :param filter: jx_expression filter
    :return: list of objects that match
    """
    select = []
    column_names = []
    for cname, cs in self.columns.items():
        cs = [c for c in cs if c.type not in ["nested", "object"] and not c.nested_path]
        if len(cs) == 0:
            continue
        column_names.append(cname)
        # FIX: original used `c` after the list comprehension, which only works via
        # the Python 2 scope leak (NameError on Python 3); the leaked value was the
        # last element of the filtered list — bind it explicitly.
        c = cs[-1]
        if len(cs) == 1:
            select.append(quote_table(c.es_column) + " " + quote_table(c.name))
        else:
            select.append(
                "coalesce(" + ",".join(quote_table(col.es_column) for col in cs) + ") " + quote_table(c.name)
            )
    result = self.db.query(
        " SELECT " + "\n,".join(select) +
        " FROM " + quote_table(self.name) +
        " WHERE " + jx_expression(filter).to_sql()
    )
    return wrap([{c: v for c, v in zip(column_names, r)} for r in result.data])
def es_fieldop(es, query):
    """Simple field-fetch query against ES; returns a Cube of the selected fields."""
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {"filtered": {
        "query": {"match_all": {}},
        "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }}
    FromES.size = coalesce(query.limit, 200000)

    # accumulate the field list; "*" means pull whole _source
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # rename fields via the {new_name: old_name} mapping
            matricies[s.name] = Matrix.wrap([
                {k: unwrap(t.fields).get(v, None) for k, v in s.value.items()}
                for t in T
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(unwrap(t.fields).get(ss, None) for ss in s.value)
                for t in T
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
             keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
             values IS GENERATOR OF ALL VALUE THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    # size parameters delegate to fixed-size chunking instead
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__
        accessor = jx_expression_to_function(jx_expression({"tuple": keys}))

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Dict(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Dict(group), data[start::]

        return _output()
    except Exception as e:  # FIX: was `except Exception, e` — a SyntaxError on Python 3
        Log.error("Problem grouping", cause=e)
def test_range_packing1(self):
    """Two comparisons on one field pack into a single `range` filter."""
    where = {"and": [
        {"gt": {"a": 20}},
        {"lt": {"a": 40}}
    ]}
    packed = simplify_esfilter(jx_expression(where).to_esfilter())
    self.assertEqual(packed, {"range": {"a": {"gt": 20, "lt": 40}}})
def test_range_packing2(self):
    """Range packing works on dotted field names and unix-time bounds."""
    where = {"and": [
        {"gte": {"build.date": 1429747200}},
        {"lt": {"build.date": 1429920000}}
    ]}
    packed = simplify_esfilter(jx_expression(where).to_esfilter())
    self.assertEqual(
        packed,
        {"range": {"build.date": {"gte": Date("23 APR 2015").unix, "lt": Date("25 APR 2015").unix}}}
    )
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": jx_expression(command.where).to_esfilter()
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if not results.hits.hits:
        return

    updates = []
    for h in results.hits.hits:
        for s in scripts:
            updates.append({"update": {
                "_id": h._id,
                "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
            }})
            updates.append(s)
    content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
    response = self._es.cluster.post(
        self._es.path + "/_bulk",
        data=content,
        headers={"Content-Type": "application/json"},
        timeout=self.settings.timeout,
        params={"consistency": self.settings.consistency}
    )
    if response.errors:
        Log.error(
            "could not update: {{error}}",
            error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)]
        )
def test_value_not_a_variable(self):
    """The literal on the right side of `eq` must not be counted as a variable."""
    expr = jx_expression({"eq": {"result.test": "/XMLHttpRequest/send-entity-body-document.htm"}})
    result = expr.vars()
    expected = {"result.test"}
    self.assertEqual(result, expected, "expecting the one and only variable")
def wrap(query, schema=None):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON
    """
    if isinstance(query, QueryOp) or query == None:
        return query

    query = wrap(query)

    output = QueryOp("from", None)
    output.format = query.format
    output.frum = wrap_from(query["from"], schema=schema)
    # derive a schema from the source when the caller gave none
    if not schema and isinstance(output.frum, Schema):
        schema = output.frum
    if not schema and hasattr(output.frum, "schema"):
        schema = output.frum.schema

    if query.select or isinstance(query.select, (Mapping, list)):
        output.select = _normalize_selects(query.select, query.frum, schema=schema)
    elif query.edges or query.groupby:
        # aggregation with no select implies a count
        output.select = Data(name="count", value=jx_expression("."), aggregate="count", default=0)
    else:
        output.select = _normalize_selects(".", query.frum)

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = _normalize_edges(query.edges, schema=schema)
        output.groupby = Null
    elif query.groupby:
        output.edges = Null
        output.groupby = _normalize_groupby(query.groupby, schema=schema)
    else:
        output.edges = Null
        output.groupby = Null

    output.where = _normalize_where(query.where, schema=schema)
    output.window = [_normalize_window(w) for w in listwrap(query.window)]
    output.having = None
    output.sort = _normalize_sort(query.sort)
    output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    if not Math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    output.isLean = query.isLean
    return output
def _normalize_group(edge, schema=None):
    """Normalize a groupby clause; only the "default" domain is allowed."""
    if isinstance(edge, basestring):
        return wrap({
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "domain": {"type": "default"}
        })

    edge = wrap(edge)
    if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
        Log.error("groupby does not accept complicated domains")
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound edges: {{edge}}", edge=edge)

    return wrap({
        "name": coalesce(edge.name, edge.value),
        "value": jx_expression(edge.value),
        "allowNulls": True,
        "domain": {"type": "default"}
    })
def where(self, where):
    """
    Filter self.data with `where`.

    :param where: a Mapping (jx expression), an Expression, or a callable predicate
    :return: ListContainer of matching rows
    """
    temp = None
    if isinstance(where, Mapping):
        # FIX: on Python 3 exec() can not rebind a function-local variable, so the
        # generated `temp` was lost and filter(None, ...) filtered by truthiness
        # instead; compile into an explicit namespace and pull the function out.
        # SECURITY NOTE: this executes generated code; `where` must be trusted.
        _namespace = {}
        exec("def temp(row):\n return "+jx_expression(where).to_python(), globals(), _namespace)
        temp = _namespace["temp"]
    elif isinstance(where, Expression):
        temp = compile_expression(where.to_python())
    else:
        temp = where

    return ListContainer("from "+self.name, filter(temp, self.data), self.schema)
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
             keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
             values IS GENERATOR OF ALL VALUE THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    # size parameters delegate to fixed-size chunking instead
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"value": jx_expression(s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # a mapping of {field: direction} pairs
            for v, d in s.items():
                # FIX: was hard-coded -1; honor the direction requested for each field
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
def where(self, where):
    """
    Filter self.data with `where`.

    :param where: a Mapping (jx expression), an Expression, or a callable predicate
    :return: ListContainer of matching rows
    """
    temp = None
    if isinstance(where, Mapping):
        # FIX: Python 3 exec() can not rebind a function-local name, so the
        # generated `temp` never replaced None and filter(None, ...) silently
        # filtered by truthiness; compile into an explicit namespace instead.
        # SECURITY NOTE: this executes generated code; `where` must be trusted.
        _namespace = {}
        exec("def temp(row):\n return " + jx_expression(where).to_python(), globals(), _namespace)
        temp = _namespace["temp"]
    elif isinstance(where, Expression):
        temp = compile_expression(where.to_python())
    else:
        temp = where

    return ListContainer("from " + self.name, filter(temp, self.data), self.schema)
def _normalize_edge(edge, schema=None):
    """
    Normalize an edge clause to Data(name, value, allowNulls, domain).

    :param edge: edge clause — a string variable name (resolved against schema when given)
    :param schema: optional schema used to resolve the edge to a column/dimension
    """
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")
    elif isinstance(edge, basestring):
        if schema:
            try:
                e = schema[edge]
            except Exception:  # FIX: was `except Exception, e` — a SyntaxError on Python 3
                e = None
            e = unwraplist(e)
            if e and not isinstance(e, (_Column, set, list)):
                # NOTE(review): this isinstance(e, _Column) branch is unreachable;
                # _Column instances were excluded by the guard above
                if isinstance(e, _Column):
                    return Data(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(domain=e, schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Data(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Data(
                        name=e.name,
                        allowNulls=True,
                        domain=e.getDomain()
                    )
        return Data(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
def _normalize_group(edge, schema=None):
    """Normalize a groupby clause; a trailing ".*" expands to all matching leaf columns."""
    if isinstance(edge, basestring):
        if edge.endswith(".*"):
            prefix = edge[:-1]
            return wrap([
                {
                    "name": literal_field(k),
                    "value": jx_expression(c.es_column),
                    "allowNulls": True,
                    "domain": {"type": "default"}
                }
                for k, cs in schema.lookup.items()
                if k.startswith(prefix)
                for c in cs
                if c.type not in STRUCT
            ])
        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "domain": {"type": "default"}
        }])

    edge = wrap(edge)
    if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
        Log.error("groupby does not accept complicated domains")
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound edges: {{edge}}", edge=edge)

    return wrap([{
        "name": coalesce(edge.name, edge.value),
        "value": jx_expression(edge.value),
        "allowNulls": True,
        "domain": {"type": "default"}
    }])
def _normalize_edge(edge, schema=None):
    """Normalize an edge clause to Dict(name, value, allowNulls, domain)."""
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                if isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                return Dict(name=e.name, allowNulls=True, domain=e.getDomain())
        return Dict(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )

    edge = wrap(edge)
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

    if isinstance(edge.value, (list, set)) and not edge.domain:
        # COMPLEX EDGE IS SHORT HAND
        domain = _normalize_domain(schema=schema)
        domain.dimension = Dict(fields=edge.value)
        return Dict(
            name=edge.name,
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )

    domain = _normalize_domain(edge.domain, schema=schema)
    return Dict(
        name=coalesce(edge.name, edge.value),
        value=jx_expression(edge.value),
        range=_normalize_range(edge.range),
        allowNulls=bool(coalesce(edge.allowNulls, True)),
        domain=domain
    )
def _normalize_window(window, schema=None):
    """Normalize a window clause; a value that fails to parse becomes a raw script."""
    v = window.value
    try:
        expr = jx_expression(v)
    except Exception:
        expr = ScriptOp("script", v)

    return Dict(
        name=coalesce(window.name, window.value),
        value=expr,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        sort=_normalize_sort(window.sort),
        aggregate=window.aggregate,
        range=_normalize_range(window.range),
        where=_normalize_where(window.where, schema=schema)
    )
def es_fieldop(es, query):
    """Simple field-fetch query against ES; returns a Cube of the selected fields."""
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {"filtered": {
        "query": {"match_all": {}},
        "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }}
    FromES.size = coalesce(query.limit, 200000)

    # accumulate the field list; "*" means pull whole _source
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # rename fields via the {new_name: old_name} mapping
            matricies[s.name] = Matrix.wrap([
                {k: unwrap(t.fields).get(v, None) for k, v in s.value.items()}
                for t in T
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(unwrap(t.fields).get(ss, None) for ss in s.value)
                for t in T
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def wrap(query, schema=None):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON
    """
    if isinstance(query, QueryOp) or query == None:
        return query

    query = wrap(query)

    output = QueryOp("from", None)
    output.format = query.format
    output.frum = wrap_from(query["from"], schema=schema)
    # derive a schema from the source when the caller gave none
    if not schema and isinstance(output.frum, Schema):
        schema = output.frum
    if not schema and hasattr(output.frum, "schema"):
        schema = output.frum.schema

    if query.select or isinstance(query.select, (Mapping, list)):
        output.select = _normalize_selects(query.select, query.frum, schema=schema)
    elif query.edges or query.groupby:
        # aggregation with no select implies a count
        output.select = Data(name="count", value=jx_expression("."), aggregate="count", default=0)
    else:
        output.select = _normalize_selects(".", query.frum)

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = _normalize_edges(query.edges, schema=schema)
        output.groupby = Null
    elif query.groupby:
        output.edges = Null
        output.groupby = _normalize_groupby(query.groupby, schema=schema)
    else:
        output.edges = Null
        output.groupby = Null

    output.where = _normalize_where(query.where, schema=schema)
    output.window = [_normalize_window(w) for w in listwrap(query.window)]
    output.having = None
    output.sort = _normalize_sort(query.sort)
    output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    if not Math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    output.isLean = query.isLean
    return output
def _normalize_window(window, schema=None):
    """Normalize a window clause; a value that fails to parse becomes a raw script."""
    v = window.value
    try:
        expr = jx_expression(v)
    except Exception:
        expr = ScriptOp("script", v)

    return Dict(
        name=coalesce(window.name, window.value),
        value=expr,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        sort=_normalize_sort(window.sort),
        aggregate=window.aggregate,
        range=_normalize_range(window.range),
        where=_normalize_where(window.where, schema=schema)
    )
def es_deepop(es, mvel, query):
    """MVEL facet query against ES; returns a Cube of the aggregated edges."""
    FromES = es09.util.build_es_query(query)
    select = query.edges

    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)
        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def _edges_op(self, query):
    """Build an edges SQL statement: LEFT JOIN of DISTINCT edge values, grouped."""
    selects = []
    for s in listwrap(query.select):
        if s.value == "." and s.aggregate == "count":
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate] + "(" + jx_expression(s.value).to_sql() + ") AS " + quote_table(s.name))
    for w in query.window:
        selects.append(self._window_op(self, query, w))

    agg_prefix = " FROM "
    agg_suffix = "\n"
    agg = ""
    ons = []
    groupby = ""
    groupby_prefix = "\nGROUP BY "

    for i, e in enumerate(query.edges):
        edge_alias = "e" + unicode(i)
        edge_value = e.value.to_sql()
        # qualify each referenced variable with the "a" table alias
        value = edge_value
        for v in e.value.vars():
            value = value.replace(quote_table(v), "a." + quote_table(v))
        edge_name = quote_table(e.name)
        selects.append(edge_alias + "." + edge_name + " AS " + edge_name)
        agg += (
            agg_prefix + "(" +
            "SELECT DISTINCT " + edge_value + " AS " + edge_name + " FROM " + quote_table(self.name) +
            ") " + edge_alias +
            agg_suffix
        )
        agg_prefix = " LEFT JOIN "
        agg_suffix = " ON 1=1\n"
        ons.append(edge_alias + "." + edge_name + " = " + value)
        groupby += groupby_prefix + edge_alias + "." + edge_name
        groupby_prefix = ",\n"

    agg += agg_prefix + quote_table(self.name) + " a ON " + " AND ".join(ons)
    where = "\nWHERE " + query.where.to_sql()
    return "SELECT " + (",\n".join(selects)) + agg + where + groupby
def _edges_op(self, query):
    # COMPOSE SQL FOR A QUERY WITH edges: EACH EDGE'S DISTINCT VALUES GO IN A
    # DERIVED TABLE, ALL EDGES ARE CROSS-JOINED, THEN LEFT JOINed TO THE DATA
    selects = []
    for s in listwrap(query.select):
        if s.value=="." and s.aggregate=="count":
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate]+"("+jx_expression(s.value).to_sql() + ") AS " + quote_table(s.name))
    for w in query.window:
        selects.append(self._window_op(self, query, w))

    # FIRST EDGE USES " FROM "; SUBSEQUENT EDGES SWITCH TO " LEFT JOIN ... ON 1=1"
    agg_prefix = " FROM "
    agg_suffix = "\n"
    agg = ""
    ons = []
    groupby = ""
    groupby_prefix = "\nGROUP BY "

    for i, e in enumerate(query.edges):
        edge_alias = "e" + unicode(i)
        edge_value = e.value.to_sql()
        value = edge_value
        for v in e.value.vars():
            # QUALIFY EDGE VARIABLES WITH THE DATA-TABLE ALIAS "a"
            value = value.replace(quote_table(v), "a."+quote_table(v))

        edge_name = quote_table(e.name)
        selects.append(edge_alias + "." + edge_name + " AS " + edge_name)
        agg += \
            agg_prefix + "(" + \
            "SELECT DISTINCT " + edge_value + " AS " + edge_name + " FROM " + quote_table(self.name) + \
            ") " + edge_alias + \
            agg_suffix
        agg_prefix = " LEFT JOIN "
        agg_suffix = " ON 1=1\n"
        ons.append(edge_alias + "." + edge_name + " = "+ value)
        groupby += groupby_prefix + edge_alias + "." + edge_name
        groupby_prefix = ",\n"

    # FINALLY JOIN THE DATA TABLE ITSELF, MATCHING ROWS TO EDGE VALUES
    agg += agg_prefix + quote_table(self.name) + " a ON "+" AND ".join(ons)

    where = "\nWHERE " + query.where.to_sql()

    return "SELECT " + (",\n".join(selects)) + agg + where+groupby
def es_deepop(es, mvel, query):
    """
    Execute a deep (nested) query through an MVEL script facet, then shape
    the facet rows into a Cube keyed by query.select.name.
    """
    from_es = es09.util.build_es_query(query)

    # the facet script computes the edge values; the facet query has no edges
    facet_query = query.copy()
    facet_query.select = query.edges
    facet_query.edges = FlatList()
    from_es.facets.mvel = {
        "terms": {
            "script_field": mvel.code(facet_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    response = es09.util.post(es, from_es, query.limit)
    facet_rows = unpack_terms(response.facets.mvel, query.edges)
    columns = zip(*facet_rows)  # column-major: columns[i] holds all values for edge i

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    for edge_num, edge in enumerate(query.edges):
        for key in columns[edge_num]:
            edge.domain.getPartByKey(key)  # ensure every observed value has a partition

        edge.index = edge_num
        for part_num, part in enumerate(edge.domain.partitions):
            part.dataIndex = part_num
        edge.domain.NULL.dataIndex = len(edge.domain.partitions)

    # MAKE CUBE
    acc = Matrix(*[len(edge.domain.partitions) for edge in query.edges])

    # FILL CUBE (last column of each facet row is the aggregate value)
    for row in facet_rows:
        coord = [edge.domain.getPartByKey(row[i]).dataIndex for i, edge in enumerate(query.edges)]
        acc[coord] = SUM(acc[coord], row[-1])

    cube = Cube(query.select, query.edges, {query.select.name: acc})
    cube.frum = query
    return cube
def _groupby_op(self, query):
    """
    Compose SQL for a groupby query: aggregates over the data table,
    grouped by each edge expression.
    """
    select_clauses = []
    for s in listwrap(query.select):
        if s.value == "." and s.aggregate == "count":
            select_clauses.append("COUNT(1) AS " + quote_table(s.name))
        else:
            select_clauses.append(sql_aggs[s.aggregate] + "(" + jx_expression(s.value).to_sql() + ") AS " + quote_table(s.name))
    for w in query.window:
        select_clauses.append(self._window_op(self, query, w))

    from_clause = " FROM " + quote_table(self.name) + " a\n"

    # NOTE(review): sibling _edges_op renders e.value.to_sql(); here the edge
    # itself is rendered — confirm groupby edges arrive as Expression objects
    group_terms = [e.to_sql() for e in query.edges]
    if group_terms:
        groupby_clause = " GROUP BY " + ",\n".join(group_terms)
    else:
        groupby_clause = ""

    where_clause = "\nWHERE " + query.where.to_sql()

    return "SELECT " + ",\n".join(select_clauses) + from_clause + where_clause + groupby_clause
def _normalize_select(select, frum, schema=None):
    """
    NORMALIZE ONE SELECT COLUMN TO CANONICAL FORM

    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        canonical = select = Data(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    canonical.default = coalesce(select.default, canonical_aggregates[canonical.aggregate].default)

    # SOME TABLES KNOW HOW TO NORMALIZE THEIR OWN SELECTS
    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if not select.value or select.value == ".":
        # SELECT EVERYTHING: ONE SELECT PER LEAF COLUMN
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name)
                },
                canonical
            )
            for c in frum.get_leaves()
        ])
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            base_name = select.value[:-2]
            canonical.name = coalesce(select.name, base_name, select.aggregate)
            # BUG FIX: WAS jx_expression(select[:-2]), WHICH SLICED THE select
            # OBJECT ITSELF INSTEAD OF ITS VALUE STRING
            value = jx_expression(select.value[:-2])
            if not isinstance(value, Variable):
                Log.error("`*` over general expression not supported yet")
                # BUG FIX: WAS output.append([...]) WITH ONE IDENTICAL ENTRY PER
                # COLUMN — A NESTED LIST THAT BREAKS THE output.name CHECK BELOW;
                # A SINGLE "leaves" SELECT ENTRY IS SUFFICIENT
                output.append(set_default(
                    {
                        "name": base_name,
                        "value": LeavesOp("leaves", value),
                        "format": "dict"  # MARKUP FOR DECODING
                    },
                    canonical
                ))
            else:
                # EXPAND TO ONE SELECT PER MATCHING LEAF
                output.extend([
                    set_default(
                        {
                            "name": base_name + "." + literal_field(c.name[len(base_name) + 1:]),
                            "value": jx_expression(c.name)
                        },
                        canonical
                    )
                    for c in frum.get_leaves()
                    if c.name.startswith(base_name + ".")
                ])
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
def DataClass(name, columns, constraint=True):
    """
    Use the DataClass to define a class, but with some extra features:
    1. restrict the datatype of property
    2. restrict if `required`, or if `nulls` are allowed
    3. generic constraints on object properties

    It is expected that this class become a real class (or be removed) in the
    long term because it is expensive to use and should only be good for
    verifying program correctness, not user input.

    :param name: Name of the class we are creating
    :param columns: Each columns[i] has properties {
            "name",     - (required) name of the property
            "required", - False if it must be defined (even if None)
            "nulls",    - True if property can be None, or missing
            "default",  - A default value, if none is provided
            "type"      - a Python datatype
        }
    :param constraint: a JSON query Expression for extra constraints
    :return: The class that has been created
    """
    # NORMALIZE SHORTHAND: A BARE STRING IS A REQUIRED, NON-NULL PROPERTY
    columns = wrap([{"name": c, "required": True, "nulls": False, "type": object} if isinstance(c, basestring) else c for c in columns])
    slots = columns.name
    required = wrap(filter(lambda c: c.required and not c.nulls and not c.default, columns)).name
    nulls = wrap(filter(lambda c: c.nulls, columns)).name
    defaults = {c.name: coalesce(c.default, None) for c in columns}
    types = {c.name: coalesce(c.type, object) for c in columns}

    # GENERATE THE CLASS SOURCE, THEN exec IT (see _exec below)
    # NOTE: THE {"+"{...}} CONCATENATIONS IN THE TEMPLATE KEEP expand_template
    # FROM TREATING Log.error TEMPLATES IN THE GENERATED CODE AS PARAMETERS
    code = expand_template(
"""
from __future__ import unicode_literals
from collections import Mapping

meta = None
types_ = {{types}}
defaults_ = {{defaults}}

class {{class_name}}(Mapping):
    __slots__ = {{slots}}

    def _constraint(row, rownum, rows):
        return {{constraint_expr}}

    def __init__(self, **kwargs):
        if not kwargs:
            return

        for s in {{slots}}:
            object.__setattr__(self, s, kwargs.get(s, {{defaults}}.get(s, None)))

        missed = {{required}}-set(kwargs.keys())
        if missed:
            Log.error("Expecting properties {"+"{missed}}", missed=missed)

        illegal = set(kwargs.keys())-set({{slots}})
        if illegal:
            Log.error("{"+"{names}} are not a valid properties", names=illegal)

        if not self._constraint(0, [self]):
            Log.error("constraint not satisfied {"+"{expect}}\\n{"+"{value|indent}}", expect={{constraint}}, value=self)

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)
        return self

    def __setattr__(self, item, value):
        if item not in {{slots}}:
            Log.error("{"+"{item|quote}} not valid attribute", item=item)
        object.__setattr__(self, item, value)
        if not self._constraint(0, [self]):
            Log.error("constraint not satisfied {"+"{expect}}\\n{"+"{value|indent}}", expect={{constraint}}, value=self)

    def __getattr__(self, item):
        Log.error("{"+"{item|quote}} not valid attribute", item=item)

    def __hash__(self):
        return object.__hash__(self)

    def __eq__(self, other):
        if isinstance(other, {{class_name}}) and dict(self)==dict(other) and self is not other:
            Log.error("expecting to be same object")
        return self is other

    def __dict__(self):
        return {k: getattr(self, k) for k in {{slots}}}

    def items(self):
        return ((k, getattr(self, k)) for k in {{slots}})

    def __copy__(self):
        _set = object.__setattr__
        output = object.__new__({{class_name}})
        {{assign}}
        return output

    def __iter__(self):
        return {{slots}}.__iter__()

    def __len__(self):
        return {{len_slots}}

    def __str__(self):
        return str({{dict}})

temp = {{class_name}}
""",
        {
            "class_name": name,
            "slots": "(" + (", ".join(convert.value2quote(s) for s in slots)) + ")",
            "required": "{" + (", ".join(convert.value2quote(s) for s in required)) + "}",
            "nulls": "{" + (", ".join(convert.value2quote(s) for s in nulls)) + "}",
            "defaults": jx_expression({"literal": defaults}).to_python(),
            "len_slots": len(slots),
            "dict": "{" + (", ".join(convert.value2quote(s) + ": self." + s for s in slots)) + "}",
            "assign": "; ".join("_set(output, "+convert.value2quote(s)+", self."+s+")" for s in slots),
            "types": "{" + (",".join(convert.string2quote(k) + ": " + v.__name__ for k, v in types.items())) + "}",
            "constraint_expr": jx_expression(constraint).to_python(),
            "constraint": convert.value2json(constraint)
        }
    )

    return _exec(code, name)
def _normalize_edge(edge, dim_index, schema=None):
    """
    :param edge: Not normalized edge
    :param dim_index: Dimensions are ordered; this is this edge's index into that order
    :param schema: for context
    :return: a normalized edge (a list of one Data)
    """
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")
    elif isinstance(edge, basestring):
        if schema:
            # TRY TO RESOLVE THE NAME AGAINST THE SCHEMA (COLUMN OR DIMENSION)
            try:
                e = schema[edge]
            except Exception:
                e = None
            e = unwraplist(e)
            if e and not isinstance(e, (_Column, set, list)):
                if isinstance(e, _Column):
                    # KNOWN COLUMN: USE ITS DOMAIN
                    return [Data(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        dim=dim_index,
                        domain=_normalize_domain(domain=e, schema=schema)
                    )]
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    # SINGLE-FIELD DIMENSION: TREAT AS A SIMPLE VARIABLE
                    return [Data(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        dim=dim_index,
                        domain=e.getDomain()
                    )]
                else:
                    # MULTI-FIELD DIMENSION: NO SINGLE VALUE EXPRESSION
                    return [Data(
                        name=e.name,
                        allowNulls=True,
                        dim=dim_index,
                        domain=e.getDomain()
                    )]
        # NAME NOT IN SCHEMA (OR NO SCHEMA): DEFAULT DOMAIN
        return [Data(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            dim=dim_index,
            domain=_normalize_domain(schema=schema)
        )]
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Data(fields=edge.value)

            return [Data(
                name=edge.name,
                value=jx_expression(edge.value),
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                dim=dim_index,
                domain=domain
            )]

        domain = _normalize_domain(edge.domain, schema=schema)
        return [Data(
            name=coalesce(edge.name, edge.value),
            value=jx_expression(edge.value),
            range=_normalize_range(edge.range),
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            dim=dim_index,
            domain=domain
        )]
allowNulls=True, domain=_normalize_domain(schema=schema) ) else: edge = wrap(edge) if not edge.name and not isinstance(edge.value, basestring): Log.error("You must name compound and complex edges: {{edge}}", edge=edge) if isinstance(edge.value, (list, set)) and not edge.domain: # COMPLEX EDGE IS SHORT HAND domain = _normalize_domain(schema=schema) domain.dimension = Dict(fields=edge.value) return Dict( name=edge.name, value=jx_expression(edge.value), allowNulls=bool(coalesce(edge.allowNulls, True)), domain=domain ) domain = _normalize_domain(edge.domain, schema=schema) return Dict( name=coalesce(edge.name, edge.value), value=jx_expression(edge.value), range=_normalize_range(edge.range), allowNulls=bool(coalesce(edge.allowNulls, True)), domain=domain )
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) es_column_map = {c.name: c.es_column for c in frum._columns} es_query = Dict() new_select = Dict() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: # TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: # TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error( 'Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate ) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_" + literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " 
percentile") if isinstance(s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0", } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." 
+ aggregates1_4[s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = jx_expression(s.value).map(es_column_map) if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0", } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." + aggregates1_4[s.aggregate] es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum, map_=es_column_map) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: # TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter()) es_query = Dict(aggs={"_filter": set_default({"filter": filter_}, es_query)}) es_query = wrap({"aggs": {"_nested": set_default({"nested": {"path": frum.query_path}}, es_query)}}) else: if any(split_where[1::]): Log.error("Where clause is too deep") for d in decoders[0]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: # TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Dict(aggs={"_filter": set_default({"filter": filter}, es_query)}) # </TERRIBLE SECTION> if 
not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce( result.aggregations.doc_count, result.hits.total ) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
def test_in(self):
    # AN "in" CLAUSE SHOULD SIMPLIFY TO AN ES terms FILTER
    filter_ = {"in": {"a": [1, 2]}}
    expected = {"terms": {"a": [1, 2]}}
    simplified = simplify_esfilter(jx_expression(filter_).to_esfilter())
    self.assertEqual(simplified, expected)
def _normalize_where(where, schema=None):
    """Convert a where clause to an Expression; a missing clause matches everything."""
    return TrueOp() if where == None else jx_expression(where)
def test_in_map(self):
    # MAPPING VARIABLES SHOULD RENAME THE FIELD INSIDE THE "in" CLAUSE
    source = {"in": {"a": [1, 2]}}
    mapped = jx_expression(source).map({"a": "c"}).__data__()
    self.assertEqual(mapped, {"in": {"c": [1, 2]}})
def _normalize_edge(edge, dim_index, schema=None):
    """
    Convert one edge clause to canonical form.

    :param edge: Not normalized edge
    :param dim_index: Dimensions are ordered; this is this edge's index into that order
    :param schema: for context
    :return: a normalized edge (a list of one Data)
    """
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")

    if isinstance(edge, basestring):
        if schema:
            # try to resolve the name against the schema (column or dimension)
            try:
                resolved = schema[edge]
            except Exception:
                resolved = None
            resolved = unwraplist(resolved)
            if resolved and not isinstance(resolved, (_Column, set, list)):
                if isinstance(resolved, _Column):
                    # known column: use its domain
                    return [Data(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        dim=dim_index,
                        domain=_normalize_domain(domain=resolved, schema=schema)
                    )]
                if isinstance(resolved.fields, list) and len(resolved.fields) == 1:
                    # single-field dimension: treat as a simple variable
                    return [Data(
                        name=resolved.name,
                        value=jx_expression(resolved.fields[0]),
                        allowNulls=True,
                        dim=dim_index,
                        domain=resolved.getDomain()
                    )]
                # multi-field dimension: no single value expression
                return [Data(
                    name=resolved.name,
                    allowNulls=True,
                    dim=dim_index,
                    domain=resolved.getDomain()
                )]
        # name not in schema (or no schema): default domain
        return [Data(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            dim=dim_index,
            domain=_normalize_domain(schema=schema)
        )]

    edge = wrap(edge)
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

    if isinstance(edge.value, (list, set)) and not edge.domain:
        # COMPLEX EDGE IS SHORT HAND
        domain = _normalize_domain(schema=schema)
        domain.dimension = Data(fields=edge.value)
        return [Data(
            name=edge.name,
            value=jx_expression(edge.value),
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            dim=dim_index,
            domain=domain
        )]

    return [Data(
        name=coalesce(edge.name, edge.value),
        value=jx_expression(edge.value),
        range=_normalize_range(edge.range),
        allowNulls=bool(coalesce(edge.allowNulls, True)),
        dim=dim_index,
        domain=_normalize_domain(edge.domain, schema=schema)
    )]
def __init__(self, **desc):
    """
    A "set" DOMAIN: AN EXPLICIT LIST OF PARTITIONS.  INSPECTS desc TO DECIDE
    HOW PARTITIONS ARE KEYED, MAPPED, AND INDEXED.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)
    self.type = "set"
    self.order = {}      # MAP FROM KEY TO dataIndex
    self.NULL = Null
    self.partitions = FlatList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    if isinstance(self.key, set):
        Log.error("problem")

    if not desc.key and (len(desc.partitions) == 0 or isinstance(desc.partitions[0], (basestring, Number, tuple))):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOUND KEY, TAKEN FROM THE DIMENSION FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        # NOTE(review): "and" binds tighter than "or", so
        # all(desc.partitions.esfilter) ALONE TRIGGERS THIS BRANCH EVEN WITH
        # NO PARTITIONS — CONFIRM INTENDED
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            from pyLibrary.queries.expressions import jx_expression

            # PARTITIONS DEFINED BY FILTER EXPRESSIONS, KEYED BY NAME
            self.key = "name"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.partitions.append({
                    "where": jx_expression(coalesce(p.where, p.esfilter)),
                    "name": p.name,
                    "dataIndex": i
                })
                self.map[p.name] = p
                self.order[p.name] = i
            return
        elif desc.partitions and len(set(desc.partitions.value) - {None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        # EXPLICIT SINGLE KEY
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    else:
        Log.error("Can not hanldle")  # (sic)

    self.label = coalesce(self.label, "name")

    if hasattr(desc.partitions, "__iter__"):
        self.partitions = wrap(list(desc.partitions))
    else:
        Log.error("expecting a list of partitions")
def update(self, command):
    """
    UPDATE RECORDS IN THIS SQL-BACKED TABLE, INCLUDING NESTED RECORDS STORED
    IN CHILD TABLES (WHICH ARE DELETED AND RE-INSERTED).

    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    # NOTE(review): keys() | set(...) ASSUMES command.set.keys() RETURNS A
    # set (AS THE Data WRAPPER DOES); PLAIN py2 dict.keys() WOULD TypeError
    touched_columns = command.set.keys() | set(listwrap(command["clear"]))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # MAP QUERY VARIABLES TO THEIR es_column NAMES (SKIP STRUCTURED TYPES)
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.type not in ["nested", "object"]
    }
    where_sql = where.map(_map).to_sql()
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            name=new_column_name,
            type=ctype,
            table=self.name,
            es_index=self.name,
            es_column=typed_column(new_column_name, ctype)
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = join_field(split_field(self.name) + split_field(nested_column_name))
            nested_table = self.nested_tables[nested_table_name]
            # PARENT PRIMARY KEY COLUMNS, COMMA-SEPARATED
            self_primary_key = ",".join(quote_table(c.es_column) for u in self.uid for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + unicode(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE EXISTING CHILD ROWS FOR THE MATCHED PARENTS
            sql_command = "DELETE FROM " + quote_table(nested_table.name) + \
                          "\nWHERE EXISTS (" + \
                          "\nSELECT 1 " + \
                          "\nFROM " + quote_table(nested_table.name) + " n" + \
                          "\nJOIN (" + \
                          "\nSELECT " + self_primary_key + \
                          "\nFROM " + quote_table(self.name) + \
                          "\nWHERE " + where_sql + \
                          "\n) t ON " + \
                          " AND ".join(
                              "t." + quote_table(c.es_column) + " = n." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + \
                          ")"
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            # FLATTEN THE NEW NESTED DOCS INTO ROWS/COLUMNS
            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Dict(), doc_collection, path=nested_column_name)

            prefix = "INSERT INTO " + quote_table(nested_table.name) + \
                     "(" + \
                     self_primary_key + "," + \
                     _quote_column(extra_key) + "," + \
                     ",".join(
                         quote_table(c.es_column)
                         for c in doc_collection.get(".", Null).active_columns
                     ) + ")"

            # BUILD THE PARENT TABLES
            parent = "\nSELECT " + \
                     self_primary_key + \
                     "\nFROM " + quote_table(self.name) + \
                     "\nWHERE " + jx_expression(command.where).to_sql()

            # BUILD THE RECORDS
            children = " UNION ALL ".join(
                "\nSELECT " +
                quote_value(i) + " " + quote_table(extra_key.es_column) + "," +
                ",".join(
                    quote_value(row[c.name]) + " " + quote_table(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                )
                for i, row in enumerate(doc_collection.get(".", Null).rows)
            )

            # CROSS-JOIN PARENTS WITH THE NEW CHILD ROWS
            sql_command = prefix + \
                          "\nSELECT " + \
                          ",".join(
                              "p." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + "," + \
                          "c." + _quote_column(extra_key) + "," + \
                          ",".join(
                              "c." + quote_table(c.es_column)
                              for c in doc_collection.get(".", Null).active_columns
                          ) + \
                          "\nFROM (" + parent + ") p " + \
                          "\nJOIN (" + children + \
                          "\n) c on 1=1"

            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        type=c.type,
                        table=self.name,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + listwrap(c.nested_path)
                    )
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.type not in [c.type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    # FINALLY, THE SHALLOW (NON-NESTED) SET/CLEAR
    command = "UPDATE " + quote_table(self.name) + " SET " + \
              ",\n".join(
                  [
                      _quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
                      for k, v in command.set.items()
                      if get_type(v) != "nested"
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ] +
                  [
                      _quote_column(c) + "=NULL"
                      for k in listwrap(command["clear"])
                      if k in self.columns
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ]
              ) + \
              " WHERE " + where_sql
    self.db.execute(command)
def __init__(self, **desc):
    """
    A "set" DOMAIN: AN EXPLICIT LIST OF PARTITIONS.  INSPECTS desc TO DECIDE
    HOW PARTITIONS ARE KEYED, MAPPED, AND INDEXED.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)
    self.type = "set"
    self.order = {}      # MAP FROM KEY TO dataIndex
    self.NULL = Null
    self.partitions = DictList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    if isinstance(self.key, set):
        Log.error("problem")

    if not desc.key and (len(desc.partitions)==0 or isinstance(desc.partitions[0], (basestring, Number, tuple))):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOUND KEY, TAKEN FROM THE DIMENSION FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        # NOTE(review): "and" binds tighter than "or", so
        # all(desc.partitions.esfilter) ALONE TRIGGERS THIS BRANCH EVEN WITH
        # NO PARTITIONS — CONFIRM INTENDED
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            from pyLibrary.queries.expressions import jx_expression

            # PARTITIONS DEFINED BY FILTER EXPRESSIONS, KEYED BY NAME
            self.key = "name"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.partitions.append({
                    "where": jx_expression(coalesce(p.where, p.esfilter)),
                    "name": p.name,
                    "dataIndex": i
                })
                self.map[p.name] = p
                self.order[p.name] = i
            return
        elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        # EXPLICIT SINGLE KEY
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    else:
        Log.error("Can not hanldle")  # (sic)

    self.label = coalesce(self.label, "name")

    if hasattr(desc.partitions, "__iter__"):
        self.partitions = wrap(list(desc.partitions))
    else:
        Log.error("expecting a list of partitions")
def es_setop(es, mvel, query):
    # RETURN A SET OF DOCUMENTS (NO AGGREGATION) FROM ES AS A Cube
    # es - ES CONNECTION, mvel - MVEL SCRIPT COMPILER, query - JSON QUERY EXPRESSION
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        # NOTE(review): PARSES AS (len(select) == 1 and not select[0].value) or select[0].value == "*"
        # — CONFIRM THAT GROUPING IS INTENDED
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            # SINGLE COUNT: ONE HIT'S _source IS ENOUGH
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            # ALL SELECTS ARE PLAIN VARIABLES: ASK ES FOR THE FIELDS DIRECTLY
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(query.where.to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        # COMPLEX SELECT ON A SHALLOW SOURCE: COMPUTE EACH ROW WITH AN MVEL SCRIPT
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        # DEEP (NESTED) SOURCE: SCRIPT OVER THE FULL QUERY
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = zip(*data_list)  # TRANSPOSE ROWS TO COLUMNS
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Dict(meta={"esquery": FromES}, data=cube)
def es_setop(es, mvel, query):
    # FETCH A SET OF DOCUMENTS (NO AGGREGATION) FROM ES, RETURN AS Data(meta, data)
    # es - ES CONNECTION, mvel - MVEL SCRIPT COMPILER, query - JSON QUERY EXPRESSION
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        # NOTE(review): CONDITION PARSES AS
        # (len(select) == 1 and not select[0].value) or select[0].value == "*" — VERIFY INTENT
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            # SINGLE COUNT: REQUEST ONLY ONE HIT
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            # SIMPLE VARIABLES ONLY: LET ES RETURN THE FIELDS
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(query.where.to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        # SHALLOW BUT COMPLEX: EVALUATE AN MVEL SCRIPT PER DOCUMENT
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        # NESTED (DEEP) SOURCE
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = zip(*data_list)  # TRANSPOSE FACET ROWS INTO ONE COLUMN PER SELECT
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Data(
        meta={"esquery": FromES},
        data=cube
    )
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        canonical = select = Dict(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    canonical.default = coalesce(select.default, canonical_aggregates[canonical.aggregate].default)

    # SOME TABLES KNOW HOW TO NORMALIZE THEIR OWN SELECTS
    if hasattr(frum, "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if not select.value or select.value == ".":
        # SELECT EVERYTHING: ONE COLUMN PER LEAF
        output.extend([
            set_default({"name": c.name, "value": jx_expression(c.name)}, canonical)
            for c in frum.get_leaves()
        ])
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            base_name = select.value[:-2]
            canonical.name = coalesce(select.name, base_name, select.aggregate)
            # BUG FIX: WAS jx_expression(select[:-2]), WHICH SLICED THE select
            # Dict ITSELF INSTEAD OF THE VALUE STRING
            value = jx_expression(select.value[:-2])
            if not isinstance(value, Variable):
                Log.error("`*` over general expression not supported yet")
                # NOTE(review): Log.error APPEARS TO RAISE ELSEWHERE IN THIS
                # FILE, WHICH WOULD MAKE THIS append UNREACHABLE — CONFIRM
                output.append([
                    set_default(
                        {
                            "name": base_name,
                            "value": LeavesOp("leaves", value),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical
                    )
                    for c in frum.get_columns()
                    if c.type not in ["object", "nested"]
                ])
            else:
                # EXPAND THE PREFIX TO ONE COLUMN PER MATCHING LEAF
                output.extend([
                    set_default(
                        {
                            "name": base_name + "." + literal_field(c.name[len(base_name) + 1:]),
                            "value": jx_expression(c.name)
                        },
                        canonical
                    )
                    for c in frum.get_leaves()
                    if c.name.startswith(base_name + ".")
                ])
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output