def _select(template, data, fields, depth):
    """
    SELECT fields FROM EACH ROW IN data, USING template AS THE STARTING RECORD.

    :param template: BASE RECORD, COPIED FOR EACH ROW OF data
    :param data: ITERABLE OF plain dicts (Data IS REJECTED BELOW)
    :param fields: FIELD DESCRIPTORS PASSED TO _select_deep
    :param depth: CURRENT NESTING DEPTH INTO THE PATHS
    :return: FlatList OF SELECTED RECORDS
    """
    output = FlatList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])  # FIELDS THAT REQUIRE A DEEPER (RECURSIVE) SELECT
    for d in data:
        if d.__class__ is Data:
            Log.error("programmer error, _select can not handle Data, only dict")
        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            # KEEP THE FIRST NON-None children FOUND ACROSS ALL fields
            children = c if children is None else children
            if index:
                # index MARKS HOW FAR INTO f.value WE DESCENDED; path IS THE SHARED PREFIX
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                # ONLY ONE NESTED BRANCH CAN BE FOLLOWED PER SELECT
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            # RECURSE INTO THE NESTED ROWS, ONE LEVEL DEEPER
            output.extend(_select(record, children, deep_fields, depth + 1))
    return output
class _Stats(WindowFunction):
    """
    TRACK STATS, BUT IGNORE OUTLIERS
    KEEPS ALL SAMPLES IN THE WINDOW; OUTLIER TRIMMING HAPPENS ONLY AT end()
    """

    def __init__(self, middle=None, *args, **kwargs):
        # NOTE(review): object.__init__ IS CALLED, NOT WindowFunction.__init__ —
        # presumably WindowFunction has no state to initialize; confirm
        object.__init__(self)
        self.middle = middle  # FRACTION OF SAMPLES (CENTERED) TO KEEP, e.g. 0.90
        self.samples = FlatList()

    def add(self, value):
        # IGNORE NULLS; THEY ARE NOT SAMPLES
        if value == None:
            return
        self.samples.append(value)

    def sub(self, value):
        # REMOVE A SAMPLE LEAVING THE WINDOW
        if value == None:
            return
        self.samples.remove(value)

    def merge(self, agg):
        Log.error("Do not know how to handle")

    def end(self):
        """
        RETURN Stats OVER THE MIDDLE self.middle FRACTION OF SAMPLES
        """
        # NUMBER OF SAMPLES TO TRIM FROM EACH TAIL
        ignore = mo_math.ceiling(len(self.samples) * (1 - self.middle) / 2)
        if ignore * 2 >= len(self.samples):
            # NOT ENOUGH SAMPLES TO TRIM; RETURN EMPTY STATS
            return stats.Stats()
        output = stats.Stats(samples=sorted(self.samples)[ignore:len(self.samples) - ignore:])
        # NOTE(review): the trimmed-stats object is then given the FULL sample list —
        # looks intentional (stats from middle, samples kept whole) but confirm
        output.samples = list(self.samples)
        return output
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES

    :param data: ITERABLE OF ROWS (Cube AND FlatList ARE NOT SUPPORTED)
    :param field_name: A FIELD NAME, {"value": name} MAP, OR LIST OF FIELDS
    :return: LIST (OR FlatList) OF TUPLES, ONE PER ROW
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            # SHALLOW FIELD: ONE-ELEMENT TUPLE PER ROW
            return [(d[field_name],) for d in data]
        else:
            # DEEP FIELD: DELEGATE TO THE RECURSIVE HELPER
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        # SINGLE STRUCTURED FIELD DESCRIPTOR
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
class DefaultDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY
    PARTITIONS ARE CREATED LAZILY AS NEW KEYS ARE SEEN
    """

    __slots__ = ["NULL", "partitions", "map", "limit", "sort"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.NULL = Null
        self.partitions = FlatList()  # PARTS, IN ORDER OF FIRST APPEARANCE
        self.map = dict()             # key -> CANONICAL PART
        self.map[None] = self.NULL
        self.limit = desc.get('limit')
        self.sort = 1

    def compare(self, a, b):
        # ORDER PARTS BY THEIR value
        return value_compare(a.value, b.value)

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getPartByKey(self, key):
        """
        RETURN THE CANONICAL PART FOR key, CREATING IT IF NEVER SEEN
        """
        canonical = self.map.get(key)
        if canonical:
            return canonical
        canonical = Data(name=key, value=key)
        self.partitions.append(canonical)
        self.map[key] = canonical
        return canonical

    def getIndexByKey(self, key):
        """
        RETURN dataIndex FOR key, CREATING A NEW PART (AND INDEX) IF NEVER SEEN
        """
        canonical = self.map.get(key)
        if canonical:
            return canonical.dataIndex
        index = len(self.partitions)
        canonical = Data(name=key, value=key, dataIndex=index)
        self.partitions.append(canonical)
        self.map[key] = canonical
        return index

    def getKey(self, part):
        return part.value

    def getEnd(self, part):
        return part.value

    def getLabel(self, part):
        return part.value

    def __data__(self):
        # SERIALIZE, INCLUDING THE PARTS DISCOVERED SO FAR
        output = Domain.__data__(self)
        output.partitions = self.partitions
        output.limit = self.limit
        return output
def select(self, fields):
    """
    SELECT fields FROM THIS (path, row)-PAIR STORAGE.

    :param fields: FIELD NAME, {"value": name} MAP, OR LIST OF {"name","value"} SELECTS
    :return: LIST OR FlatList OF SELECTED VALUES/RECORDS
    """
    if isinstance(fields, Mapping):
        # SIMPLIFY {"value": value} AS STRING
        fields = fields.value

    if isinstance(fields, text_type):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            # NOTE(review): self.data appears to hold (parent_record, nested_value)
            # pairs keyed by self.path — confirm against the class definition
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(
                MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]),
                len(self.path)
            )  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = FlatList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = FlatList()

        # PRE-COMPUTE (name, accessor) PAIRS; ACCESSOR IS EITHER THE GIVEN CALLABLE
        # OR A LOOKUP OF THE NAMED FIELD
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Data()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)
        return output

    Log.error("multiselect over FlatList not supported")
def more():
    """
    PULL UP TO size ITEMS FROM iterator (CLOSED OVER); APPEND True TO done
    WHEN THE ITERATOR IS EXHAUSTED.
    """
    output = FlatList()
    for _ in range(size):
        try:
            # USE THE next() BUILTIN: iterator.next() IS PYTHON-2-ONLY,
            # next(iterator) WORKS ON BOTH PY2 AND PY3
            output.append(next(iterator))
        except StopIteration:
            done.append(True)
            break
    return output
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum.schema
    output = FlatList()  # output[depth] == LIST OF AggsDecoder AT THAT NESTED DEPTH

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        # COLLECT vars_ USED BY THIS EDGE, VALIDATING THEM AGAINST THE SCHEMA
        if edge.value != None and not isinstance(edge.value, NullOp):
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema.leaves(v.var):
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.range:
            vars_ = edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            # MAP DIMENSION FIELDS TO THEIR ES COLUMNS
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            # EVERY PARTITION HAS ITS OWN where CLAUSE; UNION THEIR VARIABLES
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()

        try:
            vars_ |= edge.value.vars()
            # DEPTH == LENGTH OF nested_path MINUS THE TOP LEVEL
            depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v.var))
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr|quote}} spans tables, can not handle", expr=edge.value)
            max_depth = MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception as e:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        output[max_depth].append(AggsDecoder(edge, query, limit))
    return output
def __init__(self, **desc):
    """
    INITIALIZE A DOMAIN WITH LAZILY-DISCOVERED PARTITIONS.

    :param desc: DOMAIN DESCRIPTION; ONLY 'limit' IS READ DIRECTLY HERE
    """
    Domain.__init__(self, **desc)
    self.NULL = Null
    self.partitions = FlatList()  # PARTS, IN ORDER OF FIRST APPEARANCE
    self.map = dict()             # key -> CANONICAL PART
    self.map[None] = self.NULL
    self.limit = desc.get('limit')
    self.sort = 1
def not_right(self, num):
    """
    WITH SLICES BEING FLAT, THIS IS HOW WE DROP THE RIGHT-MOST num
    ELEMENTS (A SIMPLE SLICE FROM THE LEFT, [:-num])
    """
    self._convert()
    if num == None:
        # DEFAULT: DROP ONLY THE LAST ELEMENT (RESULT IS WRAPPED IN A LIST)
        return FlatList([self.list[:-1]])
    elif num <= 0:
        return Null
    else:
        return FlatList(self.list[:-num])
def right(self, num=None):
    """
    WITH SLICES BEING FLAT, WE NEED A SIMPLE WAY TO SLICE FROM THE RIGHT
    RETURN THE RIGHT-MOST num ELEMENTS

    :param num: HOW MANY ELEMENTS TO KEEP (None -> JUST THE LAST ONE, WRAPPED)
    :return: FlatList OF THE RIGHT-MOST ELEMENTS, OR Null WHEN num <= 0
    """
    self._convert()
    if num == None:
        return FlatList([self.list[-1]])
    if num <= 0:
        return Null
    # BUG FIX: WAS self.list[-num] (A SINGLE ELEMENT, NOT A SLICE);
    # MUST MIRROR not_right's SLICE FORM AND RETURN THE LAST num ELEMENTS
    return FlatList(self.list[-num:])
def __init__(self, **desc):
    """
    BUILD A "set" DOMAIN FROM desc.partitions; THE SHAPE OF THE PARTITIONS
    DETERMINES HOW key AND THE LOOKUP map ARE CONSTRUCTED.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}   # PART KEY -> ORDINAL POSITION
    self.NULL = Null
    self.partitions = FlatList()

    if isinstance(self.key, set):
        Log.error("problem")

    if isinstance(desc.partitions[0], (int, float, text_type)):
        # ASSMUE PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.order[None] = len(desc.partitions)  # NULL SORTS LAST
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOSITE KEY OVER DIMENSION FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and is_container(desc.key):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and is_data(desc.partitions[0][desc.key]):
        # PARTITION KEY VALUE IS ITSELF STRUCTURED
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        # SIMPLE SCALAR KEY: PLAIN dict LOOKUP PLUS ORDER
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not hanldle")

    self.label = coalesce(self.label, "name")
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if is_data(where):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not is_container(v):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A KNOWN DIMENSION; PASS THE terms FILTER THROUGH
                    output.append({"terms": {k: v}})
                else:
                    if is_text(edge):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if is_data(fields):
                        # COMPOSITE KEY: EACH VALUE IS A RECORD OVER local->es FIELDS
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif is_list(fields) and len(fields) == 1 and is_variable_name(fields[0]):
                        # SINGLE SIMPLE FIELD
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # FALL BACK TO THE PER-PARTITION esfilter
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    # NOT A STRUCTURED FILTER; RETURN AS-IS
    return where
def datawrap(v):
    """
    RETURN v WRAPPED FOR DOT-ACCESS: dicts BECOME Data (NO COPY), lists BECOME
    FlatList, GENERATORS ARE WRAPPED LAZILY, PRIMITIVES PASS THROUGH.
    """
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Data()
        _set(m, SLOT, v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    elif type_ is Data:
        return v
    elif type_ is DataObject:
        return v
    elif type_ is none_type:
        return None  # So we allow `is None`
    elif type_ is list:
        return FlatList(v)
    elif type_ in generator_types:
        # WRAP EACH YIELDED VALUE, LAZILY
        return (wrap(vv) for vv in v)
    elif isinstance(v, (text_type, binary_type, int, float, Decimal, datetime, date, Data, FlatList, NullType, none_type)):
        # PRIMITIVES AND ALREADY-WRAPPED TYPES PASS THROUGH
        return v
    elif isinstance(v, Mapping):
        return DataObject(v)
    elif hasattr(v, "__data__"):
        return v.__data__()
    else:
        return DataObject(v)
def groupby_size(data, size):
    """
    LAZY GENERATOR: YIELD (group_number, rows) PAIRS, EACH rows HOLDING AT
    MOST size ELEMENTS OF data, IN ORDER.

    :param data: AN ITERATOR OR ITERABLE
    :param size: MAXIMUM NUMBER OF ELEMENTS PER GROUP
    """
    # ACCEPT BOTH PY3 (__next__) AND PY2 (next) ITERATORS; THE ORIGINAL
    # ONLY CHECKED THE PY2-ONLY "next" ATTRIBUTE
    if hasattr(data, "__next__") or hasattr(data, "next"):
        iterator = data
    elif hasattr(data, "__iter__"):
        iterator = iter(data)
    else:
        Log.error("do not know how to handle this type")

    done = FlatList()  # NON-EMPTY WHEN THE ITERATOR IS EXHAUSTED

    def more():
        output = FlatList()
        for _ in range(size):
            try:
                # next() BUILTIN WORKS ON BOTH PY2 AND PY3 (iterator.next() IS PY2-ONLY)
                output.append(next(iterator))
            except StopIteration:
                done.append(True)
                break
        return output

    # THIS IS LAZY
    i = 0
    while True:
        output = more()
        yield (i, output)
        if len(done) > 0:
            break
        i += 1
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH
    {"field":field_name, "sort":direction}

    :param data: LIST/ITERABLE TO SORT (None RETURNS Null)
    :param fieldnames: SORT SPEC; int MEANS SORT ON THAT TUPLE POSITION
    :param already_normalized: SKIP _normalize_sort WHEN THE SPEC IS ALREADY FORMAL
    :return: FlatList OF UNWRAPPED, SORTED ROWS
    """
    try:
        if data == None:
            return Null

        if isinstance(fieldnames, int):
            # SORT TUPLES ON POSITION fieldnames, ASCENDING
            funcs = [(lambda t: t[fieldnames], 1)]
        else:
            if not fieldnames:
                # NO SPEC: NATURAL VALUE ORDER
                return wrap(sort_using_cmp(data, value_compare))
            if already_normalized:
                formal = fieldnames
            else:
                formal = query._normalize_sort(fieldnames)
            funcs = [(get(f.value), f.sort) for f in formal]

        def comparer(left, right):
            # COMPARE FIELD-BY-FIELD; FIRST NON-ZERO RESULT WINS
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if is_list(data):
            output = FlatList([unwrap(d) for d in sort_using_cmp(data, cmp=comparer)])
        elif is_text(data):
            Log.error("Do not know how to handle")
        elif hasattr(data, "__iter__"):
            output = FlatList([unwrap(d) for d in sort_using_cmp(list(data), cmp=comparer)])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # pausible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if not is_text(template):
        # sys.stderr.write(str("Log.error was expecting a unicode template"))
        Log.error("Log.error was expecting a unicode template")

    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        # CALLER PASSED THE CAUSE IN THE PARAMS SLOT; SHIFT IT OVER
        cause = default_params
        default_params = {}

    params = Data(dict(default_params, **more_params))

    # NOTE(review): add_to_trace is never set True, so the cause[0].trace.extend
    # below is dead code — presumably a disabled feature; confirm before removing
    add_to_trace = False
    if cause == None:
        causes = None
    elif is_list(cause):
        causes = []
        for c in listwrap(cause):  # CAN NOT USE LIST-COMPREHENSION IN PYTHON3 (EXTRA STACK DEPTH FROM THE IN-LINED GENERATOR)
            causes.append(Except.wrap(c, stack_depth=1))
        causes = FlatList(causes)
    elif isinstance(cause, BaseException):
        causes = Except.wrap(cause, stack_depth=1)
    else:
        causes = None
        Log.error("can only accept Exception, or list of exceptions")

    trace = exceptions.get_stacktrace(stack_depth + 1)

    if add_to_trace:
        cause[0].trace.extend(trace[1:])

    e = Except(
        context=exceptions.ERROR,
        template=template,
        params=params,
        cause=causes,
        trace=trace,
    )
    raise_from_none(e)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self.es.get_properties()

    # GET IDS OF DOCUMENTS
    # NOTE(review): "size": 10000 caps the update to the first 10k matches — confirm acceptable
    results = self.es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "filter": jx_expression(command.where).to_esfilter(Null)
        }},
        "size": 10000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_variable_name(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            # CONSTANT ASSIGNMENT VIA SCRIPT
            v = scrub(v)
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_es_script(schema).script(schema)})

    if results.hits.hits:
        # BUILD THE _bulk BODY: ALTERNATE ACTION LINE AND SCRIPT LINE PER DOC
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
                updates.append(s)
        content = ("\n".join(value2json(c) for c in updates) + "\n")
        response = self.es.cluster.post(
            self.es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"wait_for_active_shards": self.settings.wait_for_active_shards}
        )
        if response.errors:
            Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
def _iter():
    """
    GENERATOR: YIELD (group_number, rows) PAIRS, rows BEING AT MOST
    max_size ROWS OF data (CLOSED OVER)
    """
    group_num = 0
    acc = FlatList()
    try:
        for row_num, row in enumerate(data):
            acc.append(row)
            if (row_num + 1) % max_size == 0:
                yield group_num, acc
                group_num += 1
                acc = FlatList()
        if acc:
            # FINAL, POSSIBLY SHORT, GROUP
            yield group_num, acc
    except Exception as e:
        e = Except.wrap(e)
        if acc:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield group_num, acc
        Log.error("Problem inside jx.groupby", e)
def _tuple(template, data, fields, depth, output):
    """
    APPEND TO output ONE TUPLE PER ROW OF data, BUILT BY EXTENDING template
    WITH THE REQUESTED fields; RECURSES WHEN A FIELD CROSSES INTO NESTED ROWS.
    """
    deep_path = None
    deep_fields = FlatList()  # FIELDS THAT NEED A DEEPER (RECURSIVE) PASS
    for d in data:
        record = template
        for f in fields:
            index, children, record = _tuple_deep(d, f, depth, record)
            if index:
                path = f.value[0:index:]
                deep_fields.append(f)
                # ONLY ONE NESTED BRANCH CAN BE FOLLOWED PER TUPLE
                if deep_path and path != deep_path:
                    Log.error("Dangerous to select into more than one branch at time")
        # NOTE(review): children comes from the LAST field iterated (and is
        # undefined when fields is empty) — presumably fields is never empty
        # and all deep fields share one branch; confirm
        if not children:
            output.append(record)
        else:
            _tuple(record, children, deep_fields, depth + 1, output)
    return output
def _iter():
    """
    GENERATOR: YIELD (group_number, FlatList_of_rows) PAIRS, EACH HOLDING AT
    MOST size ROWS OF data (CLOSED OVER).
    """
    g = 0
    out = []
    try:
        for i, d in enumerate(data):
            out.append(d)
            if (i + 1) % size == 0:
                yield g, FlatList(vals=out)
                g += 1
                out = []
        if out:
            # FINAL, POSSIBLY SHORT, GROUP
            yield g, FlatList(vals=out)
    except Exception as e:
        e = Except.wrap(e)
        if out:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            # CONSISTENCY FIX: WRAP IN FlatList LIKE EVERY OTHER YIELD
            # (ORIGINAL YIELDED A BARE list ONLY ON THIS PATH)
            yield g, FlatList(vals=out)
        Log.error("Problem inside jx.chunk", e)
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    # lookup[i][c] == KEY FOR PARTITION c OF EDGE i (TRAILING None FOR allowNulls)
    lookup = [[
        getKey[i](p)
        for p in e.domain.partitions + ([None] if e.allowNulls else [])
    ] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # TRANSLATE A COORDINATE TUPLE INTO {edge_name: partition_key} LEAVES
        output = wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by vs groupby above — presumably both exist
            # on the underlying matrix; confirm
            i, v = transpose(*self.data[s.name].group_by(selector))
            values.append(v)

        output = transpose(coord, [
            Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def select(data, field_name):
    """
    return list with values from field_name

    :param data: Cube, PartFlatList, UniqueIndex, dict, OR PLAIN ITERABLE OF ROWS
    :param field_name: FIELD NAME, {"value": name} MAP, OR LIST OF FIELDS
    :return: FlatList (OR data ITSELF FOR "*"/"." SELECTS)
    """
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))

    if isinstance(data, PartFlatList):
        return data.select(field_name)

    if isinstance(data, UniqueIndex):
        data = data._data.values()  # THE SELECT ROUTINE REQUIRES dicts, NOT Data WHILE ITERATING

    if is_data(data):
        return select_one(data, field_name)

    if is_data(field_name):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            # IDENTITY SELECT
            return data
        if field_name.value:
            # SIMPLIFY {"value":value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        path = split_field(field_name)
        if len(path) == 1:
            return FlatList([d[field_name] for d in data])
        else:
            # DEEP FIELD: DELEGATE TO THE RECURSIVE HELPER
            output = FlatList()
            flat_list._select1(data, path, 0, output)
            return output
    elif is_list(field_name):
        keys = [_select_a_field(wrap(f)) for f in field_name]
        return _select(Data(), unwrap(data), keys, 0)
    else:
        keys = [_select_a_field(field_name)]
        return _select(Data(), unwrap(data), keys, 0)
def remove_instances(self, net_new_utility):
    """
    SHUT DOWN ENOUGH RUNNING INSTANCES TO SHED -net_new_utility OF UTILITY.

    :param net_new_utility: NEGATIVE WHEN UTILITY MUST BE REMOVED
    :return: THE REMAINING (ADJUSTED) net_new_utility
    """
    instances = self.running_instances()

    # FIND COMBO THAT WILL SHUTDOWN WHAT WE NEED EXACTLY, OR MORE
    remove_list = []
    for acceptable_error in range(0, 8):
        remaining_utility = -net_new_utility
        remove_list = FlatList()
        for s in instances:
            utility = coalesce(s.markup.type.utility, 0)
            if utility <= remaining_utility + acceptable_error:
                remove_list.append(s)
                remaining_utility -= utility
        if remaining_utility <= 0:
            net_new_utility = -remaining_utility
            break

    if not remove_list:
        return net_new_utility

    # SEND SHUTDOWN TO EACH INSTANCE
    Log.note("Shutdown {{instances}}", instances=remove_list.id)
    # BUG FIX: PAIR EACH THREAD WITH ITS INSTANCE. THE ORIGINAL REFERENCED THE
    # COMPREHENSION VARIABLE i AFTER THE COMPREHENSION, WHICH IS A NameError ON
    # PY3 AND ALWAYS THE LAST INSTANCE ON PY2 — THE WARNING REPORTED A WRONG id.
    remove_threads = [
        (i, Thread.run("teardown for " + text(i.id), self.instance_manager.teardown, i))
        for i in remove_list
    ]
    for i, t in remove_threads:
        try:
            t.join()
        except Exception as e:
            Log.warning("Teardown of {{id}} failed", id=i.id, cause=e)

    remove_spot_requests = remove_list.spot_instance_request_id

    # TERMINATE INSTANCES
    self.ec2_conn.terminate_instances(instance_ids=remove_list.id)

    # TERMINATE SPOT REQUESTS
    self.ec2_conn.cancel_spot_instance_requests(request_ids=remove_spot_requests)
    return net_new_utility
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS

    :param edges: EDGES TO GROUP BY; REMAINING EDGES STAY INSIDE EACH SLICE
    :return: ITERABLE OF (coord_record, value_or_Cube) PAIRS
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    # lookup[i][c] == KEY FOR PARTITION c OF EDGE i (TRAILING None FOR allowNulls)
    lookup = [[
        getKey[i](p)
        for p in e.domain.partitions + ([None] if e.allowNulls else [])
    ] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # TRANSLATE A COORDINATE TUPLE INTO {edge_name: partition_key} LEAVES
        output = wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by vs groupby above — presumably both exist
            # on the underlying matrix; confirm
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = transpose(coord, [
            Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum.schema
    output = FlatList()  # output[depth] == LIST OF AggsDecoder AT THAT NESTED DEPTH

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        # COLLECT vars_ USED BY THIS EDGE, VALIDATING THEM AGAINST THE SCHEMA
        if edge.value != None and not isinstance(edge.value, NullOp):
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema.leaves(v, meta=True):
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.range:
            vars_ = edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            # MAP DIMENSION FIELDS TO THEIR ES COLUMNS
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            # EVERY PARTITION HAS ITS OWN where CLAUSE; UNION THEIR VARIABLES
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()

        try:
            vars_ |= edge.value.vars()
            # DEPTH == LENGTH OF nested_path MINUS THE TOP LEVEL
            depths = set(
                len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v))
            if -1 in depths:
                Log.error("Do not know of column {{column}}",
                          column=unwraplist([v for v in vars_ if schema[v] == None]))
            if len(depths) > 1:
                Log.error(
                    "expression {{expr|quote}} spans tables, can not handle",
                    expr=edge.value)
            max_depth = MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception as e:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        output[max_depth].append(AggsDecoder(edge, query, limit))
    return output
def _select(template, data, fields, depth):
    """
    SELECT fields FROM EACH ROW IN data, USING template AS THE STARTING RECORD.

    :param template: BASE RECORD, COPIED FOR EACH ROW OF data
    :param data: ITERABLE OF plain dicts (Data IS REJECTED BELOW)
    :param fields: FIELD DESCRIPTORS PASSED TO _select_deep
    :param depth: CURRENT NESTING DEPTH INTO THE PATHS
    :return: FlatList OF SELECTED RECORDS
    """
    output = FlatList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])  # FIELDS THAT REQUIRE A DEEPER (RECURSIVE) SELECT
    for d in data:
        if d.__class__ is Data:
            Log.error(
                "programmer error, _select can not handle Data, only dict")
        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            # KEEP THE FIRST NON-None children FOUND ACROSS ALL fields
            children = c if children is None else children
            if index:
                # index MARKS HOW FAR INTO f.value WE DESCENDED; path IS THE SHARED PREFIX
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(
                        f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                # ONLY ONE NESTED BRANCH CAN BE FOLLOWED PER SELECT
                if path[:short:] != deep_path[:short:]:
                    Log.error(
                        "Dangerous to select into more than one branch at time"
                    )
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            # RECURSE INTO THE NESTED ROWS, ONE LEVEL DEEPER
            output.extend(_select(record, children, deep_fields, depth + 1))
    return output
def __init__(self, **desc):
    """
    BUILD A "set" DOMAIN FROM desc.partitions; THE SHAPE OF THE PARTITIONS
    DETERMINES HOW key AND THE LOOKUP map ARE CONSTRUCTED.
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}   # PART KEY -> ORDINAL POSITION
    self.NULL = Null
    self.partitions = FlatList()

    if isinstance(self.key, set):
        Log.error("problem")

    if isinstance(desc.partitions[0], (int, float, text_type)):
        # ASSMUE PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.order[None] = len(desc.partitions)  # NULL SORTS LAST
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # COMPOSITE KEY OVER DIMENSION FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and is_container(desc.key):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and is_data(desc.partitions[0][desc.key]):
        # PARTITION KEY VALUE IS ITSELF STRUCTURED
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        # SIMPLE SCALAR KEY: PLAIN dict LOOKUP PLUS ORDER
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not hanldle")

    self.label = coalesce(self.label, "name")
def verify_allowance(self, user, resource):
    """
    VERIFY IF user CAN ACCESS resource
    :param user:
    :param resource:
    :return: ALLOWANCE CHAIN (LIST OF GRANT RECORDS FROM resource BACK TO ROOT),
             OR EMPTY LIST WHEN NO CHAIN EXISTS
    """
    user = wrap(user)
    resource = wrap(resource)
    # ALL PERMISSION ROWS GRANTED TO THIS USER
    resources = self.db.query(
        sql_query({
            "select": ["resource", "owner"],
            "from": PERMISSION_TABLE,
            "where": {"eq": {"user": user._id}},
        }))

    for r in resources.data:
        record = Data(zip(resources.header, r))
        if record.resource == resource._id:
            if record.owner == ROOT_USER._id:
                # DIRECT GRANT FROM ROOT; CHAIN IS COMPLETE
                return FlatList(vals=[{
                    "resource": resource,
                    "user": user,
                    "owner": ROOT_USER
                }])
            else:
                # GRANTED BY SOMEONE ELSE; VERIFY THE OWNER'S OWN ALLOWANCE
                cascade = self.verify_allowance(
                    wrap({"_id": record.owner}), resource)
                if cascade:
                    cascade.append({
                        "resource": resource,
                        "user": user,
                        "owner": record.owner
                    })
                    return cascade
        else:
            # NOT THE TARGET RESOURCE; TREAT THE GRANT AS GROUP MEMBERSHIP
            # AND CHECK WHETHER THE GROUP CAN ACCESS resource
            group = record.resource
            cascade = self.verify_allowance(wrap({"_id": group}), resource)
            if cascade:
                cascade.append({
                    "group": group,
                    "user": user,
                    "owner": record.owner
                })
                return cascade

    return []
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH
    {"field":field_name, "sort":direction}

    :param data: LIST/ITERABLE TO SORT (None RETURNS Null)
    :param fieldnames: SORT SPEC
    :param already_normalized: SKIP _normalize_sort WHEN THE SPEC IS ALREADY FORMAL
    :return: FlatList OF UNWRAPPED, SORTED ROWS
    """
    from functools import cmp_to_key  # PY3 FIX: sorted() NO LONGER TAKES A cmp FUNCTION

    try:
        if data == None:
            return Null

        if not fieldnames:
            # NO SPEC: NATURAL VALUE ORDER
            # PY3 FIX: WAS sorted(data, value_compare) — POSITIONAL cmp IS PY2-ONLY
            return wrap(sorted(data, key=cmp_to_key(value_compare)))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            # COMPARE FIELD-BY-FIELD; FIRST NON-ZERO RESULT WINS
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:  # PY3 FIX: "except Exception, e" IS PY2-ONLY SYNTAX
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            output = FlatList([unwrap(d) for d in sorted(data, key=cmp_to_key(comparer))])
        elif hasattr(data, "__iter__"):
            output = FlatList([unwrap(d) for d in sorted(list(data), key=cmp_to_key(comparer))])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        # MATCHES THE OTHER sort() IN THIS CODEBASE; THE ORIGINAL try HAD NO
        # HANDLER, WHICH IS A SYNTAX ERROR
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
def datawrap(v):
    """
    RETURN v WRAPPED FOR DOT-ACCESS: dicts BECOME Data (NO COPY), lists AND
    tuples BECOME FlatList, GENERATORS ARE WRAPPED LAZILY, PRIMITIVES AND
    ALREADY-WRAPPED TYPES PASS THROUGH.
    """
    type_ = _get(v, CLASS)

    if type_ is dict:
        m = Data()
        _set(m, SLOT, v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    elif type_ is tuple:
        return FlatList(v)
    elif type_ is list:
        return FlatList(v)
    # NOTE(review): none_type APPEARS TWICE IN THIS TUPLE — harmless redundancy
    elif type_ in (Data, DataObject, none_type, FlatList, text, binary_type, int, float, Decimal, datetime, date, NullType, none_type):
        return v
    elif type_ in generator_types:
        # WRAP EACH YIELDED VALUE, LAZILY
        return (wrap(vv) for vv in v)
    elif isinstance(v, (text, binary_type, int, float, Decimal, datetime, date, FlatList, NullType, Mapping, none_type)):
        # SUBCLASSES OF THE PASS-THROUGH TYPES
        return v
    elif hasattr(v, "__data__"):
        return v.__data__()
    else:
        return DataObject(v)
def remove_instances(self, net_new_utility): instances = self.running_instances() # FIND COMBO THAT WILL SHUTDOWN WHAT WE NEED EXACTLY, OR MORE remove_list = [] for acceptable_error in range(0, 8): remaining_utility = -net_new_utility remove_list = FlatList() for s in instances: utility = coalesce(s.markup.type.utility, 0) if utility <= remaining_utility + acceptable_error: remove_list.append(s) remaining_utility -= utility if remaining_utility <= 0: net_new_utility = -remaining_utility break if not remove_list: return net_new_utility # SEND SHUTDOWN TO EACH INSTANCE Log.note("Shutdown {{instances}}", instances=remove_list.id) for i in remove_list: try: self.instance_manager.teardown(i) except Exception as e: Log.warning("Teardown of {{id}} failed", id=i.id, cause=e) remove_spot_requests = remove_list.spot_instance_request_id # TERMINATE INSTANCES self.ec2_conn.terminate_instances(instance_ids=remove_list.id) # TERMINATE SPOT REQUESTS self.ec2_conn.cancel_spot_instance_requests(request_ids=remove_spot_requests) return net_new_utility
def normalize_sort(sort=None):
    """
    CONVERT THE MANY ACCEPTED sort FORMATS TO A NORMAL FORM:
    A LIST OF {"value": expr, "sort": direction} OBJECTS
    """
    if not sort:
        return Null

    normalized = FlatList()
    for term in listwrap(sort):
        if is_text(term) or mo_math.is_integer(term):
            # BARE FIELD NAME (OR TUPLE POSITION): ASCENDING
            normalized.append({"value": term, "sort": 1})
            continue
        if not term.field and not term.value and term.sort == None:
            # ASSUME {name: sort} FORM
            for name, direction in term.items():
                normalized.append({"value": name, "sort": sort_direction[direction]})
            continue
        normalized.append({
            "value": coalesce(term.field, term.value),
            "sort": coalesce(sort_direction[term.sort], 1),
        })
    return wrap(normalized)
def normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    RETURNS A LIST OF {"value": expr, "sort": direction} OBJECTS
    """
    if not sort:
        return Null

    output = FlatList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            # BARE FIELD NAME (OR TUPLE POSITION): ASCENDING
            output.append({"value": s, "sort": 1})
        elif not s.field and not s.value and s.sort == None:
            # ASSUME {name: sort} FORM
            for n, v in s.items():
                output.append({"value": n, "sort": sort_direction[v]})
        else:
            output.append({"value": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)
def addParts(parentPart, childPath, count, index):
    """
    RECURSIVELY BUILD A PARTITION hierarchy, CONSUMING ONE LEVEL OF childPath
    PER CALL; count IS THE NUMBER FOUND FOR THE CURRENT PATH
    """
    if index == None:
        index = 0
    if index == len(childPath):
        return  # WHOLE PATH CONSUMED

    child = childPath[index]
    parentPart.count = coalesce(parentPart.count, 0) + count
    if parentPart.partitions == None:
        parentPart.partitions = FlatList()

    # DESCEND INTO AN EXISTING PARTITION WITH THE SAME NAME, IF ANY
    for existing in parentPart.partitions:
        if existing.name == child.name:
            addParts(existing, childPath, count, index + 1)
            return

    # OTHERWISE ADOPT child AS A NEW PARTITION AND DESCEND INTO IT
    parentPart.partitions.append(child)
    addParts(child, childPath, count, index + 1)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self.es.get_properties()

    # GET IDS OF DOCUMENTS
    # NOTE(review): "size": 10000 caps the update to the first 10k matches — confirm acceptable
    results = self.es.search({
        "fields": ["_id"],
        "query": {"filtered": {
            "filter": jx_expression(command.where).to_es14_filter(Null)
        }},
        "size": 10000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_variable_name(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            # CONSTANT ASSIGNMENT VIA SCRIPT
            v = scrub(v)
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_es14_script(schema).script(schema)})

    if results.hits.hits:
        # BUILD THE _bulk BODY: ALTERNATE ACTION LINE AND SCRIPT LINE PER DOC
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({"update": {"_id": h._id}})
                updates.append(s)
        content = ("\n".join(value2json(c) for c in updates) + "\n")
        response = self.es.cluster.post(
            self.es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"wait_for_active_shards": self.settings.wait_for_active_shards}
        )
        if response.errors:
            Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
def __getitem__(self, index):
    """
    Index access with flat-slice semantics: slices are clamped to the list
    bounds (never raising), and a scalar index out of range yields Null
    rather than IndexError.
    """
    self._convert()

    if isinstance(index, slice):
        # IMPLEMENT FLAT SLICES (for i not in range(0, len(self)): assert self[i]==None)
        if index.step is not None:
            from mo_logs import Log
            Log.error(
                "slice step must be None, do not know how to deal with values"
            )
        length = len(self.list)
        start = min(max(index.start, 0), length)
        stop = index.stop
        stop = length if stop is None else max(min(stop, length), 0)
        return FlatList(self.list[start:stop])

    # SIMPLE INDEX: OUT-OF-RANGE IS Null, NOT AN EXCEPTION
    if 0 <= index < len(self.list):
        return wrap(self.list[index])
    return Null
def get_selects(query):
    """
    Translate query.select clauses into (new_select, split_select):
    new_select is the list of output-column descriptors ({name, value, put,
    pull}) used by the formatter, and split_select maps each nested path to
    the ESSelect that accumulates the ES fields/scripts needed at that depth.
    """
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        # LAZILY CREATE ONE ESSelect PER NESTED PATH
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0  # COLUMN POSITION IN THE FORMATTED OUTPUT
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var)
                )
                if c.jx_type == NESTED:
                    # NESTED ARRAYS MUST COME FROM _source
                    get_select(".").set_op = True
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                            "pull": get_pull_source(c.es_column),
                        }
                    )
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                        }
                    )
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append(
                    {
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source("."),
                    }
                )
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                            len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name)
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(pre_child, s_column)
                                        ),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                }
                            )
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": ".",
                                        },
                                        "pull": lambda row: row._id,
                                    }
                                )
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                        "pull": get_pull_source(c.es_column),
                                    }
                                )
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                    }
                                )
                        else:
                            # LEAF LIVES IN A DEEPER NESTED PATH; ACCUMULATE
                            # INNER-HIT VALUES INTO ONE DOCUMENT PER ROW
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])
                                ),
                                s_column,
                            )

                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(s_column, unnest_path(c_nested_path))
                                ),
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child,
                                    },
                                    "pull": pull,
                                }
                            )
            else:
                # UNKNOWN COLUMN: EMIT A PLACEHOLDER SO OUTPUT SHAPE IS STABLE
                new_select.append(
                    {
                        "name": select.name,
                        "value": Variable("$dummy"),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
            put_index += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PAINLESS SCRIPT(S), SPLIT BY PATH
            split_scripts = split_expression_by_path(
                select.value, schema, lang=Painless
            )
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(
                        Painless[first(script)].partial_eval().to_es_script(schema)
                    )
                }
                new_select.append(
                    {
                        "name": select.name,
                        "pull": jx_expression_to_function(
                            "fields." + literal_field(select.name)
                        ),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
                put_index += 1

    # BACKFILL pull FUNCTIONS FOR COLUMNS THAT DID NOT GET ONE ABOVE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                # _source WAS REQUESTED; PULL FROM IT
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var))
                )
        else:
            Log.error("Do not know what to do")

    return new_select, split_select
def es_setop(es, query):
    """
    Execute a set-op (plain row-listing) query against ES: translate the
    select clauses to ES fields/scripts, split the where clause by nested
    path, POST the query, and format the hits per query.format.
    Returns the formatter's output; raises via Log.error on failure.
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    split_select = {".": ESSelect('.')}

    def get_select(path):
        # LAZILY CREATE ONE ESSelect PER NESTED PATH
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0  # COLUMN POSITION IN THE FORMATTED OUTPUT
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # NESTED ARRAYS MUST COME FROM _source
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # LEAF LIVES IN A DEEPER NESTED PATH; ACCUMULATE
                            # INNER-HIT VALUES INTO ONE DOCUMENT PER ROW
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # UNKNOWN COLUMN: EMIT A PLACEHOLDER SO OUTPUT SHAPE IS STABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PAINLESS SCRIPT(S), SPLIT BY PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # BACKFILL pull FUNCTIONS FOR COLUMNS THAT DID NOT GET ONE ABOVE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                # _source WAS REQUESTED; PULL FROM IT
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    # BUILD AND RUN THE ES QUERY
    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def __init__(self, middle=None, *args, **kwargs):
    """
    :param middle: fraction of samples (centered) kept when aggregating;
                   the remainder are treated as outliers
    Extra positional/keyword arguments are accepted and ignored.
    """
    object.__init__(self)
    self.samples = FlatList()
    self.middle = middle
def getDomain(self, **kwargs):
    """
    Build a Domain from this dimension's partitions (or edges), copying at
    most self.limit parts.  kwargs.depth selects how deep into
    sub-partitions to reach (0 = top level, 1 = one level of children).
    Raises via Log.error for depth > 1.
    """
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields)-1 if is_list(self.fields) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        # TOP-LEVEL PARTITIONS ONLY
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL OF SUB-PARTITIONS, PREFIXING NAMES WITH PARENT NAME
        partitions = FlatList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception as e:
                Log.error("", e)
    else:
        Log.error("deeper than 2 is not supported yet")

    return Domain(
        type=self.type,
        name=self.name,
        partitions=wrap(partitions),
        min=self.min,
        max=self.max,
        interval=self.interval,
        # THE COMPLICATION IS THAT SOMETIMES WE WANT SIMPLE PARTITIONS, LIKE
        # STRINGS, DATES, OR NUMBERS.  OTHER TIMES WE WANT PARTITION OBJECTS
        # WITH NAME, VALUE, AND OTHER MARKUP.
        # USUALLY A "set" IS MEANT TO BE SIMPLE, BUT THE end() FUNCTION IS
        # OVERRIDES EVERYTHING AND IS EXPLICIT.  - NOT A GOOD SOLUTION BECAUSE
        # end() IS USED BOTH TO INDICATE THE QUERY PARTITIONS *AND* DISPLAY
        # COORDINATES ON CHARTS
        # PLEASE SPLIT end() INTO value() (replacing the string value) AND
        # label() (for presentation)
        value="name" if not self.value and self.partitions else self.value,
        key="value",
        label=coalesce(self.label, (self.type == "set" and self.name)),
        end=coalesce(self.end, (self.type == "set" and self.name)),
        isFacet=self.isFacet,
        dimension=self
    )
class SetDomain(Domain):
    """
    A Domain over an explicit list of partitions, each addressable by a key.

    Depending on the shape of desc.partitions and desc.key, the constructor
    chooses how to index the parts (plain dict or UniqueIndex) and records
    their order for dataIndex lookups.
    """
    # NULL       - THE PART RETURNED FOR MISSING KEYS
    # partitions - LIST OF {name, value, dataIndex} dicts
    # map        - MAP FROM key TO part
    # order      - MAP FROM key TO dataIndex
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()

        if isinstance(self.key, set):
            Log.error("problem")

        if isinstance(desc.partitions[0], (int, float, text_type)):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            # FIX: self.map WAS NEVER INITIALIZED ON THIS BRANCH (AttributeError
            # on a __slots__ class); SimpleSetDomain's equivalent branch sets it
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
        elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and is_container(desc.key):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and is_data(desc.partitions[0][desc.key]):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif desc.key == None:
            Log.error("Domains must have keys")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

    def compare(self, a, b):
        # ORDER PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # UNKNOWN KEYS SORT AFTER ALL PARTS
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception as e:
            Log.error("problem", e)

    def getPartByKey(self, key):
        # MISSING KEYS MAP TO THE NULL PART
        try:
            canonical = self.map.get(key, None)
            if not canonical:
                return self.NULL
            return canonical
        except Exception as e:
            Log.error("problem", e)

    def getKey(self, part):
        return part[self.key]

    def getKeyByIndex(self, index):
        return self.partitions[index][self.key]

    def getEnd(self, part):
        # end() VALUE: EITHER A NAMED PROPERTY, OR THE WHOLE PART
        if self.value:
            return part[self.value]
        else:
            return part

    def getLabel(self, part):
        return part[self.label]

    def __data__(self):
        output = Domain.__data__(self)
        output.partitions = self.partitions
        return output
def decode(json):
    """
    THIS IS CURRENTLY 50% SLOWER THAN PyPy DEFAULT IMPLEMENTATION

    THE INTENT IS TO NEVER ACTUALLY PARSE ARRAYS OF PRIMITIVE VALUES, RATHER
    FIND THE START AND END OF THOSE ARRAYS AND SIMPLY STRING COPY THEM TO THE
    INEVITABLE JSON OUTPUT

    Single-pass character state machine with three modes: ARRAY (inside []),
    OBJECT (expecting a property name), VALUE (expecting a property value).
    The outermost container is an implicit array; the parsed document is
    curr[0] at the end.
    """
    var = ""              # MOST RECENT PROPERTY NAME SEEN IN OBJECT MODE
    curr = FlatList()     # CONTAINER CURRENTLY BEING FILLED
    mode = ARRAY
    stack = FlatList()    # ENCLOSING CONTAINERS
    # FIRST PASS SIMPLY GETS STRUCTURE
    i = 0
    while i < len(json):
        c = json[i]
        i += 1
        if mode == ARRAY:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == "]":
                # CLOSE ARRAY; RESUME PARENT'S MODE
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                # TRY TO SKIP A PRIMITIVE-ONLY ARRAY WHOLESALE; jump_array
                # RETURNS None WHEN THE ARRAY MUST BE PARSED ELEMENT-BY-ELEMENT
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr.append(arr)
                    curr = arr
                    mode = ARRAY
                else:
                    curr.append(arr)
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr.append(obj)
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                # NOTE(review): appends to curr.children, unlike every other
                # branch which appends to curr directly — looks suspicious;
                # confirm FlatList/list exposes .children, else this raises
                curr.children.append(val)
            else:
                # NOTE(review): parsed constant is discarded here (no append),
                # unlike the VALUE-mode branch — verify this is intentional
                i, val = parse_const(i, json)
        elif mode == OBJECT:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == ":":
                mode = VALUE
            elif c == "}":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "\"":
                # PROPERTY NAME
                i, var = fast_parse_string(i, json)
        elif mode == VALUE:
            if c in [" ", "\t", "\n", "\r"]:
                pass
            elif c == "}":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr[var] = arr
                    curr = arr
                    mode = ARRAY
                else:
                    curr[var] = arr
                    mode = OBJECT
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr[var] = obj
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                curr[var] = val
                mode = OBJECT
            else:
                i, val = parse_const(i, json)
                curr[var] = val
                mode = OBJECT
    return curr[0]
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    Rewrites each {name: value} pair in term into concrete ES term/missing
    filters by consulting the matching Dimension in schema_edges; nested
    dicts recurse.  Returns {"and": [...]} of the expanded filters.
    """
    output = FlatList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if is_data(dimension.fields):
                    # EXPECTING A TUPLE
                    # MAP EACH LOCAL FIELD TO ITS ES COLUMN; None MEANS "missing"
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_variable_name(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_variable_name(f) for f in dimension.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(v, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue
            # DIMENSION WITHOUT USABLE fields; FALL BACK TO DOMAIN LOOKUPS
            if len(dimension.fields) == 1 and is_variable_name(dimension.fields[0]):
                if domain.getPartByKey(v) is domain.NULL:
                    output.append({"missing": {"field": dimension.fields[0]}})
                else:
                    output.append({"term": {dimension.fields[0]: v}})
                continue
            if domain.partitions:
                # USE THE PARTITION'S PRE-BUILT esfilter
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif is_data(v):
            # NESTED TERM: RECURSE WITH THE SUB-SCHEMA
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # NOT IN SCHEMA: PASS THROUGH AS A PLAIN term FILTER
        output.append({"term": {k: v}})
    return {"and": output}
def es_deepop(es, query):
    """
    Execute a query that reaches into nested documents: build an inner-hits
    ES query, optionally a second query for parent docs with no nested
    records, then flatten each (hit, inner hit) pair and format the result.
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    # MAP SORT COLUMNS TO THEIR ES NAMES BEFORE TRANSLATING
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()
    i = 0  # OUTPUT COLUMN POSITION
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                # UNKNOWN COLUMN: PLACEHOLDER THAT ALWAYS PULLS NULL
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        # NO NAMESPACE MATCHED; FALL BACK TO DEEPEST NAME
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PYTHON, EVALUATE PER ROW LATER
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        # SECOND CALL: PARENT DOCS WITH NO NESTED RECORDS (RUN ON A THREAD)
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        # YIELD ONE FLATTENED ROW PER INNER HIT, THEN THE NO-NESTED DOCS
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN

    For each clause, any field resolvable at the current depth is evaluated
    immediately; the unresolvable remainder is returned as a residual filter
    for deeper rows.  Returns True, False, or a residual filter dict.
    """
    if filter is TRUE:
        return True
    if filter is FALSE:
        return False

    filter = wrap(filter)

    if filter["and"]:
        # AND: ANY FALSE TERM KILLS IT; TRUE TERMS ARE DROPPED
        result = True
        output = FlatList()
        for a in filter["and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        # OR: ANY TRUE TERM SATISFIES IT; FALSE TERMS ARE DROPPED
        output = FlatList()
        for o in filter["or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        # EQUALITY ON ONE OR MORE FIELDS; UNRESOLVED FIELDS BECOME RESIDUAL
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        # FIELD-TO-FIELD EQUALITY; EITHER SIDE MAY BE UNRESOLVED
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]
        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        # MEMBERSHIP TEST PER FIELD
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        # RANGE COMPARISONS; "gt"/">" ALIASES, gte/lte/lt AS EXPECTED
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        # ACCEPT BOTH {"missing": field} AND {"missing": {"field": field}}
        if is_text(filter.missing):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        # ACCEPT BOTH {"exists": field} AND {"exists": {"field": field}}
        if is_text(filter["exists"]):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})
def _normalize_sort(sort=None):
    """
    Normalize a sort clause into a list of {"value": expression, "sort":
    direction} mappings; an absent clause yields the empty FlatList.
    """
    if sort == None:
        return FlatList.EMPTY

    result = FlatList()
    for term in listwrap(sort):
        if is_text(term):
            # A COLUMN NAME; ASCENDING BY DEFAULT
            result.append({"value": jx_expression(term), "sort": 1})
        elif is_expression(term):
            result.append({"value": term, "sort": 1})
        elif mo_math.is_integer(term):
            # AN ORDINAL, REFERRING TO A SELECTED COLUMN
            result.append({"value": jx_expression({"offset": term}), "sort": 1})
        elif not term.sort and not term.value and all(d in sort_direction for d in term.values()):
            # {name: direction} SHORTHAND FORM
            for name, direction in term.items():
                result.append({"value": jx_expression(name), "sort": sort_direction[direction]})
        elif not term.sort and not term.value:
            Log.error("`sort` clause must have a `value` property")
        else:
            result.append({
                "value": jx_expression(coalesce(term.value, term.field)),
                "sort": sort_direction[term.sort]
            })
    return result
class SimpleSetDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY

    The constructor inspects desc.partitions/desc.key to decide how to index
    parts; primitive indicates the parts are bare values rather than objects.
    """

    __slots__ = [
        "NULL",          # THE value FOR NULL
        "partitions",    # LIST OF {name, value, dataIndex} dicts
        "map",           # MAP FROM value TO name
        "order"          # MAP FROM value TO dataIndex
    ]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        if isinstance(self.key, set):
            Log.error("problem")

        if not desc.key and (len(desc.partitions)==0 or isinstance(desc.partitions[0], (text_type, Number, tuple))):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
                if isinstance(p, (int, float)):
                    text_part = text_type(float(p))  # ES CAN NOT HANDLE NUMERIC PARTS
                    self.map[text_part] = part
                    self.order[text_part] = i
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            # MULTI-FIELD KEY
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and is_container(desc.key):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and is_data(desc.partitions[0][desc.key]):
            # LOOKS LIKE OBJECTS
            # sorted = desc.partitions[desc.key]
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            self.order = {p[self.key]: p.dataIndex for p in desc.partitions}
            self.partitions = desc.partitions
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            # NOTE(review): `and` binds tighter than `or`, so this reads
            # (desc.partitions and all(...where)) or all(...esfilter) —
            # confirm that precedence is intended
            if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
                if not all(desc.partitions.name):
                    Log.error("Expecting all partitions to have a name")
                self.key = "name"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.partitions.append({
                        "where": jx_expression(coalesce(p.where, p.esfilter)),
                        "name": p.name,
                        "dataIndex": i
                    })
                    self.map[p.name] = p
                    self.order[p.name] = i
                return
            elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
                self.key = "value"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.map[p[self.key]] = p
                    self.order[p[self.key]] = i
                self.primitive = False
            else:
                Log.error("Domains must have keys, or partitions")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if hasattr(desc.partitions, "__iter__"):
            self.partitions = wrap(list(desc.partitions))
        else:
            Log.error("expecting a list of partitions")

    def compare(self, a, b):
        # ORDER PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # UNKNOWN KEYS SORT AFTER ALL PARTS
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception as e:
            Log.error("problem", e)

    def getPartByKey(self, key):
        # MISSING KEYS MAP TO THE NULL PART
        try:
            canonical = self.map.get(key)
            if not canonical:
                return self.NULL
            return canonical
        except Exception as e:
            Log.error("problem", e)

    def getPartByIndex(self, index):
        return self.partitions[index]

    def getKeyByIndex(self, index):
        if index < 0 or index >= len(self.partitions):
            return None
        return self.partitions[index][self.key]

    def getKey(self, part):
        return part[self.key]

    def getEnd(self, part):
        # end() VALUE: EITHER A NAMED PROPERTY, OR THE WHOLE PART
        if self.value:
            return part[self.value]
        else:
            return part

    def getLabel(self, part):
        return part[self.label]

    def __data__(self):
        output = Domain.__data__(self)
        output.partitions = self.partitions
        return output
def __init__(self, **desc):
    """
    BUILD A SET-TYPE DOMAIN FROM A DESCRIPTION

    desc MAY PROVIDE:
    * partitions - LIST OF PARTS (PRIMITIVES, OR PART OBJECTS, OR FILTERED PARTS)
    * key - PROPERTY (OR LIST OF PROPERTIES) THAT UNIQUELY IDENTIFIES EACH PART
    * dimension.fields - MULTI-FIELD KEY, HANDLED WITH A UniqueIndex

    SIDE EFFECTS: POPULATES self.map (key -> part), self.order (key -> position),
    self.partitions, self.key, self.label, self.primitive
    """
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = FlatList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    if isinstance(self.key, set):
        # A set IS NOT AN ACCEPTABLE KEY TYPE
        Log.error("problem")

    if not desc.key and (len(desc.partitions)==0 or isinstance(desc.partitions[0], (text_type, Number, tuple))):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)  # NULL PART SORTS LAST
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
            if isinstance(p, (int, float)):
                # ES CAN NOT HANDLE NUMERIC PARTS: ALSO INDEX THE TEXT FORM
                text_part = text_type(float(p))
                self.map[text_part] = part
                self.order[text_part] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        # MULTI-FIELD KEY: USE A UniqueIndex OVER THE DIMENSION FIELDS
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and is_container(desc.key):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and is_data(desc.partitions[0][desc.key]):
        # LOOKS LIKE OBJECTS
        # sorted = desc.partitions[desc.key]
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        self.order = {p[self.key]: p.dataIndex for p in desc.partitions}
        self.partitions = desc.partitions
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        # NOTE(review): PRECEDENCE IS (A and B) or C — THE `or all(...esfilter)`
        # CLAUSE IS NOT GUARDED BY desc.partitions; LIKELY INTENDED
        # A and (B or C).  LEFT AS-IS — VERIFY AGAINST CALLERS BEFORE CHANGING.
        if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
            # PARTS DEFINED BY FILTER EXPRESSIONS: KEYED BY NAME
            if not all(desc.partitions.name):
                Log.error("Expecting all partitions to have a name")
            self.key = "name"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.partitions.append({
                    # ACCEPT EITHER where OR LEGACY esfilter
                    "where": jx_expression(coalesce(p.where, p.esfilter)),
                    "name": p.name,
                    "dataIndex": i
                })
                self.map[p.name] = p
                self.order[p.name] = i
            return
        elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys, or partitions")
    elif self.key:
        # EXPLICIT KEY: INDEX PARTS BY IT
        # NOTE(review): self.key IS TESTED BEFORE desc.key IS COPIED ONTO IT —
        # presumably Domain.__init__ already set it from desc; confirm
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    else:
        Log.error("Can not hanldle")  # NOTE(review): typo "hanldle" in runtime message — left unchanged

    self.label = coalesce(self.label, "name")

    # NORMALIZE partitions TO A WRAPPED LIST
    if hasattr(desc.partitions, "__iter__"):
        self.partitions = wrap(list(desc.partitions))
    else:
        Log.error("expecting a list of partitions")
def __init__(self, dim, parent, jx):
    """
    BUILD A Dimension FROM ITS DESCRIPTION

    :param dim: DESCRIPTION OF THE DIMENSION (name, type, field(s), edges,
                partitions, where, limit, index, optional path FUNCTION)
    :param parent: ENCLOSING Dimension, OR None FOR A ROOT DIMENSION
    :param jx: QUERY CONTAINER USED TO FETCH PARTS WHEN NONE ARE DECLARED
    """
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    # FULL NAME IS PARENT PATH PLUS OUR OWN NAME
    self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)  # COPY REMAINING DESCRIPTION PROPERTIES ONTO self
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    # INDEX FALLS BACK TO PARENT'S, THEN TO THE CONTAINER SETTINGS
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)
    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)  # RECURSE INTO CHILD DIMENSIONS
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif is_data(fields):
        # CONVERT dict OF name->value TO EDGE DESCRIPTIONS
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        # LIST (OR SINGLE) FIELD NAMES BECOME INDEXED EDGES
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        # NOTE(review): KNOWN/ALGEBRAIC are presumably module-level sets of
        # domain type names — confirm; only non-algebraic known types get parts
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    # QUERY THE CONTAINER FOR THE ACTUAL PARTS
    jx.get_columns()
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not is_list(a):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif is_data(fields):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # GROUP COUNTS BY THE EDGE COMBINATION, KEEPING ONLY NON-EMPTY GROUPS
        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        # NOTE(review): .values()[0] REQUIRES PY2 dict.values() (A LIST) —
        # WOULD NEED list(...) UNDER PY3; confirm the target runtime
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # COMBINE ONE VALUE PER EDGE INTO A SINGLE PART VALUE
            if is_data(fields):
                output = Data()
                for e, v in transpose(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def select(self, fields):
    """
    SELECT fields FROM THE ROWS OF THIS NESTED LIST

    :param fields: A DOTTED FIELD NAME (RETURNS A LIST OF VALUES), A
                   {"value": name} MAPPING (SIMPLIFIED TO ITS value), OR A
                   LIST OF SELECT CLAUSES (RETURNS A FlatList OF Data ROWS)
    """
    if isinstance(fields, Mapping):
        # SIMPLIFY {"value": value} TO JUST THE VALUE
        fields = fields.value

    if isinstance(fields, text_type):
        # A SINGLE DOTTED NAME: RETURN THE LIST OF VALUES AT THAT PATH
        key_parts = split_field(fields)
        if len(key_parts) == 1:
            if self.path[0] == fields:
                return [row[1] for row in self.data]
            return [row[0][fields] for row in self.data]

        # DEPTH OF THE PREFIX THIS NAME SHARES WITH self.path
        mismatch_at = [
            i
            for i, (k, p) in enumerate(zip(key_parts, self.path))
            if k != p
        ]
        common = coalesce(MIN(mismatch_at), len(self.path))
        remainder = key_parts[common:]
        acc = FlatList()
        _select1((wrap(row[common]) for row in self.data), remainder, 0, acc)
        return acc

    if isinstance(fields, list):
        # MULTI-SELECT: BUILD ONE EXTRACTOR PER CLAUSE, THEN APPLY TO EACH ROW
        extractors = []
        for clause in fields:
            if hasattr(clause.value, "__call__"):
                # CLAUSE VALUE IS ALREADY A FUNCTION OF THE ROW
                extractors.append((clause.name, clause.value))
            else:
                # BIND THE FIELD NAME NOW; LOOK IT UP IN EACH ROW LATER
                extractors.append(
                    (clause.name, functools.partial(lambda v, d: d[v], clause.value))
                )

        acc = FlatList()
        for row in self._values():
            rec = Data()
            for out_name, getter in extractors:
                rec[out_name] = getter(row)
            acc.append(rec)
        return acc

    Log.error("multiselect over FlatList not supported")
def es_deepop(es, query):
    """
    EXECUTE A set-op QUERY AGAINST NESTED ("deep") ES DOCUMENTS

    BUILDS A NESTED-AWARE ES QUERY, ISSUES IT (PLUS A SECOND "more" CALL FOR
    PARENT DOCUMENTS WITH NO NESTED CHILDREN, WHEN NEEDED), FLATTENS THE
    INNER HITS, AND FORMATS PER query.format

    :param es: ES CONNECTION
    :param query: JX QUERY (query.frum CARRIES THE SCHEMA/QUERY PATH)
    """
    schema = query.frum.schema
    columns = schema.columns
    query_path = schema.query_path
    # MAP SCHEMA NAMES TO DOCUMENT-PULL FUNCTIONS, FOR EXPRESSION REWRITING
    map_to_local = {k: get_pull(c[0]) for k, c in schema.lookup.items()}

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH: wheres[0] SHALLOW, wheres[1] NESTED
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        # NO DEEP FILTER: ALSO FETCH PARENT DOCS THAT HAVE NO NESTED CHILDREN
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()  # ACCUMULATED SELECT CLAUSES WITH pull/put WIRING

    i = 0  # OUTPUT COLUMN INDEX
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.type not in STRUCT and c.es_column != "_id":
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.names[query_path],
                                "pull": get_pull(c),
                                "nested_path": c.nested_path[0],
                                "put": {"name": literal_field(c.names[query_path]), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = set(c.names[query_path] for c in columns)
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.name.lstrip(".")
                            n.put.name = literal_field(n.name)
                            col_names.add(n.name)
                else:
                    # LEAVES OF A SPECIFIC VARIABLE: EXPAND TO ALL CHILD COLUMNS
                    prefix = schema[s.value.term.var][0].names["."] + "."
                    prefix_length = len(prefix)
                    for c in columns:
                        cname = c.names["."]
                        if cname.startswith(prefix) and c.type not in STRUCT:
                            pull = get_pull(c)
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": s.name + "." + cname[prefix_length:],
                                "pull": pull,
                                "nested_path": c.nested_path[0],
                                "put": {
                                    "name": s.name + "." + literal_field(cname[prefix_length:]),
                                    "index": i,
                                    "child": "."
                                }
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # SELECT WHOLE DOCUMENT: ONE OUTPUT COLUMN PER LEAF COLUMN
                for c in columns:
                    if c.type not in STRUCT and c.es_column != "_id":
                        if len(c.nested_path) == 1:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": c.nested_path[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                        i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                # SINGLE VARIABLE: MAY BE A LEAF, OR A PREFIX OF MANY COLUMNS
                prefix = schema[s.value.var][0]
                if not prefix:
                    net_columns = []
                else:
                    parent = prefix.es_column+"."
                    prefix_length = len(parent)
                    net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in STRUCT]
                if not net_columns:
                    # LEAF COLUMN
                    pull = get_pull(prefix)
                    if len(prefix.nested_path) == 1:
                        es_query.fields += [prefix.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": prefix.nested_path[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERNCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)
                        pull = get_pull(n)
                        if len(n.nested_path) == 1:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": n.nested_path[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix_length:]}
                        })
                i += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PYTHON, EVALUATE PER-DOCUMENT LATER
            expr = s.value
            for v in expr.vars():
                for c in schema[v]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")
            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())
            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []
    def get_more(please_stop):
        # SECOND CALL: PARENT DOCS WITH NO NESTED CHILDREN (RUNS ON A THREAD)
        more.append(es09.util.post(
            es,
            Data(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        # GENERATOR: YIELDS ONE RECORD PER INNER HIT, THEN THE "more" DOCS
        # NOTE(review): t IS MUTATED IN PLACE BETWEEN YIELDS — CONSUMERS MUST
        # NOT RETAIN REFERENCES ACROSS ITERATIONS
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)