def filter(data, where):
    """
    where - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}",
            type=data.__class__.__name__
        )

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW! THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
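# Hedged usage sketch (not from the source): per the docstring above, `where`
# may be a plain callable taking (record, rownum, rows); each record is wrapped
# before the callable sees it, so attribute access works.
rows = [{"a": 1}, {"a": 2}, {"a": 3}]
big = filter(rows, lambda row, rownum, rows: row.a > 1)
# expected: the two records with a > 1, returned as a wrapped list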
def _send_email(self):
    try:
        if not self.accumulation:
            return
        with Closer(connect_to_region(
            self.settings.region,
            aws_access_key_id=unwrap(self.settings.aws_access_key_id),
            aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
        )) as conn:
            # WHO ARE WE SENDING TO
            emails = Data()
            for template, params in self.accumulation:
                content = expand_template(template, params)
                emails[literal_field(self.settings.to_address)] += [content]
                for c in self.cc:
                    if any(c in params.params.error for c in c.contains):
                        emails[literal_field(c.to_address)] += [content]

            # SEND TO EACH
            for to_address, content in emails.items():
                conn.send_email(
                    source=self.settings.from_address,
                    to_addresses=listwrap(to_address),
                    subject=self.settings.subject,
                    body="\n\n".join(content),
                    format="text"
                )

        self.next_send = Date.now() + self.settings.max_interval
        self.accumulation = []
    except Exception as e:
        self.next_send = Date.now() + self.settings.max_interval
        Log.warning("Could not send", e)
def __init__(self, instance_manager, disable_prices=False, kwargs=None):
    self.settings = kwargs
    self.instance_manager = instance_manager
    aws_args = dict(
        region_name=kwargs.aws.region,
        aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
        aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key)
    )
    self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
    self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
    self.price_locker = Lock()
    self.prices = None
    self.price_lookup = None
    self.done_spot_requests = Signal()
    self.net_new_locker = Lock()
    self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
    self.watcher = None
    self.active = None

    self.settings.uptime.bid_percentile = coalesce(self.settings.uptime.bid_percentile, self.settings.bid_percentile)
    self.settings.uptime.history = coalesce(Date(self.settings.uptime.history), DAY)
    self.settings.uptime.duration = coalesce(Duration(self.settings.uptime.duration), Date("5minute"))
    self.settings.max_percent_per_type = coalesce(self.settings.max_percent_per_type, 1)

    if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
        self._start_life_cycle_watcher()
    if not disable_prices:
        self.pricing()
def aggs_iterator(aggs, decoders, coord=True):
    """
    DIG INTO ES'S RECURSIVE aggs DATA-STRUCTURE:
    RETURN AN ITERATOR OVER THE EFFECTIVE ROWS OF THE RESULTS

    :param aggs: ES AGGREGATE OBJECT
    :param decoders:
    :param coord: TURN ON LOCAL COORDINATE LOOKUP
    """
    depth = max(d.start + d.num_columns for d in decoders)

    def _aggs_iterator(agg, d):
        agg = drill(agg)

        if d > 0:
            for k, v in agg.items():
                if k == "_match":
                    v = drill(v)
                    for i, b in enumerate(v.get("buckets", EMPTY_LIST)):
                        b["_index"] = i
                        for a, parts in _aggs_iterator(b, d - 1):
                            yield a, parts + (b,)
                elif k == "_other":
                    for b in v.get("buckets", EMPTY_LIST):
                        for a, parts in _aggs_iterator(b, d - 1):
                            yield a, parts + (Null,)
                elif k == "_missing":
                    b = drill(v)
                    for a, parts in _aggs_iterator(b, d - 1):
                        yield a, parts + (b,)
                elif k.startswith("_join_"):
                    v["key"] = int(k[6:])
                    for a, parts in _aggs_iterator(v, d - 1):
                        yield a, parts + (v,)
        else:
            for k, v in agg.items():
                if k == "_match":
                    v = drill(v)
                    for i, b in enumerate(v.get("buckets", EMPTY_LIST)):
                        b["_index"] = i
                        yield b, (b,)
                elif k == "_other":
                    for b in v.get("buckets", EMPTY_LIST):
                        yield b, (Null,)
                elif k == "_missing":
                    b = drill(v)
                    yield b, (v,)
                elif k.startswith("_join_"):
                    v["_index"] = int(k[6:])
                    yield v, (v,)

    if coord:
        for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):
            coord = tuple(d.get_index(parts) for d in decoders)
            if any(c is None for c in coord):
                continue
            yield parts, coord, a
    else:
        for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):
            yield parts, None, a
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = MIN([self.min, p.min])
            self.max = MAX([self.max, p.max])
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for p, q in itertools.product(parts, parts):
            if p.min <= q.min and q.min < p.max and unwrap(p) is not unwrap(q):
                Log.error("partitions overlap!")

        self.partitions = wrap(parts)
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def extend(self, records):
    """
    JUST SO WE MODEL A Queue
    """
    records = {
        v["id"]: v["value"] if "value" in v else mo_json.json2value(v['json'])
        for v in records
    }
    unwrap(self.data).update(records)
    self.refresh()
    Log.note("{{num}} documents added", num=len(records))
def extend(self, records):
    """
    JUST SO WE MODEL A Queue
    """
    records = {v["id"]: v["value"] for v in records}
    unwrap(self.data).update(records)

    data_as_json = convert.value2json(self.data, pretty=True)
    File(self.filename).write(data_as_json)
    Log.note("{{num}} documents added", num=len(records))
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, text_type):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_variable_name(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
def es_fieldop(es, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name)+[k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([
                {k: unwrap(t.fields).get(v, None) for k, v in s.value.items()}
                for t in T
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(unwrap(t.fields).get(ss, None) for ss in s.value)
                for t in T
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def make_log_from_settings(settings):
    assert settings["class"]

    # IMPORT MODULE FOR HANDLER
    path = settings["class"].split(".")
    class_name = path[-1]
    path = ".".join(path[:-1])
    constructor = None
    try:
        temp = __import__(path, globals(), locals(), [class_name], 0)
        constructor = object.__getattribute__(temp, class_name)
    except Exception as e:
        if settings.stream and not constructor:
            # PROVIDE A DEFAULT STREAM HANDLER
            constructor = StructuredLogger_usingThreadedStream
        else:
            Log.error("Can not find class {{class}}", {"class": path}, cause=e)

    # IF WE NEED A FILE, MAKE SURE DIRECTORY EXISTS
    if settings.filename != None:
        from mo_files import File

        f = File(settings.filename)
        if not f.parent.exists:
            f.parent.create()

    settings['class'] = None
    params = unwrap(settings)
    log_instance = constructor(**params)
    return log_instance
def __setitem__(self, key, value):
    if key == "":
        get_logger().error("key is empty string. Probably a bad idea")
    if isinstance(key, str):
        key = key.decode("utf8")

    d = self
    try:
        value = unwrap(value)
        if key.find(".") == -1:
            if value is None:
                dict.pop(d, key, None)
            else:
                dict.__setitem__(d, key, value)
            return self

        seq = _split_field(key)
        for k in seq[:-1]:
            d = _getdefault(d, k)
        if value == None:
            dict.pop(d, seq[-1], None)
        else:
            dict.__setitem__(d, seq[-1], value)
        return self
    except Exception as e:
        raise e
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
def dict2Multiset(dic):
    if dic == None:
        return None

    from mo_collections.multiset import Multiset

    output = Multiset()
    output.dic = unwrap(dic).copy()
    return output
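# Hedged usage sketch (not from the source): the mapping is copied verbatim
# into the Multiset's internal `dic`, so values are treated as counts.
m = dict2Multiset({"a": 2, "b": 1})
# m.dic == {"a": 2, "b": 1}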
def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def unexpected(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    if cause and not isinstance(cause, Except):
        cause = Except(exceptions.UNEXPECTED, text_type(cause), trace=exceptions._extract_traceback(0))

    trace = exceptions.extract_stack(1)
    e = Except(type=exceptions.UNEXPECTED, template=template, params=params, cause=cause, trace=trace)
    Log.note(
        "{{error}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def post(sql):
    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = FlatList()
    for c, s in enumerate(select):
        data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
        for rownum, value in enumerate(result[c + num_edges]):
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def get(self, key):
    """
    simple `select`
    """
    if not Log:
        _late_import()

    return FlatList(vals=[unwrap(coalesce(_datawrap(v), Null)[key]) for v in _get_list(self)])
def __setattr__(self, key, value):
    d = self._internal_dict
    value = unwrap(value)
    if value is None:
        d = self._internal_dict
        d.pop(key, None)
    else:
        d[key] = value
    return self
def call(self, proc_name, params):
    self._execute_backlog()
    params = [unwrap(v) for v in params]
    try:
        self.cursor.callproc(proc_name, params)
        self.cursor.close()
        self.cursor = self.db.cursor()
    except Exception as e:
        Log.error("Problem calling procedure " + proc_name, e)
def _replace_ref(node, url):
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if isinstance(node, Mapping):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = mo_dots.get_attr(new_value, ref.fragment)

        DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            output = new_value
        elif isinstance(output, text_type):
            Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value)
        else:
            output = unwrap(set_default(output, new_value))

        DEBUG and Log.note("Return {{output}}", output=output)

        return output
    elif isinstance(node, list):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
def reverse(vals):
    # TODO: Test how to do this fastest
    l = len(vals)
    output = [None] * l

    for v in unwrap(vals):
        l -= 1
        output[l] = v

    return wrap(output)
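# Hedged usage sketch (not from the source): fills the output from the end,
# so the wrapped result is the input in reverse order.
reverse([1, 2, 3])
# -> wrapped [3, 2, 1]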
def argparse(defs):
    parser = _argparse.ArgumentParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace = parser.parse_args()
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping) and isinstance(test, Mapping):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, (set, list)) and isinstance(expected, set):
            test = set(test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting {{expected|json|indent}}",
                    test=test,
                    expected=expected
                )
            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)
        elif isinstance(expected, types.FunctionType):
            return expected(test)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )
def select(data, field_name):
    """
    return list with values from field_name
    """
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))

    if isinstance(data, PartFlatList):
        return data.select(field_name)

    if isinstance(data, UniqueIndex):
        data = data._data.values()  # THE SELECT ROUTINE REQUIRES dicts, NOT Data WHILE ITERATING

    if is_data(data):
        return select_one(data, field_name)

    if is_data(field_name):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            return data

        if field_name.value:
            # SIMPLIFY {"value":value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        path = split_field(field_name)
        if len(path) == 1:
            return FlatList([d[field_name] for d in data])
        else:
            output = FlatList()
            flat_list._select1(data, path, 0, output)
            return output
    elif is_list(field_name):
        keys = [_select_a_field(wrap(f)) for f in field_name]
        return _select(Data(), unwrap(data), keys, 0)
    else:
        keys = [_select_a_field(field_name)]
        return _select(Data(), unwrap(data), keys, 0)
def params_pack(params, *args):
    settings = {}
    for a in args:
        for k, v in a.items():
            k = text_type(k)
            if k in settings:
                continue
            settings[k] = v

    output = {str(k): unwrap(settings[k]) for k in params if k in settings}
    return output
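# Hedged usage sketch (not from the source): earlier argument dicts win over
# later ones, and only keys listed in `params` survive.
params_pack(["a", "b"], {"a": 1}, {"a": 2, "b": 3})
# -> {"a": 1, "b": 3}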
def __init__(self, name, data, schema=None):
    # TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION
    data = list(unwrap(data))
    Container.__init__(self)
    if schema == None:
        self._schema = get_schema_from_list(name, data)
    else:
        self._schema = schema
    self.name = name
    self.data = data
    self.locker = Lock()  # JUST IN CASE YOU WANT TO DO MORE THAN ONE THING
def __setitem__(self, i, y):
    try:
        _list = _get_list(self)
        if i <= len(_list):
            for i in range(len(_list), i):
                _list.append(None)
        _list[i] = unwrap(y)
    except Exception as e:
        if not Log:
            _late_import()
        Log.error("problem", cause=e)
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field":field_name, "sort":direction}
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            return wrap(sort_using_cmp(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if is_list(data):
            output = FlatList([unwrap(d) for d in sort_using_cmp(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            output = FlatList(
                [unwrap(d) for d in sort_using_cmp(list(data), cmp=comparer)]
            )
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
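# Hedged usage sketch (not from the source): per the docstring above, a bare
# field name sorts ascending and a {"value": ..., "sort": -1} struct sorts
# descending; rows are wrapped here so the compiled accessor can read them.
rows = wrap([{"a": 3}, {"a": 1}, {"a": 2}])
sort(rows, "a")                         # ascending by a
sort(rows, {"value": "a", "sort": -1})  # descending by a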
def reverse(vals):
    # TODO: Test how to do this fastest
    if not hasattr(vals, "__len__"):
        # MATERIALIZE GENERATORS SO len() WORKS
        vals = list(vals)
    l = len(vals)
    output = [None] * l

    for v in unwrap(vals):
        l -= 1
        output[l] = v

    return wrap(output)
def argparse(defs):
    parser = _ArgParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace, unknown = parser.parse_known_args()
    if unknown:
        Log.warning("Ignoring arguments: {{unknown|json}}", unknown=unknown)
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
def _iadd(self, other):
    if not _get(other, CLASS) in data_types:
        get_logger().error("Expecting a Mapping")
    d = unwrap(self)
    for ok, ov in other.items():
        sv = d.get(ok)
        if sv == None:
            d[ok] = deepcopy(ov)
        elif isinstance(ov, (Decimal, float, long, int)):
            if _get(sv, CLASS) in data_types:
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
            elif is_list(sv):
                d[ok].append(ov)
            else:
                d[ok] = sv + ov
        elif is_list(ov):
            d[ok] = listwrap(sv) + ov
        elif _get(ov, CLASS) in data_types:
            if _get(sv, CLASS) in data_types:
                _iadd(sv, ov)
            elif is_list(sv):
                d[ok].append(ov)
            else:
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
        else:
            if _get(sv, CLASS) in data_types:
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
            else:
                d[ok].append(ov)
    return self
def _iadd(self, other):
    if not isinstance(other, Mapping):
        get_logger().error("Expecting a Mapping")
    d = unwrap(self)
    for ok, ov in other.items():
        sv = d.get(ok)
        if sv == None:
            d[ok] = deepcopy(ov)
        elif isinstance(ov, (Decimal, float, long, int)):
            if isinstance(sv, Mapping):
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
            elif isinstance(sv, list):
                d[ok].append(ov)
            else:
                d[ok] = sv + ov
        elif isinstance(ov, list):
            d[ok] = listwrap(sv) + ov
        elif isinstance(ov, Mapping):
            if isinstance(sv, Mapping):
                _iadd(sv, ov)
            elif isinstance(sv, list):
                d[ok].append(ov)
            else:
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
        else:
            if isinstance(sv, Mapping):
                get_logger().error(
                    "can not add {{stype}} with {{otype}}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
            else:
                d[ok].append(ov)
    return self
def __init__(
    self,
    host,
    user=None,
    port=None,
    config=None,
    gateway=None,
    forward_agent=None,
    connect_timeout=None,
    connect_kwargs=None,
    inline_ssh_env=None,
    key_filename=None,  # part of connect_kwargs
    kwargs=None,
):
    connect_kwargs = set_default(
        {}, connect_kwargs, {"key_filename": File(key_filename).abspath}
    )

    self.stdout = LogStream(host, "stdout")
    self.stderr = LogStream(host, "stderr")
    config = Config(**unwrap(set_default(
        {},
        config,
        {"overrides": {"run": {
            # "hide": True,
            "out_stream": self.stdout,
            "err_stream": self.stderr,
        }}},
    )))

    self.warn = False
    self.conn = _Connection(
        host,
        user,
        port,
        config,
        gateway,
        forward_agent,
        connect_timeout,
        connect_kwargs,
        inline_ssh_env,
    )
def __init__(
    self,
    host,
    index,
    type=None,
    alias=None,
    name=None,
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    Container.__init__(self, None)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.settings = kwargs
    self.name = coalesce(name, alias, index)
    if read_only:
        self._es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs)
    else:
        self._es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

    self.meta = FromESMetadata(kwargs=kwargs)
    self.settings.type = self._es.settings.type
    self.edges = Data()
    self.worker = None

    columns = self.meta.get_columns(table_name=coalesce(name, alias, index))
    self._schema = Schema(coalesce(name, alias, index), columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = any(c.es_column.find(".$") != -1 for c in columns)
    else:
        self.typed = typed
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO: FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = []  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if isinstance(d, list) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None
def leaves(value, prefix=None):
    """
    LIKE items() BUT RECURSIVE, AND ONLY FOR THE LEAVES (non dict) VALUES
    SEE wrap_leaves FOR THE INVERSE

    :param value: THE Mapping TO TRAVERSE
    :param prefix: OPTIONAL PREFIX GIVEN TO EACH KEY
    :return: Data, WHICH EACH KEY BEING A PATH INTO value TREE
    """
    prefix = coalesce(prefix, "")
    output = []
    for k, v in value.items():
        try:
            if isinstance(v, Mapping):
                output.extend(leaves(v, prefix=prefix + literal_field(k) + "."))
            else:
                output.append((prefix + literal_field(k), unwrap(v)))
        except Exception as e:
            get_logger().error("Do not know how to handle", cause=e)
    return output
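# Hedged usage sketch (not from the source): nested mappings flatten into
# (dotted-path, leaf-value) pairs; ordering follows dict iteration order.
leaves({"a": {"b": 1, "c": 2}})
# -> [("a.b", 1), ("a.c", 2)]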
def __init__(
    self,
    host,
    index,
    type=None,
    name=None,
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.settings = kwargs
    self.name = name = coalesce(name, index)
    if read_only:
        self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    self.settings.type = self.es.settings.type
    self.edges = Data()
    self.worker = None

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
        self.typed = typed
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    selects = listwrap(query.select)

    acc, decoders, es_query = build_es_query(selects, query_path, schema, query)

    with Timer("ES query time", verbose=DEBUG) as es_duration:
        result = es.search(es_query)

    # Log.note("{{result}}", result=result)

    try:
        format_time = Timer("formatting", verbose=DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            edges_formatter, groupby_formatter, value_formatter, mime_type = agg_formatters[query.format]
            if query.edges:
                output = edges_formatter(aggs, acc, query, decoders, selects)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, selects)
            else:
                output = value_formatter(aggs, acc, query, decoders, selects)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in agg_formatters:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
def __eq__(self, other):
    if self is other:
        return True

    d = _get(self, "_dict")
    if not isinstance(d, dict):
        return d == other

    if other == None:
        return False
    if not isinstance(other, Mapping):
        return False
    e = unwrap(other)
    for k, v in d.items():
        if e.get(k) != v:
            return False
    for k, v in e.items():
        if d.get(k) != v:
            return False
    return True
def __eq__(self, other):
    if self is other:
        return True

    d = self._internal_dict
    if _get(d, CLASS) is not dict:
        return d == other

    if not d and other == None:
        return False

    if _get(other, CLASS) not in data_types:
        return False
    e = unwrap(other)
    for k, v in d.items():
        if e.get(k) != v:
            return False
    for k, v in e.items():
        if d.get(k) != v:
            return False
    return True
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # plausible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if not isinstance(template, unicode):
        sys.stderr.write("Log.error was expecting a unicode template")
        Log.error("Log.error was expecting a unicode template")

    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    add_to_trace = False
    cause = wrap(unwraplist([Except.wrap(c, stack_depth=1) for c in listwrap(cause)]))
    trace = exceptions.extract_stack(stack_depth + 1)

    if add_to_trace:
        cause[0].trace.extend(trace[1:])

    e = Except(exceptions.ERROR, template, params, cause, trace)
    raise e
def warning(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if not isinstance(template, text_type):
        Log.error("Log.warning was expecting a unicode template")

    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    if "values" in more_params.keys():
        Log.error("Can not handle a logging parameter by name `values`")

    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
    trace = exceptions.extract_stack(stack_depth + 1)

    e = Except(type=exceptions.WARNING, template=template, params=params, cause=cause, trace=trace)
    Log.note(
        "{{error|unicode}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def add(self, val):
    val = datawrap(val)
    key = value2key(self._keys, val)
    if key == None:
        Log.error("Expecting key to be not None")

    d = self._data.get(key)
    if d is None:
        self._data[key] = unwrap(val)
        self.count += 1
    elif d is not val:
        if self.fail_on_dup:
            Log.error(
                "{{new|json}} with key {{key|json}} already filled with {{old|json}}",
                key=key,
                new=val,
                old=self[val]
            )
        elif DEBUG:
            Log.warning(
                "key {{key|json}} already filled\nExisting\n{{existing|json|indent}}\nValue\n{{value|json|indent}}",
                key=key,
                existing=d,
                value=val
            )
def _insert(self, collection):
    for nested_path, details in collection.items():
        active_columns = wrap(list(details.active_columns))
        rows = details.rows
        num_rows = len(rows)
        table_name = concat_field(self.name, nested_path)

        if table_name == self.name:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        all_columns = meta_columns + active_columns.es_column  # ONLY THE PRIMITIVE VALUE COLUMNS
        command = ConcatSQL(
            SQL_INSERT,
            quote_column(table_name),
            sql_iso(sql_list(map(quote_column, all_columns))),
            SQL_VALUES,
            sql_list(
                sql_iso(sql_list(quote_value(row.get(c)) for c in all_columns))
                for row in unwrap(rows)
            )
        )

        with self.db.transaction() as t:
            t.execute(command)
def _insert(self, collection):
    for nested_path, details in collection.items():
        active_columns = wrap(list(details.active_columns))
        rows = details.rows
        table_name = concat_field(self.sf.fact, nested_path)

        if table_name == self.sf.fact:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        all_columns = meta_columns + active_columns.es_column
        prefix = "INSERT INTO " + quote_table(table_name) + \
                 "(" + ",".join(map(quote_table, all_columns)) + ")"

        # BUILD THE RECORDS
        records = " UNION ALL ".join(
            "\nSELECT " + ",".join(quote_value(row.get(c)) for c in all_columns)
            for row in unwrap(rows)
        )

        self.db.execute(prefix + records)
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            for d in data:
                for p in path:
                    d = _getdefault(d, p)
                output.append((d,))
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
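# Hedged usage sketch (not from the source): a single field name yields
# one-element tuples, one per record.
tuple([{"a": 1, "b": 2}, {"a": 3, "b": 4}], "a")
# -> [(1,), (3,)]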
if retry == None:
    retry = Data(times=1, sleep=0)
elif isinstance(retry, Number):
    retry = Data(times=retry, sleep=1)
else:
    retry = wrap(retry)
    if isinstance(retry.sleep, Duration):
        retry.sleep = retry.sleep.seconds
    set_default(retry, {"times": 1, "sleep": 0})

if b'json' in kwargs:
    kwargs[b'data'] = convert.value2json(kwargs[b'json']).encode("utf8")
    del kwargs[b'json']

try:
    headers = kwargs[b"headers"] = unwrap(coalesce(wrap(kwargs)[b"headers"], {}))
    set_default(headers, {b"accept-encoding": b"compress, gzip"})

    if zip and len(coalesce(kwargs.get(b"data"))) > 1000:
        compressed = convert.bytes2zip(kwargs[b"data"])
        headers[b'content-encoding'] = b'gzip'
        kwargs[b"data"] = compressed

        _to_ascii_dict(headers)
    else:
        _to_ascii_dict(headers)
except Exception as e:
    Log.error("Request setup failure on {{url}}", url=url, cause=e)

errors = []
for r in range(retry.times):
def to_python(self, not_null=False, boolean=False, many=False):
    return text_type(repr(unwrap(json2value(self.json))))
def get_selects(query): schema = query.frum.schema split_select = {".": ESSelect(".")} def get_select(path): es_select = split_select.get(path) if not es_select: es_select = split_select[path] = ESSelect(path) return es_select selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)]) new_select = FlatList() put_index = 0 for select in selects: # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable): term = select.value.term leaves = schema.leaves(term.var) for c in leaves: full_name = concat_field( select.name, relative_field(untype_path(c.name), term.var) ) if c.jx_type == NESTED: get_select(".").set_op = True new_select.append( { "name": full_name, "value": Variable(c.es_column), "put": { "name": literal_field(full_name), "index": put_index, "child": ".", }, "pull": get_pull_source(c.es_column), } ) put_index += 1 else: get_select(c.nested_path[0]).fields.append(c.es_column) new_select.append( { "name": full_name, "value": Variable(c.es_column), "put": { "name": literal_field(full_name), "index": put_index, "child": ".", }, } ) put_index += 1 elif is_op(select.value, Variable): s_column = select.value.var if s_column == ".": # PULL ALL SOURCE get_select(".").set_op = True new_select.append( { "name": select.name, "value": select.value, "put": {"name": select.name, "index": put_index, "child": "."}, "pull": get_pull_source("."), } ) continue leaves = schema.leaves(s_column) # LEAVES OF OBJECT # nested_selects = {} if leaves: if any(c.jx_type == NESTED for c in leaves): # PULL WHOLE NESTED ARRAYS get_select(".").set_op = True for c in leaves: if ( len(c.nested_path) == 1 ): # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES pre_child = join_field( decode_property(n) for n in split_field(c.name) ) new_select.append( { "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": untype_path( relative_field(pre_child, s_column) ), }, "pull": get_pull_source(c.es_column), } ) else: # PULL ONLY WHAT'S NEEDED for c in leaves: c_nested_path = c.nested_path[0] if c_nested_path == ".": if c.es_column == "_id": new_select.append( { "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": ".", }, "pull": lambda row: row._id, } ) elif c.jx_type == NESTED: get_select(".").set_op = True pre_child = join_field( decode_property(n) for n in split_field(c.name) ) new_select.append( { "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": untype_path( relative_field(pre_child, s_column) ), }, "pull": get_pull_source(c.es_column), } ) else: get_select(c_nested_path).fields.append(c.es_column) pre_child = join_field( decode_property(n) for n in split_field(c.name) ) new_select.append( { "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": untype_path( relative_field(pre_child, s_column) ), }, } ) else: es_select = get_select(c_nested_path) es_select.fields.append(c.es_column) child = relative_field( untype_path( relative_field(c.name, schema.query_path[0]) ), s_column, ) pull = accumulate_nested_doc( c_nested_path, Variable( relative_field(s_column, unnest_path(c_nested_path)) ), ) new_select.append( { "name": select.name, "value": select.value, "put": { "name": select.name, "index": put_index, "child": child, }, "pull": pull, } ) else: new_select.append( { "name": select.name, "value": 
Variable("$dummy"), "put": {"name": select.name, "index": put_index, "child": "."}, } ) put_index += 1 else: split_scripts = split_expression_by_path( select.value, schema, lang=Painless ) for p, script in split_scripts.items(): es_select = get_select(p) es_select.scripts[select.name] = { "script": text( Painless[first(script)].partial_eval().to_es_script(schema) ) } new_select.append( { "name": select.name, "pull": jx_expression_to_function( "fields." + literal_field(select.name) ), "put": {"name": select.name, "index": put_index, "child": "."}, } ) put_index += 1 for n in new_select: if n.pull: continue elif is_op(n.value, Variable): if get_select(".").set_op: n.pull = get_pull_source(n.value.var) elif n.value == "_id": n.pull = jx_expression_to_function("_id") else: n.pull = jx_expression_to_function( concat_field("fields", literal_field(n.value.var)) ) else: Log.error("Do not know what to do") return new_select, split_select
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_variable_name(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
def extend(self, values):
    for v in values:
        _get(self, "list").append(unwrap(v))
    return self
def pretty_json(value): try: if value is False: return "false" elif value is True: return "true" elif is_data(value): try: value = unwrap(value) items = sort_using_key(value.items(), lambda r: r[0]) values = [ encode_basestring(k) + PRETTY_COLON + pretty_json(v) for k, v in items if v != None ] if not values: return "{}" elif len(values) == 1: return "{" + values[0] + "}" else: return "{\n" + ",\n".join(indent(v) for v in values) + "\n}" except Exception as e: from mo_logs import Log from mo_math import OR if OR(not is_text(k) for k in value.keys()): Log.error("JSON must have string keys: {{keys}}:", keys=[k for k in value.keys()], cause=e) Log.error("problem making dict pretty: keys={{keys}}:", keys=[k for k in value.keys()], cause=e) elif value in (None, Null): return "null" elif value.__class__ in (binary_type, text): if is_binary(value): value = value.decode('utf8') try: if "\n" in value and value.strip(): return pretty_json({ "$concat": value.split("\n"), "separator": "\n" }) else: return quote(value) except Exception as e: from mo_logs import Log try: Log.note( "try explicit convert of string with length {{length}}", length=len(value)) acc = [QUOTE] for c in value: try: try: c2 = ESCAPE_DCT[c] except Exception: c2 = c c3 = text(c2) acc.append(c3) except BaseException: pass # Log.warning("odd character {{ord}} found in string. Ignored.", ord= ord(c)}, cause=g) acc.append(QUOTE) output = u"".join(acc) Log.note("return value of length {{length}}", length=len(output)) return output except BaseException as f: Log.warning("can not convert {{type}} to json", type=f.__class__.__name__, cause=f) return "null" elif is_list(value): if not value: return "[]" if ARRAY_MAX_COLUMNS == 1: return "[\n" + ",\n".join( [indent(pretty_json(v)) for v in value]) + "\n]" if len(value) == 1: j = pretty_json(value[0]) if j.find("\n") >= 0: return "[\n" + indent(j) + "\n]" else: return "[" + j + "]" js = [pretty_json(v) for v in value] max_len = max(*[len(j) for j in js]) if max_len <= ARRAY_ITEM_MAX_LENGTH and max( *[j.find("\n") for j in js]) == -1: # ALL TINY VALUES num_columns = max( 1, min( ARRAY_MAX_COLUMNS, int( floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2))))) # +2 TO COMPENSATE FOR COMMAS if len(js) <= num_columns: # DO NOT ADD \n IF ONLY ONE ROW return "[" + PRETTY_COMMA.join(js) + "]" if num_columns == 1: # DO NOT rjust IF THERE IS ONLY ONE COLUMN return "[\n" + ",\n".join( [indent(pretty_json(v)) for v in value]) + "\n]" content = ",\n".join( PRETTY_COMMA.join( j.rjust(max_len) for j in js[r:r + num_columns]) for r in xrange(0, len(js), num_columns)) return "[\n" + indent(content) + "\n]" pretty_list = js output = ["[\n"] for i, p in enumerate(pretty_list): try: if i > 0: output.append(",\n") output.append(indent(p)) except Exception: from mo_logs import Log Log.warning( "problem concatenating string of length {{len1}} and {{len2}}", len1=len("".join(output)), len2=len(p)) output.append("\n]") try: return "".join(output) except Exception as e: from mo_logs import Log Log.error("not expected", cause=e) elif hasattr(value, '__data__'): d = value.__data__() return pretty_json(d) elif hasattr(value, '__json__'): j = value.__json__() if j == None: return " null " # TODO: FIND OUT WHAT CAUSES THIS return pretty_json(json_decoder(j)) elif scrub(value) is None: return "null" elif hasattr(value, '__iter__'): return pretty_json(list(value)) elif hasattr(value, '__call__'): return "null" else: try: if int(value) == value: return text(int(value)) except Exception: pass try: if float(value) == value: 
return text(float(value)) except Exception: pass return pypy_json_encode(value) except Exception as e: problem_serializing(value, e)
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        canonical = select = Data(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    canonical.default = coalesce(select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if not select.value or select.value == ".":
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name)
                },
                canonical
            )
            for c in frum.get_leaves()
        ])
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            base_name = select.value[:-2]
            canonical.name = coalesce(select.name, base_name, select.aggregate)
            value = jx_expression(select[:-2])
            if not isinstance(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "name": base_name,
                            "value": LeavesOp("leaves", value),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical
                    )
                    for c in frum.get_columns()
                    if c.type not in STRUCT
                ])
            else:
                output.extend([
                    set_default(
                        {
                            "name": base_name + "." + literal_field(c.name[len(base_name) + 1:]),
                            "value": jx_expression(c.name)
                        },
                        canonical
                    )
                    for c in frum.get_leaves()
                    if c.name.startswith(base_name + ".")
                ])
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
def query(self, query): """ :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT :return: """ if not startswith_field(query['from'], self.sf.fact): Log.error("Expecting table, or some nested table") frum, query['from'] = query['from'], self table = self.sf.tables[relative_field(frum, self.sf.fact)] schema = table.schema query = QueryOp.wrap(query, table=table, schema=schema) new_table = "temp_" + unique_name() if query.format == "container": create_table = "CREATE TABLE " + quote_column(new_table) + " AS " else: create_table = "" if query.groupby and query.format != "cube": op, index_to_columns = self._groupby_op(query, frum) command = create_table + op elif query.groupby: query.edges, query.groupby = query.groupby, query.edges op, index_to_columns = self._edges_op(query, frum) command = create_table + op query.edges, query.groupby = query.groupby, query.edges elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate): op, index_to_columns = self._edges_op(query, frum) command = create_table + op else: op = self._set_op(query, frum) return op result = self.db.query(command) if query.format == "container": output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True) elif query.format == "cube" or (not query.format and query.edges): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name if len(query.edges) == 0 and len(query.groupby) == 0: data = {n: Data() for n in column_names} for s in index_to_columns.values(): data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0])) if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( data=unwrap(data), select=select, meta={"format": "cube"} ) if not result.data: edges = [] dims = [] for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif isinstance(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: domain = SimpleSetDomain(partitions=[]) dims.append(1 if allowNulls else 0) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data[s.name] = Matrix(dims=dims, zeros=0) else: data[s.name] = Matrix(dims=dims) if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()} ) columns = None edges = [] dims = [] for g in query.groupby: g.is_groupby = True for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif e.domain.type == "time": domain = wrap(mo_json.scrub(e.domain)) elif e.domain.type == "duration": domain = wrap(mo_json.scrub(e.domain)) elif isinstance(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() 
if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: if not columns: columns = zip(*result.data) parts = set(columns[i]) if e.is_groupby and None in parts: allowNulls = True parts -= {None} if query.sort[i].sort == -1: domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True))) else: domain = SimpleSetDomain(partitions=jx.sort(parts)) dims.append(len(domain.partitions) + (1 if allowNulls else 0)) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data_cubes = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data_cubes[s.name] = Matrix(dims=dims, zeros=0) else: data_cubes[s.name] = Matrix(dims=dims) r2c = index_to_coordinate(dims) # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM for rownum, row in enumerate(result.data): coord = r2c(rownum) for i, s in enumerate(index_to_columns.values()): if s.is_edge: continue if s.push_child == ".": data_cubes[s.push_name][coord] = s.pull(row) else: data_cubes[s.push_name][coord][s.push_child] = s.pull(row) if query.select == None: select = Null elif isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()} ) elif query.format == "table" or (not query.format and query.groupby): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name data = [] for d in result.data: row = [None for _ in column_names] for s in index_to_columns.values(): if s.push_child == ".": row[s.push_column] = s.pull(d) elif s.num_push_columns: tuple_value = row[s.push_column] if tuple_value == None: tuple_value = row[s.push_column] = [None] * s.num_push_columns tuple_value[s.push_child] = s.pull(d) elif row[s.push_column] == None: row[s.push_column] = Data() row[s.push_column][s.push_child] = s.pull(d) else: row[s.push_column][s.push_child] = s.pull(d) data.append(tuple(unwrap(r) for r in row)) output = Data( meta={"format": "table"}, header=column_names, data=data ) elif query.format == "list" or (not query.edges and not query.groupby): if not query.edges and not query.groupby and any(listwrap(query.select).aggregate): if isinstance(query.select, list): data = Data() for c in index_to_columns.values(): if c.push_child == ".": if data[c.push_name] == None: data[c.push_name] = c.pull(result.data[0]) elif isinstance(data[c.push_name], list): data[c.push_name].append(c.pull(result.data[0])) else: data[c.push_name] = [data[c.push_name], c.pull(result.data[0])] else: data[c.push_name][c.push_child] = c.pull(result.data[0]) output = Data( meta={"format": "value"}, data=data ) else: data = Data() for s in index_to_columns.values(): if not data[s.push_child]: data[s.push_child] = s.pull(result.data[0]) else: data[s.push_child] += [s.pull(result.data[0])] output = Data( meta={"format": "value"}, data=unwrap(data) ) else: data = [] for rownum in result.data: row = Data() for c in index_to_columns.values(): if c.push_child == ".": row[c.push_name] = c.pull(rownum) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(rownum) else: row[c.push_name][c.push_child] = c.pull(rownum) data.append(row) output = Data( 
meta={"format": "list"}, data=data ) else: Log.error("unknown format {{format}}", format=query.format) return output
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if is_text(select):
        canonical = select = Data(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    canonical.default = coalesce(select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if len(select) and not select.value:
        Log.error(BAD_SELECT, select=select)
    elif not select.value or select.value == ".":
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name, schema=schema)
                },
                canonical
            )
            for c in frum.get_leaves()
        ])
    elif is_text(select.value):
        if select.value.endswith(".*"):
            canonical.name = coalesce(select.name, ".")
            value = jx_expression(select[:-2], schema=schema)
            if not is_op(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "value": LeavesOp(value, prefix=select.prefix),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical
                    )
                    for c in frum.get_columns()
                    if c.jx_type not in STRUCT
                ])
            else:
                Log.error("do not know what to do")
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value, schema=schema)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif test is expected:
            return
        elif is_text(expected):
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif is_data(expected) and is_data(test):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif is_data(expected):
            for k, v2 in expected.items():
                if is_text(k):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif is_container(test) and isinstance(expected, set):
            test = set(wrap(t) for t in test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting\n{{expected|json|indent}}",
                    test=test,
                    expected=expected
                )
            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)
        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            if test.__class__.__name__ == "ndarray":  # numpy
                test = test.tolist()
            elif test.__class__.__name__ == "DataFrame":  # pandas
                test = test[test.columns[0]].values.tolist()
            elif test.__class__.__name__ == "Series":  # pandas
                test = test.values.tolist()

            if not expected and test == None:
                return
            if expected == None:
                expected = []  # REPRESENT NOTHING
            for a, b in zip_longest(test, expected):
                assertAlmostEqual(a, b, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )

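# Illustrative calls (hypothetical data): comparison recurses into dicts and
# iterables, and only the keys present in `expected` are checked:
#
#   assertAlmostEqual({"a": 1.00001, "b": "ignored"}, {"a": 1}, places=4)   # passes
#   assertAlmostEqual([1.0, 2.0, 3.0], [1.0, 2.0])   # fails: zip_longest pairs 3.0 with None
#   assertAlmostEqual(4, lambda t: t % 2 == 0)       # a callable `expected` is applied, not asserted
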
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
    """
    :param data: the data we are pulling apart
    :param uid: the uid we are giving this doc
    :param parent_id: the parent id of this (sub)doc
    :param order: the number of siblings before this one
    :param full_path: path to this (sub)doc
    :param nested_path: list of paths, deepest first
    :param row: we will be filling this
    :return:
    """
    table = concat_field(self.sf.fact, nested_path[0])
    insertion = doc_collection[nested_path[0]]
    if not row:
        row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
        insertion.rows.append(row)

    if not isinstance(data, Mapping):
        data = {".": data}

    for k, v in data.items():
        insertion = doc_collection[nested_path[0]]
        cname = concat_field(full_path, literal_field(k))
        value_type = get_type(v)
        if value_type is None:
            continue

        if value_type in STRUCT:
            c = unwraplist([cc for cc in abs_schema[cname] if cc.type in STRUCT])
        else:
            c = unwraplist([cc for cc in abs_schema[cname] if cc.type == value_type])

        if not c:
            # WHAT IS THE NESTING LEVEL FOR THIS PATH?
            deeper_nested_path = "."
            for path, _ in nested_tables.items():
                if startswith_field(cname, path) and len(deeper_nested_path) < len(path):
                    deeper_nested_path = path

            c = Column(
                names={".": cname},
                type=value_type,
                es_column=typed_column(cname, value_type),
                es_index=table,
                nested_path=nested_path
            )
            abs_schema.add(cname, c)
            if value_type == "nested":
                nested_tables[cname] = "fake table"

            required_changes.append({"add": c})

            # INSIDE THE IF BLOCK BECAUSE WE DO NOT WANT TO ADD WHAT columns.get() ALREADY HAS
            insertion.active_columns.add(c)
        elif c.type == "nested" and value_type == "object":
            value_type = "nested"
            v = [v]
        elif len(c.nested_path) < len(nested_path):
            # COLUMN MUST BE MOVED TO THE DEEPER NESTED TABLE
            from_doc = doc_collection.get(c.nested_path[0], None)
            column = c.es_column
            from_doc.active_columns.remove(c)
            abs_schema.remove(cname, c)
            required_changes.append({"nest": (c, nested_path[0])})
            deep_c = Column(
                names={".": cname},
                type=value_type,
                es_column=typed_column(cname, value_type),
                es_index=table,
                nested_path=nested_path
            )
            abs_schema.add(cname, deep_c)
            insertion.active_columns.add(deep_c)

            for r in from_doc.rows:
                r1 = unwrap(r)
                if column in r1:
                    row1 = {
                        UID: self.next_uid(),
                        PARENT: r1["__id__"],
                        ORDER: 0,
                        column: r1[column]
                    }
                    insertion.rows.append(row1)
        elif len(c.nested_path) > len(nested_path):
            insertion = doc_collection[c.nested_path[0]]
            row = {UID: self.next_uid(), PARENT: uid, ORDER: order}
            insertion.rows.append(row)

        # BE SURE TO NEST VALUES, IF NEEDED
        if value_type == "nested":
            row[c.es_column] = "."
            deeper_nested_path = [cname] + nested_path
            insertion = doc_collection.get(cname, None)
            if not insertion:
                insertion = doc_collection[cname] = Data(
                    active_columns=set(),
                    rows=[]
                )
            for i, r in enumerate(v):
                child_uid = self.next_uid()
                _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
        elif value_type == "object":
            row[c.es_column] = "."
            _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
        elif c.type:
            row[c.es_column] = v

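# Sketch of the intended row layout (column names are illustrative; real names
# go through typed_column()).  Flattening
#
#   {"a": 1, "b": [{"c": 2}, {"c": 3}]}
#
# is expected to produce one row in the fact table and one row per element of
# the nested array, linked by PARENT -> UID and sequenced by ORDER:
#
#   doc_collection["."].rows  -> [{UID: 1, ORDER: 0, "a": 1}]
#   doc_collection["b"].rows  -> [{UID: 2, PARENT: 1, ORDER: 0, "b.c": 2},
#                                 {UID: 3, PARENT: 1, ORDER: 1, "b.c": 3}]
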
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO: FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = []  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = wrap(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]

            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}
        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result
        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result
        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING IT THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return

        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, wrap(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}", type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return wrap([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)

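# Hypothetical usage sketch: given nested records such as
#
#   data = [{"a": 1, "b": [{"x": 1}, {"x": 2}]}]
#
# a filter that reaches into the nested path, e.g. {"term": {"b.x": 2}}, is
# partially evaluated per record, rewritten relative to the nested branch, and
# the result enumerates one row per surviving (record, nested child) pair
# rather than whole records.
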
def extractor(
    guid,
    num_partitions,
    esq,
    query,
    selects,
    query_path,
    schema,
    chunk_size,
    cardinality,
    abs_limit,
    formatter,
    please_stop,
):
    total = 0
    # WE MESS WITH THE QUERY LIMITS FOR CHUNKING
    query.limit = first(query.groupby).domain.limit = chunk_size * 2
    start_time = Date.now()

    try:
        write_status(
            guid,
            {
                "status": "starting",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "timestamp": Date.now(),
            },
        )

        with TempFile() as temp_file:
            with open(temp_file.abspath, "wb") as output:
                for i in range(0, num_partitions):
                    if please_stop:
                        Log.error("request to shutdown!")
                    is_last = i == num_partitions - 1
                    first(query.groupby).allowNulls = is_last
                    acc, decoders, es_query = aggop_to_es_queries(
                        selects, query_path, schema, query
                    )
                    # REACH INTO THE QUERY TO SET THE partitions
                    terms = es_query.aggs._filter.aggs._match.terms
                    terms.include.partition = i
                    terms.include.num_partitions = num_partitions

                    result = esq.es.search(deepcopy(es_query), query.limit)
                    aggs = unwrap(result.aggregations)

                    formatter.add(aggs, acc, query, decoders, selects)
                    for b in formatter.bytes():
                        if b is DONE:
                            break
                        output.write(b)
                    else:
                        write_status(
                            guid,
                            {
                                "status": "working",
                                "chunk": i,
                                "chunks": num_partitions,
                                "row": total,
                                "rows": min(abs_limit, cardinality),
                                "start_time": start_time,
                                "timestamp": Date.now(),
                            },
                        )
                        continue
                    break

                for b in formatter.footer():
                    output.write(b)

            upload(guid + ".json", temp_file)

        write_status(
            guid,
            {
                "ok": True,
                "status": "done",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
    except Exception as e:
        e = Except.wrap(e)
        write_status(
            guid,
            {
                "ok": False,
                "status": "error",
                "error": e,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
        Log.warning("Could not extract", cause=e)

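# The chunking above leans on Elasticsearch terms-aggregation partitioning: the
# same aggregation is issued num_partitions times with
# include={"partition": i, "num_partitions": n}, so each pass streams a
# disjoint slice of the groupby keys.  A rough sketch of the aggregation body
# (the field name is illustrative):
#
#   {"terms": {
#       "field": "user_id",
#       "include": {"partition": 0, "num_partitions": 10}
#   }}
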
def add(self, val):
    key = value2key(self._keys, val)
    e = self._data.get(key, [])
    self._data[key] = e
    e.append(unwrap(val))
    self.count += 1

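# Illustrative behaviour (assuming an index-like container built over
# self._keys): records sharing the same key values accumulate in one bucket.
#
#   index.add({"a": 1, "b": "x"})
#   index.add({"a": 1, "b": "y"})
#   # the bucket keyed by a=1 now holds both records; index.count == 2
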