def _query(self, query):
    result = Data()
    curr = self.db.execute(query)
    result.meta.format = "table"
    result.header = [d[0] for d in curr.description] if curr.description else None
    result.data = curr.fetchall()
    return result
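# A minimal usage sketch for _query() above, assuming `self.db` is a DB-API style
# connection whose execute() returns a cursor (the table name is hypothetical):
#
#     result = container._query("SELECT id, name FROM example_table")
#     result.header        # e.g. ["id", "name"]
#     result.data          # list of row tuples from fetchall()
#     result.meta.format   # "table"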
def _send_email(self):
    try:
        if not self.accumulation:
            return
        with Closer(connect_to_region(
            self.settings.region,
            aws_access_key_id=unwrap(self.settings.aws_access_key_id),
            aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
        )) as conn:
            # WHO ARE WE SENDING TO
            emails = Data()
            for template, params in self.accumulation:
                content = expand_template(template, params)
                emails[literal_field(self.settings.to_address)] += [content]
                for c in self.cc:
                    if any(d in params.params.error for d in c.contains):
                        emails[literal_field(c.to_address)] += [content]

            # SEND TO EACH
            for to_address, content in emails.items():
                conn.send_email(
                    source=self.settings.from_address,
                    to_addresses=listwrap(to_address),
                    subject=self.settings.subject,
                    body="\n\n".join(content),
                    format="text"
                )

        self.next_send = Date.now() + self.settings.max_interval
        self.accumulation = []
    except Exception as e:
        self.next_send = Date.now() + self.settings.max_interval
        Log.warning("Could not send", e)
def _send_email(self):
    try:
        if not self.accumulation:
            return
        with Emailer(self.settings) as emailer:
            # WHO ARE WE SENDING TO
            emails = Data()
            for template, params in self.accumulation:
                content = expand_template(template, params)
                emails[literal_field(self.settings.to_address)] += [content]
                for c in self.cc:
                    if any(d in params.params.error for d in c.contains):
                        emails[literal_field(c.to_address)] += [content]

            # SEND TO EACH
            for to_address, content in emails.items():
                emailer.send_email(
                    from_address=self.settings.from_address,
                    to_address=listwrap(to_address),
                    subject=self.settings.subject,
                    text_data="\n\n".join(content)
                )

        self.accumulation = []
    except Exception as e:
        Log.warning("Could not send", e)
    finally:
        self.next_send = Date.now() + self.settings.average_interval * (2 * Random.float())
def list(self, prefix=None, marker=None, delimiter=None):
    # https://s3.amazonaws.com/net-mozaws-stage-fx-test-activedata?marker=jenkins-go-bouncer.prod-3019/py27.log
    # <ListBucketResult>
    #     <Name>net-mozaws-stage-fx-test-activedata</Name>
    #     <Prefix/>
    #     <Marker>jenkins-go-bouncer.prod-3019/py27.log</Marker>
    #     <MaxKeys>1000</MaxKeys>
    #     <IsTruncated>true</IsTruncated>
    #     <Contents>
    #         <Key>jenkins-go-bouncer.prod-3020/py27.log</Key>
    #         <LastModified>2017-03-05T07:02:20.000Z</LastModified>
    #         <ETag>"69dcb19e91eb3eec51e1b659801523d6"</ETag>
    #         <Size>10037</Size>
    #         <StorageClass>STANDARD</StorageClass>
    state = Data()
    state.prefix = prefix
    state.delimiter = delimiter
    state.marker = marker
    state.get_more = True

    def more():
        xml = http.get(self.url + "?" + value2url_param(state)).content
        data = BeautifulSoup(xml, 'xml')

        state.get_more = data.find("istruncated").contents[0] == "true"
        contents = data.findAll("contents")
        state.marker = contents[-1].find("key").contents[0]
        return [
            {k: t(d.find(k).contents[0]) for k, t in content_keys.items()}
            for d in contents
        ]

    while state.get_more:
        content = more()
        for c in content:
            yield wrap(c)
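# A minimal usage sketch for list() above, assuming `bucket` is an instance of the
# S3 wrapper defining it, and that `content_keys` (not shown here) maps tag names
# such as "key" and "size" to converter functions:
#
#     for item in bucket.list(prefix="jenkins-go-bouncer.prod-3019"):
#         print(item.key, item.size)   # available fields depend on content_keys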
def __init__(self, **kwargs):
    Data.__init__(self)
    self.count = 0
    self.mean = None
    self.variance = None
    self.skew = None
    self.kurtosis = None

    if "samples" in kwargs:
        s = ZeroMoment2Stats(ZeroMoment.new_instance(kwargs["samples"]))
        self.count = s.count
        self.mean = s.mean
        self.variance = s.variance
        self.skew = s.skew
        self.kurtosis = s.kurtosis
        return

    if "count" not in kwargs:
        self.count = 0
        self.mean = None
        self.variance = None
        self.skew = None
        self.kurtosis = None
    elif "mean" not in kwargs:
        self.count = kwargs["count"]
        self.mean = None
        self.variance = None
        self.skew = None
        self.kurtosis = None
    elif "variance" not in kwargs and "std" not in kwargs:
        self.count = kwargs["count"]
        self.mean = kwargs["mean"]
        self.variance = 0
        self.skew = None
        self.kurtosis = None
    elif "skew" not in kwargs:
        self.count = kwargs["count"]
        self.mean = kwargs["mean"]
        self.variance = kwargs["variance"] if "variance" in kwargs else kwargs["std"] ** 2
        self.skew = None
        self.kurtosis = None
    elif "kurtosis" not in kwargs:
        self.count = kwargs["count"]
        self.mean = kwargs["mean"]
        self.variance = kwargs["variance"] if "variance" in kwargs else kwargs["std"] ** 2
        self.skew = kwargs["skew"]
        self.kurtosis = None
    else:
        self.count = kwargs["count"]
        self.mean = kwargs["mean"]
        self.variance = kwargs["variance"] if "variance" in kwargs else kwargs["std"] ** 2
        self.skew = kwargs["skew"]
        self.kurtosis = kwargs["kurtosis"]
class StructuredLogger_usingThreadedStream(StructuredLogger):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        if isinstance(stream, text_type):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if isinstance(value, text_type):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e

        try:
            self.queue.close()
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
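# A minimal usage sketch for the class above.  The stream string is eval()'d, so
# "sys.stdout" is a reasonable choice; how this logger is normally wired into Log
# is assumed, not shown here:
#
#     logger = StructuredLogger_usingThreadedStream("sys.stdout")
#     logger.write("hello {{name}}", {"name": "world"})   # queued, written by the worker thread
#     logger.stop()                                       # flush remaining messages, stop the thread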
class FakeES():
    @override
    def __init__(self, filename, host="fake", index="fake", kwargs=None):
        self.settings = kwargs
        self.file = File(filename)
        self.cluster = Null
        try:
            self.data = mo_json.json2value(self.file.read())
        except Exception as e:
            self.data = Data()

    def search(self, query):
        query = wrap(query)
        f = jx.get(query.query.filtered.filter)
        filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])

        if query.fields:
            return wrap({"hits": {"total": len(filtered), "hits": [
                {"_id": d._id, "fields": unwrap(jx.select([unwrap(d._source)], query.fields)[0])}
                for d in filtered
            ]}})
        else:
            return wrap({"hits": {"total": len(filtered), "hits": filtered}})

    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {
            v["id"]: v["value"] if "value" in v else mo_json.json2value(v['json'])
            for v in records
        }

        unwrap(self.data).update(records)
        self.refresh()
        Log.note("{{num}} documents added", num=len(records))

    def add(self, record):
        if isinstance(record, list):
            Log.error("no longer accepting lists, use extend()")
        return self.extend([record])

    def delete_record(self, filter):
        f = esfilter2where(filter)
        self.data = wrap({k: v for k, v in self.data.items() if not f(v)})

    def refresh(self, *args, **kwargs):
        data_as_json = mo_json.value2json(self.data, pretty=True)
        self.file.write(data_as_json)

    def set_refresh_interval(self, seconds):
        pass
def url_param2value(param):
    """
    CONVERT URL QUERY PARAMETERS INTO DICT
    """
    if isinstance(param, unicode):
        param = param.encode("ascii")

    def _decode(v):
        output = []
        i = 0
        while i < len(v):
            c = v[i]
            if c == "%":
                d = (v[i + 1:i + 3]).decode("hex")
                output.append(d)
                i += 3
            else:
                output.append(c)
                i += 1

        output = (b"".join(output)).decode("latin1")
        try:
            if not _Log:
                _late_import()
            return _json2value(output)
        except Exception:
            pass
        return output

    query = Data()
    for p in param.split(b'&'):
        if not p:
            continue
        if p.find(b"=") == -1:
            k = p
            v = True
        else:
            k, v = p.split(b"=")
            v = _decode(v)

        u = query.get(k)
        if u is None:
            query[k] = v
        elif isinstance(u, list):
            u += [v]
        else:
            query[k] = [u, v]

    return query
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, text_type):
        select = Data(value=select)
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".", schema=schema)
        else:
            return Null
    elif isinstance(select.value, text_type):
        if select.value.endswith(".*"):
            name = select.value[:-2]
            output.name = coalesce(select.name, name)
            output.value = LeavesOp("leaves", Variable(name), prefix=coalesce(select.prefix, name))
        else:
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = jx_expression(select.value, schema=schema)
            elif select.value == "*":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = LeavesOp("leaves", Variable("."))
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
                output.value = jx_expression(select.value, schema=schema)
    elif isinstance(select.value, (int, float)):
        if not output.name:
            output.name = text_type(select.value)
        output.value = jx_expression(select.value, schema=schema)
    else:
        output.value = jx_expression(select.value, schema=schema)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def url_param2value(param):
    """
    CONVERT URL QUERY PARAMETERS INTO DICT
    """
    if param == None:
        return Null

    def _decode(v):
        output = []
        i = 0
        while i < len(v):
            c = v[i]
            if c == "%":
                d = hex2chr(v[i + 1:i + 3])
                output.append(d)
                i += 3
            else:
                output.append(c)
                i += 1

        output = text_type("".join(output))
        try:
            return json2value(output)
        except Exception:
            pass
        return output

    query = Data()
    for p in param.split('&'):
        if not p:
            continue
        if p.find("=") == -1:
            k = p
            v = True
        else:
            k, v = p.split("=")
            v = _decode(v)

        u = query.get(k)
        if u is None:
            query[k] = v
        elif is_list(u):
            u += [v]
        else:
            query[k] = [u, v]

    return query
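# A minimal usage sketch for url_param2value() above; behaviour inferred from the
# code: values are percent-decoded, then parsed as JSON when possible (so numerals
# come back as numbers), and repeated keys collect into a list.
#
#     params = url_param2value("limit=10&name=test&tag=a&tag=b")
#     # roughly: {"limit": 10, "name": "test", "tag": ["a", "b"]}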
def __init__(self, filename, host="fake", index="fake", kwargs=None):
    self.settings = kwargs
    self.filename = kwargs.filename
    try:
        self.data = mo_json.json2value(File(self.filename).read())
    except Exception:
        self.data = Data()
def __init__(self, stream):
    assert stream

    if isinstance(stream, text_type):
        name = stream
        stream = self.stream = eval(stream)
        if name.startswith("sys.") and PY3:
            self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
    else:
        name = "stream"
        self.stream = stream

    # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
    from mo_threads import Queue

    def utf8_appender(value):
        if isinstance(value, text_type):
            value = value.encode('utf8')
        self.stream.write(value)

    appender = utf8_appender

    self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
    self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
    self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.start()
def __init__(self, filename, host="fake", index="fake", kwargs=None):
    self.settings = kwargs
    self.file = File(filename)
    self.cluster = Null
    try:
        self.data = mo_json.json2value(self.file.read())
    except Exception as e:
        self.data = Data()
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def encrypt(text, _key, salt=None):
    """
    RETURN {"salt":s, "length":l, "data":d} -> JSON -> UTF8
    """
    if is_text(text):
        encoding = 'utf8'
        data = bytearray(text.encode("utf8"))
    elif is_binary(text):
        encoding = None
        if PY2:
            data = bytearray(text)
        else:
            data = text

    if _key is None:
        Log.error("Expecting a key")
    if is_binary(_key):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)
    output.encoding = encoding

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output, pretty=True).encode('utf8')

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
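# A minimal round-trip sketch for encrypt() above.  AES-256 implies a 32-byte key;
# decrypt() is the counterpart referenced in the DEBUG check, and its signature is
# assumed to be decrypt(json, key):
#
#     key = Random.bytes(32)
#     blob = encrypt("hello world", key)      # JSON blob with salt, length, encoding, data
#     assert decrypt(blob, key) == "hello world"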
def __init__(self, _file):
    """
    file - USES FILE FOR PERSISTENCE
    """
    self.file = File.new_instance(_file)
    self.lock = Lock("lock for persistent queue using file " + self.file.name)
    self.please_stop = Signal()
    self.db = Data()
    self.pending = []

    if self.file.exists:
        for line in self.file:
            with suppress_exception:
                delta = mo_json.json2value(line)
                apply_delta(self.db, delta)
        if self.db.status.start == None:  # HAPPENS WHEN ONLY ADDED TO QUEUE, THEN CRASH
            self.db.status.start = 0
        self.start = self.db.status.start

        # SCRUB LOST VALUES
        lost = 0
        for k in self.db.keys():
            with suppress_exception:
                if k != "status" and int(k) < self.start:
                    self.db[k] = None
                    lost += 1
                # HAPPENS FOR self.db.status, BUT MAYBE OTHER PROPERTIES TOO
        if lost:
            Log.warning("queue file had {{num}} items lost", num=lost)

        DEBUG and Log.note("Persistent queue {{name}} found with {{num}} items", name=self.file.abspath, num=len(self))
    else:
        self.db.status = Data(
            start=0,
            end=0
        )
        self.start = self.db.status.start
        DEBUG and Log.note("New persistent queue {{name}}", name=self.file.abspath)
def get_more(please_stop):
    more.append(es09.util.post(es, Data(filter=more_filter, fields=es_query.fields), query.limit))
class FakeES():
    @override
    def __init__(self, filename, host="fake", index="fake", kwargs=None):
        self.settings = kwargs
        self.filename = filename
        try:
            self.data = mo_json.json2value(File(self.filename).read())
        except Exception as e:
            self.data = Data()

    def search(self, query):
        query = wrap(query)
        f = jx.get(query.query.filtered.filter)
        filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])

        if query.fields:
            return wrap({"hits": {"total": len(filtered), "hits": [
                {"_id": d._id, "fields": unwrap(jx.select([unwrap(d._source)], query.fields)[0])}
                for d in filtered
            ]}})
        else:
            return wrap({"hits": {"total": len(filtered), "hits": filtered}})

    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {
            v["id"]: v["value"] if "value" in v else mo_json.json2value(v['json'])
            for v in records
        }

        unwrap(self.data).update(records)

        data_as_json = mo_json.value2json(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} documents added", num=len(records))

    def add(self, record):
        if isinstance(record, list):
            Log.error("no longer accepting lists, use extend()")
        return self.extend([record])

    def delete_record(self, filter):
        f = convert.esfilter2where(filter)
        self.data = wrap({k: v for k, v in self.data.items() if not f(v)})

    def set_refresh_interval(self, seconds):
        pass
def _convert_range(self, range):
    if range == None:
        return None
    return Data(min=range.min, max=range.max)
def __init__(
    self,
    host,
    index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
    alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
    type=None,
    name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
    self.worker = None
    self.settings = kwargs
    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    self.name = name = self._namespace._find_alias(coalesce(alias, index, name))
    if read_only:
        self.es = elasticsearch.Alias(alias=name, index=None, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

    self._ensure_max_result_window_set(name)
    self.settings.type = self.es.settings.type
    self.stats = QueryStats(self.es.cluster)

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
        self.typed = typed

    if not typed:
        # ADD EXISTENCE COLUMNS
        all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

        def nested_path_of(v):
            if v == '.':
                return ('.',)
            return (v,) + nested_path_of(all_paths[v])

        query_paths = sort_using_key(
            set(step for path in self.snowflake.query_paths for step in path),
            key=lambda p: len(split_field(p))
        )
        for step in query_paths:
            if step in all_paths:
                continue
            else:
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate):
                        if startswith_field(candidate, best):
                            best = candidate
                all_paths[step] = best
        for p in all_paths.keys():
            nested_path = nested_path_of(p)
            try:
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=OBJECT,
                    nested_path=nested_path,
                    multi=1001 if last(split_field(p)) == NESTED_TYPE else None,
                    last_updated=Date.now()
                ))
            except Exception as e:
                raise e
from mo_json import json2value, value2json
from mo_kwargs import override
from mo_logs import Log
from mo_logs.exceptions import Except
from mo_threads import Lock, Till
from mo_times import Timer, Duration
from requests import Response, sessions

from mo_http.big_data import ibytes2ilines, icompressed2ibytes, safe_size, ibytes2icompressed, bytes2zip, zip2bytes

DEBUG = False
FILE_SIZE_LIMIT = 100 * 1024 * 1024
MIN_READ_SIZE = 8 * 1024
ZIP_REQUEST = False

default_headers = Data()  # TODO: MAKE THIS VARIABLE A SPECIAL TYPE OF EXPECTED MODULE PARAMETER SO IT COMPLAINS IF NOT SET
default_timeout = 600
DEFAULTS = {
    "allow_redirects": True,
    "stream": True,
    "verify": True,
    "timeout": 600,
    "zip": False,
    "retry": {"times": 1, "sleep": 0, "http": False}
}
_warning_sent = False
request_count = 0
def select(self, fields):
    if is_data(fields):
        fields = fields.value

    if is_text(fields):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(
                MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]),
                len(self.path)
            )  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = FlatList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if is_list(fields):
        output = FlatList()

        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Data()
            for name, f in meta:
                agg[name] = f(row)

            output.append(agg)

        return output

        # meta = []
        # for f in fields:
        #     keys = split_field(f.value)
        #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
        #     short_key = join_field(keys[depth:])
        #
        #     meta.append((f.name, depth, short_key))
        #
        # for row in self._data:
        #     agg = Data()
        #     for name, depth, short_key in meta:
        #         if short_key:
        #             agg[name] = row[depth][short_key]
        #         else:
        #             agg[name] = row[depth]
        #     output.append(agg)
        # return output

    Log.error("multiselect over FlatList not supported")
def format_table(aggs, es_query, query, decoders, all_selects):
    new_edges = wrap(count_dim(aggs, es_query, decoders))
    dims = tuple(len(e.domain.partitions) + (0 if e.allowNulls is False else 1) for e in new_edges)
    rank = len(dims)
    header = tuple(new_edges.name + all_selects.name)
    name2index = {s.name: i + rank for i, s in enumerate(all_selects)}

    def data():
        is_sent = Matrix(dims=dims)
        give_me_zeros = query.sort and not query.groupby
        if give_me_zeros:
            # WE REQUIRE THE ZEROS FOR SORTING
            all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
            ordered_coord = all_coord.next()[::-1]
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != ordered_coord:
                    # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                    if output is not None:
                        for s in all_selects:
                            i = name2index[s.name]
                            if output[i] is None:
                                output[i] = s.default
                    # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                    ordered_coord = all_coord.next()[::-1]

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = all_coord.next()[::-1]
                # coord == missing_coord
                output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects]
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)
                yield output
        else:
            last_coord = None  # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != last_coord:
                    if output:
                        # SET DEFAULTS
                        for i, s in enumerate(all_selects):
                            v = output[rank + i]
                            if v == None:
                                output[rank + i] = s.default
                        yield output
                    output = is_sent[coord]
                    if output == None:
                        output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects]
                    last_coord = coord
                # THIS IS A TRICK! WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)

            if output:
                # SET DEFAULTS ON LAST ROW
                for i, s in enumerate(all_selects):
                    v = output[rank + i]
                    if v == None:
                        output[rank + i] = s.default
                yield output

            # EMIT THE MISSING CELLS IN THE CUBE
            if not query.groupby:
                for coord, output in is_sent:
                    if output == None:
                        record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects]
                        yield record

    return Data(
        meta={"format": "table"},
        header=header,
        data=list(data())
    )
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    now = Date.now()
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {"partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": now},
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {"partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": now},
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]
        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2

            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {"count": count, "cardinality": cardinality, "partitions": [False, True], "multi": 1, "last_updated": now},
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        else:
            es_query = {
                "aggs": {
                    "count": _counting_query(column),
                    "_filter": {
                        "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                        "filter": {"bool": {"should": [
                            {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                            {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                        ]}}
                    }
                },
                "size": 0
            }

            result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results._filter.multi.value, 1))
            if cardinality == None:
                Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {"count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": now},
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {"count": count, "cardinality": cardinality, "multi": multi, "last_updated": now},
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {"count": count, "cardinality": cardinality, "multi": multi, "last_updated": now},
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)

        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        DEBUG and Log.note(
            "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
            id=id(column), column=column, time=now
        )
        self.meta.columns.update({
            "set": {"count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": now},
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        elif "No field found for" in e:
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
        else:
            self.meta.columns.update({
                "set": {"last_updated": now},
                "clear": ["count", "cardinality", "multi", "partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    if position.name in self.settings.exclude:
        return
    if self.path_not_allowed(path):
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)
    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
        self.db.query(ConcatSQL(
            SQL_SELECT,
            SQL_STAR,
            SQL_FROM,
            quote_column(position.schema, position.name),
            SQL_LIMIT,
            SQL_ONE,
        ))

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    ###############################################################################
    # INNER OBJECTS
    ###############################################################################
    referenced_tables = list(sort_using_key(
        jx.groupby(
            jx.filter(relations, {"eq": {"table.name": position.name, "table.schema": position.schema}}),
            "constraint.name",
        ),
        key=lambda p: first(p[1]).column.name,
    ))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
            continue

        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path,
        })

        # HANDLE THE COMMON *id SUFFIX
        name = []
        for cname, tname in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
            if cname.startswith(tname):
                name.append(tname)
            elif cname.endswith("_id"):
                name.append(cname[:-3])
            else:
                name.append(cname)

        relation_string = many_to_one_string(constraint_columns[0])
        step = "/".join(name)
        if len(constraint_columns) == 1:
            step = self.name_relations.get(relation_string, step)

        referenced_column_path = concat_field(path, step)
        if self.path_not_allowed(referenced_column_path):
            continue

        if referenced_column_path in reference_only_tables:
            continue

        col_pointer_name = relative_field(referenced_column_path, nested_path[0])
        for col in columns:
            if (
                col.table.name == constraint_columns[0].referenced.table.name
                and col.table.schema == constraint_columns[0].referenced.table.schema
            ):
                col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                        "sort": True, "path": referenced_column_path, "nested_path": nested_path,
                        "put": col_full_name,
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                        "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None,
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                        "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None,
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                        "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                        "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                        "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                        "put": col_full_name,
                    })

        if position.name in reference_only_tables:
            continue

        todo.append(Data(
            position=copy(constraint_columns[0].referenced.table),
            path=referenced_column_path,
            nested_path=nested_path,
            done_relations=copy(done_relations),
            no_nested_docs=no_nested_docs,
        ))

    ###############################################################################
    # NESTED OBJECTS
    ###############################################################################
    if not no_nested_docs:
        nesting_tables = list(sort_using_key(
            jx.groupby(
                jx.filter(relations, {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}),
                "constraint.name",
            ),
            key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0],
        ))

        for g, constraint_columns in nesting_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            relation_string = one_to_many_string(constraint_columns[0])
            step = "/".join(many_table)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            if referenced_column_path in nested_path_to_join:
                Log.error("{{path}} already exists, try adding entry to name_relations", path=referenced_column_path)
            one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(set_default(
                {},
                g,
                {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path,
                },
            ))

            for col in columns:
                if (
                    col.table.name == constraint_columns[0].table.name
                    and col.table.schema == constraint_columns[0].table.schema
                ):
                    col_full_name = join_field(
                        split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                        + [literal_field(col.column.name)]
                    )

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None,
                        })

            todo.append(Data(
                position=constraint_columns[0].table,
                path=referenced_column_path,
                nested_path=new_nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs,
            ))
def construct_docs(self, cursor, append, please_stop):
    """
    :param cursor: ITERATOR OF RECORD TUPLES
    :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
    :return: (count, first, next, next_key)
        number of documents added
        the first document in the batch
        the first document of the next batch
    """
    null_values = set(self.settings.null_values) | {None}

    doc_count = 0

    columns = tuple(wrap(c) for c in self.columns)
    with Timer("Downloading from MySQL"):
        curr_doc = Null
        row_count = 0
        for row in cursor:
            row_count += 1
            if please_stop:
                Log.error("Got `please_stop` signal")

            nested_path = []
            next_object = Data()

            for c, value in zip(columns, row):
                # columns ARE IN ORDER, FROM FACT ['.'] TO EVER-DEEPER-NESTED
                if value in null_values:
                    # EVERY COLUMN THAT'S NOT NEEDED IS None
                    continue
                if len(nested_path) < len(c.nested_path):
                    # EACH COLUMN IS DEEPER THAN THE NEXT
                    # THESE WILL BE THE id COLUMNS, WHICH ARE ALWAYS INCLUDED AND BEFORE ALL OTHER VALUES
                    nested_path = unwrap(c.nested_path)
                    next_object = Data()
                next_object[c.put] = value

            # OBJECT HAS BEEN CONSTRUCTED, LET'S PLACE IT WHERE IT BELONGS
            if len(nested_path) > 1:
                children = [curr_doc]
                steps = list(reversed(nested_path))
                parent_path = steps[0]
                for path in steps[1:]:
                    parent = children[-1]
                    relative_path = relative_field(path, parent_path)
                    children = unwrap(parent[relative_path])
                    if not children:
                        children = parent[relative_path] = []
                    parent_path = path

                children.append(next_object)
                continue

            # THE TOP-LEVEL next_object HAS BEEN ENCOUNTERED, EMIT THE PREVIOUS, AND COMPLETED curr_doc
            if curr_doc == next_object:
                Log.error("Expecting records. Did you select the wrong schema, or select records that do not exist?")
            if curr_doc:
                append(curr_doc["id"])
                doc_count += 1
            curr_doc = next_object

        # DEAL WITH LAST RECORD
        if curr_doc:
            append(curr_doc["id"])
            doc_count += 1

    Log.note(
        "{{doc_count}} documents ({{row_count}} db records)",
        doc_count=doc_count,
        row_count=row_count,
    )
def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema, table_name,
            referenced_table_schema, referenced_table_name, referenced_column_name,
            constraint_name, column_name, ordinal_position
        FROM information_schema.key_column_usage
        WHERE referenced_column_name IS NOT NULL
        """,
        param=self.settings.database,
    )

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            lhs, rhs = map(strings.trim, r.split("->"))
            lhs = lhs.split(".")
            if len(lhs) == 2:
                lhs = [self.settings.database.schema] + lhs
            rhs = rhs.split(".")
            if len(rhs) == 2:
                rhs = [self.settings.database.schema] + rhs
            to_add = Data(
                ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                table_schema=lhs[0],
                table_name=lhs[1],
                column_name=lhs[2],
                referenced_table_schema=rhs[0],
                referenced_table_name=rhs[1],
                referenced_column_name=rhs[2],
            )

            # CHECK IF EXISTING
            if jx.filter(raw_relations, {"eq": to_add}):
                Log.note("Relation {{relation}} already exists", relation=r)
                continue

            to_add.constraint_name = Random.hex(20)
            raw_relations.append(to_add)
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(
        raw_relations,
        [
            {"name": "constraint.name", "value": "constraint_name"},
            {"name": "table.schema", "value": "table_schema"},
            {"name": "table.name", "value": "table_name"},
            {"name": "column.name", "value": "column_name"},
            {"name": "referenced.table.schema", "value": "referenced_table_schema"},
            {"name": "referenced.table.name", "value": "referenced_table_name"},
            {"name": "referenced.column.name", "value": "referenced_column_name"},
            {"name": "ordinal_position", "value": "ordinal_position"},
        ],
    )

    # GET ALL TABLES
    raw_tables = self.db.query("""
        SELECT
            t.table_schema, t.table_name,
            c.constraint_name, c.constraint_type,
            k.column_name, k.ordinal_position
        FROM information_schema.tables t
        LEFT JOIN information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
        LEFT JOIN information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
        ORDER BY
            t.table_schema, t.table_name, c.constraint_name, k.ordinal_position, k.column_name
    """)

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if (
                    r.table.name == t.table_name
                    and r.table.schema == t.table_schema
                    and r.constraint.name == g.constraint_name
                ):
                    ref = True
            is_prime = w[0].constraint_type == "PRIMARY"

            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index],
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id,
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i,
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query("""
        SELECT
            column_name, table_schema, table_name, ordinal_position, data_type
        FROM information_schema.columns
    """)

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema)
        for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples | referenced_column_table_schema_triples
    )

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if self.path_not_allowed(path):
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
            self.db.query(ConcatSQL(
                SQL_SELECT,
                SQL_STAR,
                SQL_FROM,
                quote_column(position.schema, position.name),
                SQL_LIMIT,
                SQL_ONE,
            ))

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        ###############################################################################
        # INNER OBJECTS
        ###############################################################################
        referenced_tables = list(sort_using_key(
            jx.groupby(
                jx.filter(relations, {"eq": {"table.name": position.name, "table.schema": position.schema}}),
                "constraint.name",
            ),
            key=lambda p: first(p[1]).column.name,
        ))
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path,
            })

            # HANDLE THE COMMON *id SUFFIX
            name = []
            for cname, tname in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
                if cname.startswith(tname):
                    name.append(tname)
                elif cname.endswith("_id"):
                    name.append(cname[:-3])
                else:
                    name.append(cname)

            relation_string = many_to_one_string(constraint_columns[0])
            step = "/".join(name)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            if referenced_column_path in reference_only_tables:
                continue

            col_pointer_name = relative_field(referenced_column_path, nested_path[0])
            for col in columns:
                if (
                    col.table.name == constraint_columns[0].referenced.table.name
                    and col.table.schema == constraint_columns[0].referenced.table.schema
                ):
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": True, "path": referenced_column_path, "nested_path": nested_path,
                            "put": col_full_name,
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                            "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                            "sort": False, "path": referenced_column_path, "nested_path": nested_path,
                            "put": col_full_name,
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(Data(
                position=copy(constraint_columns[0].referenced.table),
                path=referenced_column_path,
                nested_path=nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs,
            ))

        ###############################################################################
        # NESTED OBJECTS
        ###############################################################################
        if not no_nested_docs:
            nesting_tables = list(sort_using_key(
                jx.groupby(
                    jx.filter(relations, {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}),
                    "constraint.name",
                ),
                key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0],
            ))

            for g, constraint_columns in nesting_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                relation_string = one_to_many_string(constraint_columns[0])
                step = "/".join(many_table)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                if referenced_column_path in nested_path_to_join:
                    Log.error("{{path}} already exists, try adding entry to name_relations", path=referenced_column_path)
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(set_default(
                    {},
                    g,
                    {
                        "children": True,
                        "join_columns": constraint_columns,
                        "path": path,
                        "nested_path": nested_path,
                    },
                ))

                for col in columns:
                    if (
                        col.table.name == constraint_columns[0].table.name
                        and col.table.schema == constraint_columns[0].table.schema
                    ):
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                            + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                                "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                                "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias, "column_alias": "c" + text(c_index), "column": col,
                                "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None,
                            })

                todo.append(Data(
                    position=constraint_columns[0].table,
                    path=referenced_column_path,
                    nested_path=new_nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs,
                ))

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path,
    }]

    todo.append(Data(
        position=ids_table,
        path=path,
        nested_path=nested_path,
        done_relations=set(),
        no_nested_docs=False,
    ))

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns
def query(self, query): """ :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT :return: """ if not startswith_field(query['from'], self.sf.fact_name): Log.error("Expecting table, or some nested table") query = QueryOp.wrap(query, self, self.namespace) new_table = "temp_" + unique_name() if query.format == "container": create_table = "CREATE TABLE " + quote_column(new_table) + " AS " else: create_table = "" if query.groupby and query.format != "cube": op, index_to_columns = self._groupby_op(query, self.schema) command = create_table + op elif query.groupby: query.edges, query.groupby = query.groupby, query.edges op, index_to_columns = self._edges_op(query, self.schema) command = create_table + op query.edges, query.groupby = query.groupby, query.edges elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate): op, index_to_columns = self._edges_op(query, self.schema) command = create_table + op else: op = self._set_op(query) return op result = self.db.query(command) if query.format == "container": output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True) elif query.format == "cube" or (not query.format and query.edges): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name if len(query.edges) == 0 and len(query.groupby) == 0: data = {n: Data() for n in column_names} for s in index_to_columns.values(): data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0])) if is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( data=unwrap(data), select=select, meta={"format": "cube"} ) if not result.data: edges = [] dims = [] for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif is_op(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: domain = SimpleSetDomain(partitions=[]) dims.append(1 if allowNulls else 0) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data[s.name] = Matrix(dims=dims, zeros=0) else: data[s.name] = Matrix(dims=dims) if is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()} ) columns = None edges = [] dims = [] for g in query.groupby: g.is_groupby = True for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif e.domain.type == "time": domain = wrap(mo_json.scrub(e.domain)) elif e.domain.type == "duration": domain = wrap(mo_json.scrub(e.domain)) elif is_op(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = 
SimpleSetDomain(partitions=jx.sort(set(parts))) else: if not columns: columns = zip(*result.data) parts = set(columns[i]) if e.is_groupby and None in parts: allowNulls = True parts -= {None} if query.sort[i].sort == -1: domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True))) else: domain = SimpleSetDomain(partitions=jx.sort(parts)) dims.append(len(domain.partitions) + (1 if allowNulls else 0)) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data_cubes = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data_cubes[s.name] = Matrix(dims=dims, zeros=0) else: data_cubes[s.name] = Matrix(dims=dims) r2c = index_to_coordinate(dims) # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM for rownum, row in enumerate(result.data): coord = r2c(rownum) for i, s in enumerate(index_to_columns.values()): if s.is_edge: continue if s.push_child == ".": data_cubes[s.push_name][coord] = s.pull(row) else: data_cubes[s.push_name][coord][s.push_child] = s.pull(row) if query.select == None: select = Null elif is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()} ) elif query.format == "table" or (not query.format and query.groupby): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name data = [] for d in result.data: row = [None for _ in column_names] for s in index_to_columns.values(): if s.push_child == ".": row[s.push_column] = s.pull(d) elif s.num_push_columns: tuple_value = row[s.push_column] if tuple_value == None: tuple_value = row[s.push_column] = [None] * s.num_push_columns tuple_value[s.push_child] = s.pull(d) elif row[s.push_column] == None: row[s.push_column] = Data() row[s.push_column][s.push_child] = s.pull(d) else: row[s.push_column][s.push_child] = s.pull(d) data.append(tuple(unwrap(r) for r in row)) output = Data( meta={"format": "table"}, header=column_names, data=data ) elif query.format == "list" or (not query.edges and not query.groupby): if not query.edges and not query.groupby and any(listwrap(query.select).aggregate): if is_list(query.select): data = Data() for c in index_to_columns.values(): if c.push_child == ".": if data[c.push_name] == None: data[c.push_name] = c.pull(result.data[0]) elif is_list(data[c.push_name]): data[c.push_name].append(c.pull(result.data[0])) else: data[c.push_name] = [data[c.push_name], c.pull(result.data[0])] else: data[c.push_name][c.push_child] = c.pull(result.data[0]) output = Data( meta={"format": "value"}, data=data ) else: data = Data() for s in index_to_columns.values(): if not data[s.push_child]: data[s.push_child] = s.pull(result.data[0]) else: data[s.push_child] += [s.pull(result.data[0])] output = Data( meta={"format": "value"}, data=unwrap(data) ) else: data = [] for rownum in result.data: row = Data() for c in index_to_columns.values(): if c.push_child == ".": row[c.push_name] = c.pull(rownum) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(rownum) else: row[c.push_name][c.push_child] = c.pull(rownum) data.append(row) output = Data( meta={"format": "list"}, data=data ) else: Log.error("unknown format {{format}}", format=query.format) return output
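# Illustrative query shapes (assumptions, not checked against a real schema) showing which
# branch of query() above handles them; `table` stands for a QueryTable instance.
list_query = {            # plain select, no edges/groupby: handled by _set_op()
    "from": "fact",
    "select": ["a", "b"],
    "format": "list",
}
grouped_query = {         # groupby with a non-"cube" format: handled by _groupby_op()
    "from": "fact",
    "groupby": "a",
    "select": {"name": "count", "aggregate": "count"},
    "format": "table",
}
cube_query = {            # edges: handled by _edges_op() and formatted as a cube
    "from": "fact",
    "edges": ["a"],
    "select": {"value": "b", "aggregate": "max"},
    "format": "cube",
}
# result = table.query(list_query)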
def streamer(): for row in self.cursor: output = Data() for c, v in zip(columns, row): output[c] = v yield output
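# The same row-to-document streaming pattern, sketched with the standard library so it can
# run stand-alone (the original yields the project's Data objects instead of plain dicts).
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE t (a INTEGER, b TEXT)")
db.executemany("INSERT INTO t VALUES (?, ?)", [(1, "x"), (2, "y")])
cursor = db.execute("SELECT a, b FROM t")
columns = [d[0] for d in cursor.description]

def stream_rows():
    for row in cursor:
        yield dict(zip(columns, row))   # one dict per row, keyed by column name

print(list(stream_rows()))              # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]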
class PersistentQueue(object): """ THREAD-SAFE, PERSISTENT QUEUE CAN HANDLE MANY PRODUCERS, BUT THE pop(), commit() IDIOM CAN HANDLE ONLY ONE CONSUMER. IT IS IMPORTANT YOU commit() or close(), OTHERWISE NOTHING COMES OFF THE QUEUE """ def __init__(self, _file): """ file - USES FILE FOR PERSISTENCE """ self.file = File.new_instance(_file) self.lock = Lock("lock for persistent queue using file " + self.file.name) self.please_stop = Signal() self.db = Data() self.pending = [] if self.file.exists: for line in self.file: with suppress_exception: delta = mo_json.json2value(line) apply_delta(self.db, delta) if self.db.status.start == None: # HAPPENS WHEN ONLY ADDED TO QUEUE, THEN CRASH self.db.status.start = 0 self.start = self.db.status.start # SCRUB LOST VALUES lost = 0 for k in self.db.keys(): with suppress_exception: if k!="status" and int(k) < self.start: self.db[k] = None lost += 1 # HAPPENS FOR self.db.status, BUT MAYBE OTHER PROPERTIES TOO if lost: Log.warning("queue file had {{num}} items lost", num= lost) DEBUG and Log.note("Persistent queue {{name}} found with {{num}} items", name=self.file.abspath, num=len(self)) else: self.db.status = Data( start=0, end=0 ) self.start = self.db.status.start DEBUG and Log.note("New persistent queue {{name}}", name=self.file.abspath) def _add_pending(self, delta): delta = wrap(delta) self.pending.append(delta) def _apply_pending(self): for delta in self.pending: apply_delta(self.db, delta) self.pending = [] def __iter__(self): """ BLOCKING ITERATOR """ while not self.please_stop: try: value = self.pop() if value is not THREAD_STOP: yield value except Exception as e: Log.warning("Tell me about what happened here", cause=e) def add(self, value): with self.lock: if self.closed: Log.error("Queue is closed") if value is THREAD_STOP: DEBUG and Log.note("Stop is seen in persistent queue") self.please_stop.go() return self._add_pending({"add": {str(self.db.status.end): value}}) self.db.status.end += 1 self._add_pending({"add": {"status.end": self.db.status.end}}) self._commit() return self def __len__(self): with self.lock: return self.db.status.end - self.start def __getitem__(self, item): return self.db[str(item + self.start)] def pop(self, timeout=None): """ :param timeout: OPTIONAL DURATION :return: None, IF timeout PASSES """ with self.lock: while not self.please_stop: if self.db.status.end > self.start: value = self.db[str(self.start)] self.start += 1 return value if timeout is not None: with suppress_exception: self.lock.wait(timeout=timeout) if self.db.status.end <= self.start: return None else: self.lock.wait() DEBUG and Log.note("persistent queue already stopped") return THREAD_STOP def pop_all(self): """ NON-BLOCKING POP ALL IN QUEUE, IF ANY """ with self.lock: if self.please_stop: return [THREAD_STOP] if self.db.status.end == self.start: return [] output = [] for i in range(self.start, self.db.status.end): output.append(self.db[str(i)]) self.start = self.db.status.end return output def rollback(self): with self.lock: if self.closed: return self.start = self.db.status.start self.pending = [] def commit(self): with self.lock: if self.closed: Log.error("Queue is closed, commit not allowed") try: self._add_pending({"add": {"status.start": self.start}}) for i in range(self.db.status.start, self.start): self._add_pending({"remove": str(i)}) if self.db.status.end - self.start < 10 or Random.range(0, 1000) == 0: # FORCE RE-WRITE TO LIMIT FILE SIZE # SIMPLY RE-WRITE FILE if DEBUG: Log.note("Re-write {{num_keys}} keys to persistent queue", 
num_keys=self.db.status.end - self.start) for k in self.db.keys(): if k == "status" or int(k) >= self.db.status.start: continue Log.error("Not expecting {{key}}", key=k) self._commit() self.file.write(mo_json.value2json({"add": self.db}) + "\n") else: self._commit() except Exception as e: raise e def _commit(self): self.file.append("\n".join(mo_json.value2json(p) for p in self.pending)) self._apply_pending() def close(self): self.please_stop.go() with self.lock: if self.db is None: return self.add(THREAD_STOP) if self.db.status.end == self.start: DEBUG and Log.note("persistent queue clear and closed") self.file.delete() else: DEBUG and Log.note("persistent queue closed with {{num}} items left", num=len(self)) try: self._add_pending({"add": {"status.start": self.start}}) for i in range(self.db.status.start, self.start): self._add_pending({"remove": str(i)}) self.file.write(mo_json.value2json({"add": self.db}) + "\n" + ("\n".join(mo_json.value2json(p) for p in self.pending)) + "\n") self._apply_pending() except Exception as e: raise e self.db = None @property def closed(self): with self.lock: return self.db is None
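# Hedged usage sketch for the PersistentQueue above.  It assumes the surrounding project
# (File, Lock, Signal, mo_json, ...) is importable; the file name and payloads are made up.
queue = PersistentQueue("temp/my_queue.json")
queue.add({"task": 1})
queue.add({"task": 2})

value = queue.pop()        # read one item; it stays in the file until commit()
# ... do the real work with `value` here ...
queue.commit()             # only now is the popped item removed from persistent storage

# queue.rollback()         # would instead return un-committed pops to the queue
queue.close()              # flushes state; deletes the file once the queue is empty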
def construct_docs(self, cursor, append, please_stop): """ :param cursor: ITERATOR OF RECORDS :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT :return: (count, first, next, next_key) number of documents added the first document in the batch the first document of the next batch """ null_values = set(self.settings.snowflake.null_values) | {None} count = 0 rownum = 0 columns = tuple(wrap(c) for c in self.schema.columns) with Timer("Downloading from MySQL"): curr_record = Null for rownum, row in enumerate(cursor): if please_stop: Log.error("Got `please_stop` signal") nested_path = [] next_record = None for c, value in zip(columns, row): if value in null_values: continue if len(nested_path) < len(c.nested_path): nested_path = unwrap(c.nested_path) next_record = Data() next_record[c.put] = value if len(nested_path) > 1: path = nested_path[-2] children = curr_record[path] if children == None: children = curr_record[path] = wrap([]) if len(nested_path) > 2: parent_path = path for path in list(reversed(nested_path[0:-2:])): parent = children.last() relative_path = relative_field(path, parent_path) children = parent[relative_path] if children == None: children = parent[relative_path] = wrap([]) parent_path = path children.append(next_record) continue if curr_record == next_record: Log.error("not expected") if curr_record: append(curr_record["id"], count) count += 1 curr_record = next_record # DEAL WITH LAST RECORD if curr_record: append(curr_record["id"], count) count += 1 Log.note("{{num}} documents ({{rownum}} db records)", num=count, rownum=rownum)
def life_cycle_watcher(please_stop): bad_requests = Data() setup_threads = [] last_get = Date.now() setup_in_progress = set() while not please_stop: spot_requests = self._get_managed_spot_requests() instances = wrap({ i.id: i for r in self.ec2_conn.get_all_instances() for i in r.instances }) # INSTANCES THAT REQUIRE SETUP time_to_stop_trying = {} please_setup = [ (i, r) for i, r in [(instances[r.instance_id], r) for r in spot_requests] if i.id and (not i.tags.get("Name") or i.tags.get( "Name") == self.settings.ec2.instance.name + " (setup)") and i.id not in setup_in_progress and i._state.name == "running" and Date.now() > Date(i.launch_time) + DELAY_BEFORE_SETUP ] for i, r in please_setup: if not time_to_stop_trying.get(i.id): time_to_stop_trying[ i.id] = Date.now() + TIME_FROM_RUNNING_TO_LOGIN if Date.now() > time_to_stop_trying[i.id]: # FAIL TO SETUP AFTER x MINUTES, THEN TERMINATE INSTANCE self.ec2_conn.terminate_instances(instance_ids=[i.id]) with self.net_new_locker: self.net_new_spot_requests.remove(r.id) Log.warning( "Problem with setup of {{instance_id}}. Time is up. Instance TERMINATED!", instance_id=i.id) continue try: p = self.settings.utility[i.instance_type] if p == None: try: self.ec2_conn.terminate_instances( instance_ids=[i.id]) with self.net_new_locker: self.net_new_spot_requests.remove(r.id) finally: Log.error( "Can not setup unknown {{instance_id}} of type {{type}}", instance_id=i.id, type=i.instance_type) i.markup = p i.add_tag("Name", self.settings.ec2.instance.name + " (setup)") setup_in_progress.add(i.id) t = Thread.run("setup for " + text(i.id), track_setup, self.instance_manager.setup, r, i, p) if SINGLE_THREAD_SETUP: t.join() setup_threads.append(t) except Exception as e: i.add_tag("Name", "") Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e) if Date.now() - last_get > 5 * SECOND: # REFRESH STALE spot_requests = self._get_managed_spot_requests() last_get = Date.now() pending = wrap([ r for r in spot_requests if r.status.code in PENDING_STATUS_CODES ]) give_up = wrap([ r for r in spot_requests if (r.status.code in PROBABLY_NOT_FOR_A_WHILE | TERMINATED_STATUS_CODES) and r.id not in bad_requests ]) ignore = wrap([ r for r in spot_requests if r.status.code in MIGHT_HAPPEN ]) # MIGHT HAPPEN, BUT NO NEED TO WAIT FOR IT if self.done_making_new_spot_requests: with self.net_new_locker: expired = Date.now( ) - self.settings.run_interval + 2 * MINUTE for ii in list(self.net_new_spot_requests): if Date(ii.create_time) < expired: # SOMETIMES REQUESTS NEVER GET INTO THE MAIN LIST OF REQUESTS self.net_new_spot_requests.remove(ii) for g in ignore: self.net_new_spot_requests.remove(g.id) pending = UniqueIndex(("id", ), data=pending) pending = pending | self.net_new_spot_requests if give_up: self.ec2_conn.cancel_spot_instance_requests( request_ids=give_up.id) Log.note( "Cancelled spot requests {{spots}}, {{reasons}}", spots=give_up.id, reasons=give_up.status.code) for g in give_up: bad_requests[g.id] += 1 if g.id in self.net_new_spot_requests: self.net_new_spot_requests.remove(g.id) if g.status.code == "capacity-not-available": self.no_capacity[ g.launch_specification. instance_type] = Date.now() if g.status.code == "bad-parameters": self.no_capacity[ g.launch_specification. instance_type] = Date.now() Log.warning( "bad parameters while requesting type {{type}}", type=g.launch_specification. 
instance_type) if not pending and self.done_making_new_spot_requests: Log.note("No more pending spot requests") break elif pending: Log.note("waiting for spot requests: {{pending}}", pending=[p.id for p in pending]) (Till(seconds=10) | please_stop).wait() with Timer("Save no capacity to file"): table = [{ "instance_type": k, "last_failure": v } for k, v in self.no_capacity.items()] self.no_capacity_file.write(value2json(table, pretty=True)) # WAIT FOR SETUP TO COMPLETE for t in setup_threads: t.join() Log.note("life cycle watcher has stopped")
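# A stand-alone sketch (all names illustrative) of the "give setup a deadline" idea in
# life_cycle_watcher(): each instance gets a cutoff time when it is first seen, and if
# setup has not finished by that cutoff the instance is terminated rather than retried forever.
from datetime import datetime, timedelta

TIME_FROM_RUNNING_TO_LOGIN = timedelta(minutes=5)   # assumed value, for the sketch only

def watch(instance_ids, now, deadlines, terminate):
    for instance_id in instance_ids:
        deadlines.setdefault(instance_id, now + TIME_FROM_RUNNING_TO_LOGIN)
        if now > deadlines[instance_id]:
            terminate(instance_id)                  # give up on this instance

deadlines = {}
watch(["i-123"], datetime(2020, 1, 1, 0, 0), deadlines, print)    # within the deadline
watch(["i-123"], datetime(2020, 1, 1, 0, 10), deadlines, print)   # past it: prints i-123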
def query_metadata(self, query): frum, query['from'] = query['from'], self schema = self.sf.tables["."].schema query = QueryOp.wrap(query, schema) columns = self.sf.columns where = query.where table_name = None column_name = None if query.edges or query.groupby: Log.error("Aggregates(groupby or edge) are not supported") if where.op == "eq" and where.lhs.var == "table": table_name = mo_json.json2value(where.rhs.json) elif where.op == "eq" and where.lhs.var == "name": column_name = mo_json.json2value(where.rhs.json) else: Log.error("Only simple filters are expected like: \"eq\" on table and column name") tables = [concat_field(self.sf.fact_name, i) for i in self.tables.keys()] metadata = [] if columns[-1].es_column != GUID: columns.append(Column( name=GUID, jx_type=STRING, es_column=GUID, es_index=self.sf.fact_name, nested_path=["."] )) for tname, table in zip(self.tables.keys(), tables): if table_name != None and table_name != table: continue for col in columns: cname, ctype = untyped_column(col.es_column) if column_name != None and column_name != cname: continue metadata.append((table, relative_field(col.name, tname), col.type, unwraplist(col.nested_path))) if query.format == "cube": num_rows = len(metadata) header = ["table", "name", "type", "nested_path"] temp_data = dict(zip(header, zip(*metadata))) return Data( meta={"format": "cube"}, data=temp_data, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": header = ["table", "name", "type", "nested_path"] return Data( meta={"format": "table"}, header=header, data=metadata ) else: header = ["table", "name", "type", "nested_path"] return Data( meta={"format": "list"}, data=[dict(zip(header, r)) for r in metadata] )
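# Illustrative metadata queries (shapes only, not verified) accepted by query_metadata():
# a single "eq" filter on either "table" or "name", plus a choice of output format.
by_table = {
    "from": "meta.columns",          # assumed metadata table name
    "where": {"eq": {"table": "fact.nested"}},
    "format": "table",
}
by_column = {
    "from": "meta.columns",
    "where": {"eq": {"name": "build.revision"}},
    "format": "list",
}
# Queries with edges/groupby, or with a more complex filter, hit Log.error() above.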
def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return if column.es_index == "meta.tables": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return es_index = column.es_index.split(".")[0] is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": {"filter": {"match_all": {}}} }, "size": 0 }) count = result.hits.total cardinality = max(1001, count) multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": {"match_all": {}}, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 multi = 1 else: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column), "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}} }, "size": 0 }) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, 
"es_column": column.es_column}} }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": {"terms": {"field": column.es_column}}} } elif cardinality == 0: query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}} result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE e = Except.wrap(e) TEST_TABLE = "testdata" is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE)) if is_missing_index and is_test_table: # WE EXPECT TEST TABLES TO DISAPPEAR self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index}} }) self.index_does_not_exist.add(column.es_index) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
class PersistentQueue(object): """ THREAD-SAFE, PERSISTENT QUEUE CAN HANDLE MANY PRODUCERS, BUT THE pop(), commit() IDIOM CAN HANDLE ONLY ONE CONSUMER. IT IS IMPORTANT YOU commit() or close(), OTHERWISE NOTHING COMES OFF THE QUEUE """ def __init__(self, _file): """ file - USES FILE FOR PERSISTENCE """ self.file = File.new_instance(_file) self.lock = Lock("lock for persistent queue using file " + self.file.name) self.please_stop = Signal() self.db = Data() self.pending = [] if self.file.exists: for line in self.file: with suppress_exception: delta = mo_json.json2value(line) apply_delta(self.db, delta) if self.db.status.start == None: # HAPPENS WHEN ONLY ADDED TO QUEUE, THEN CRASH self.db.status.start = 0 self.start = self.db.status.start # SCRUB LOST VALUES lost = 0 for k in self.db.keys(): with suppress_exception: if k != "status" and int(k) < self.start: self.db[k] = None lost += 1 # HAPPENS FOR self.db.status, BUT MAYBE OTHER PROPERTIES TOO if lost: Log.warning("queue file had {{num}} items lost", num=lost) if DEBUG: Log.note("Persistent queue {{name}} found with {{num}} items", name=self.file.abspath, num=len(self)) else: self.db.status = Data(start=0, end=0) self.start = self.db.status.start if DEBUG: Log.note("New persistent queue {{name}}", name=self.file.abspath) def _add_pending(self, delta): delta = wrap(delta) self.pending.append(delta) def _apply_pending(self): for delta in self.pending: apply_delta(self.db, delta) self.pending = [] def __iter__(self): """ BLOCKING ITERATOR """ while not self.please_stop: try: value = self.pop() if value is not THREAD_STOP: yield value except Exception as e: Log.warning("Tell me about what happened here", cause=e) if DEBUG: Log.note("queue iterator is done") def add(self, value): with self.lock: if self.closed: Log.error("Queue is closed") if value is THREAD_STOP: if DEBUG: Log.note("Stop is seen in persistent queue") self.please_stop.go() return self._add_pending({"add": {str(self.db.status.end): value}}) self.db.status.end += 1 self._add_pending({"add": {"status.end": self.db.status.end}}) self._commit() return self def __len__(self): with self.lock: return self.db.status.end - self.start def __getitem__(self, item): return self.db[str(item + self.start)] def pop(self, timeout=None): """ :param timeout: OPTIONAL DURATION :return: None, IF timeout PASSES """ with self.lock: while not self.please_stop: if self.db.status.end > self.start: value = self.db[str(self.start)] self.start += 1 return value if timeout is not None: with suppress_exception: self.lock.wait(timeout=timeout) if self.db.status.end <= self.start: return None else: self.lock.wait() if DEBUG: Log.note("persistent queue already stopped") return THREAD_STOP def pop_all(self): """ NON-BLOCKING POP ALL IN QUEUE, IF ANY """ with self.lock: if self.please_stop: return [THREAD_STOP] if self.db.status.end == self.start: return [] output = [] for i in range(self.start, self.db.status.end): output.append(self.db[str(i)]) self.start = self.db.status.end return output def rollback(self): with self.lock: if self.closed: return self.start = self.db.status.start self.pending = [] def commit(self): with self.lock: if self.closed: Log.error("Queue is closed, commit not allowed") try: self._add_pending({"add": {"status.start": self.start}}) for i in range(self.db.status.start, self.start): self._add_pending({"remove": str(i)}) if self.db.status.end - self.start < 10 or Random.range( 0, 1000) == 0: # FORCE RE-WRITE TO LIMIT FILE SIZE # SIMPLY RE-WRITE FILE if DEBUG: Log.note( "Re-write {{num_keys}} 
keys to persistent queue", num_keys=self.db.status.end - self.start) for k in self.db.keys(): if k == "status" or int(k) >= self.db.status.start: continue Log.error("Not expecting {{key}}", key=k) self._commit() self.file.write( mo_json.value2json({"add": self.db}) + "\n") else: self._commit() except Exception as e: raise e def _commit(self): self.file.append("\n".join( mo_json.value2json(p) for p in self.pending)) self._apply_pending() def close(self): self.please_stop.go() with self.lock: if self.db is None: return self.add(THREAD_STOP) if self.db.status.end == self.start: if DEBUG: Log.note("persistent queue clear and closed") self.file.delete() else: if DEBUG: Log.note("persistent queue closed with {{num}} items left", num=len(self)) try: self._add_pending({"add": {"status.start": self.start}}) for i in range(self.db.status.start, self.start): self._add_pending({"remove": str(i)}) self.file.write( mo_json.value2json({"add": self.db}) + "\n" + ("\n".join( mo_json.value2json(p) for p in self.pending)) + "\n") self._apply_pending() except Exception as e: raise e self.db = None @property def closed(self): with self.lock: return self.db is None
def format_list(decoders, aggs, start, query, select): new_edges = count_dim(aggs, decoders) def data(): dims = tuple( len(e.domain.partitions) + (0 if e.allowNulls is False else 1) for e in new_edges) is_sent = Matrix(dims=dims, zeros=0) if query.sort and not query.groupby: # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE all_coord = is_sent._all_combos( ) # TRACK THE EXPECTED COMBINATIONS for _, coord, agg in aggs_iterator(aggs, decoders): missing_coord = all_coord.next() while coord != missing_coord: # INSERT THE MISSING COORDINATE INTO THE GENERATION output = Data() for i, d in enumerate(decoders): output[query.edges[i].name] = d.get_value( missing_coord[i]) for s in select: if s.aggregate == "count": output[s.name] = 0 yield output missing_coord = all_coord.next() output = Data() for e, c, d in zip(query.edges, coord, decoders): output[e.name] = d.get_value(c) for s in select: output[s.name] = s.pull(agg) yield output else: for row, coord, agg in aggs_iterator(aggs, decoders): is_sent[coord] = 1 output = Data() for e, c, d in zip(query.edges, coord, decoders): output[e.name] = d.get_value(c) for s in select: output[s.name] = s.pull(agg) yield output # EMIT THE MISSING CELLS IN THE CUBE if not query.groupby: for c, v in is_sent: if not v: output = Data() for i, d in enumerate(decoders): output[query.edges[i].name] = d.get_value(c[i]) for s in select: if s.aggregate == "count": output[s.name] = 0 yield output output = Data(meta={"format": "list"}, data=list(data())) return output
def __data__(self): output = Data({k:getattr(self,k) for k in vars(self)}) output.cause=unwraplist([c.__data__() for c in listwrap(output.cause)]) return output
def get_more(please_stop): more.append( es.search( Data(query=more_filter, stored_fields=es_query.stored_fields)))
def _max(depth, cube): if depth == 0: return cube elif depth == 1: return _MAX(cube) else: return _MAX(_max(depth - 1, c) for c in cube) def _min(depth, cube): if depth == 0: return cube elif depth == 1: return _MIN(cube) else: return _MIN(_min(depth - 1, c) for c in cube) aggregates = Data(max=_max, maximum=_max, min=_min, minimum=_min) def _iter(cube, depth): if depth == 1: return cube.__iter__() else: def iterator(): for c in cube: for b in _iter(c, depth - 1): yield b return iterator()
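# Stand-alone illustration of the recursive cube helpers above: `depth` says how many list
# levels to descend before applying the scalar reduction (the builtin max stands in for
# the project's _MAX here).
def cube_max(depth, cube):
    if depth == 0:
        return cube
    if depth == 1:
        return max(cube)
    return max(cube_max(depth - 1, c) for c in cube)

cube = [[1, 5, 2], [7, 3]]              # a 2-deep "cube"
print(cube_max(2, cube))                # 7
print([cube_max(1, c) for c in cube])   # [5, 7]: reduce only the innermost level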
def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data( ) # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance( s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function( {"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_" + literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] elif s.aggregate: formula.append(s) for canonical_name, many in new_select.items(): for s in many: es_cols = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_count") if es_col.type == EXISTS: canonical_names.append(cn + ".doc_count") es_query.aggs[cn].filter.range = { es_col.es_column: { "gt": 0 } } else: canonical_names.append(cn + ".value") es_query.aggs[cn].value_count.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function( {"add": canonical_names}) elif s.aggregate == "median": if len(es_cols) > 1: Log.error( "Do not know how to count columns with more than one type (script probably)" ) # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": if len(es_cols) > 1: Log.error( "Do not know how to count columns with more than one type (script probably)" ) # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance( s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error( "Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function( key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({ "add": [cn + ".value" for cn in canonical_names], "default": 0 }) elif s.aggregate == "stats": if len(es_cols) > 1: Log.error( "Do not know how to count columns with more than one type (script probably)" ) # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = es_cols[ 0].es_column # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = es_cols[ 0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for es_col in es_cols: script = { "scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', 'map_script': 'for (v in doc[' + quote(es_col.es_column) + '].values) params._agg.terms.add(v)', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', } } stats_name = encode_property(es_col.es_column) if es_col.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append( jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { "nested": { "path": es_col.nested_path[0] }, "aggs": { "_nested": script } } pulls.append( jx_expression_to_function(stats_name + "._nested.value")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION(p(row) for p in pulls) else: if len(es_cols) > 1: Log.error( "Do not know how to count columns with more than one type (script probably)" ) # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field( canonical_name )].extended_stats.field = es_cols[0].es_column s.pull = jx_expression_to_function({ "coalesce": [ literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default ] }) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field( canonical_name)].value_count.script = s.value.partial_eval( ).to_painless(schema).script(schema) s.pull = jx_expression_to_function( literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_painless( schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_painless( schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function( key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_painless( schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[ stats_name].extended_stats.script = s.value.to_painless( schema).script(schema) # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[ median_name].percentiles.script = s.value.to_painless( schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_painless( schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[ canonical_name].extended_stats.script = s.value.to_painless( schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)}) es_query = wrap({ "aggs": { "_nested": set_default({"nested": { "path": schema.query_path }}, es_query) } }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)}) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce( result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[ query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
def _output(): for g, v in itertools.groupby(data, get_key): group = Data() for k, gg in zip(keys, g): group[k] = gg yield (group, wrap(list(v)))
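# The grouping idiom above, sketched with the standard library.  itertools.groupby only
# merges *adjacent* rows, so the data must already be sorted by the key columns (the caller
# of _output() is assumed to have done that); key names and rows here are made up.
import itertools

keys = ("shop", "day")
get_key = lambda d: tuple(d[k] for k in keys)
data = sorted(
    [
        {"shop": "A", "day": 1, "n": 3},
        {"shop": "A", "day": 1, "n": 4},
        {"shop": "B", "day": 2, "n": 1},
    ],
    key=get_key,
)

for g, rows in itertools.groupby(data, get_key):
    group = dict(zip(keys, g))           # e.g. {'shop': 'A', 'day': 1}
    print(group, list(rows))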
def _update_cardinality(self, c): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if c.type in STRUCT: Log.error("not supported") try: if c.es_index == "meta.columns": with self.meta.columns.locker: partitions = jx.sort([ g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "last_updated": Date.now() }, "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return if c.es_index == "meta.tables": with self.meta.columns.locker: partitions = jx.sort([ g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "last_updated": Date.now() }, "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return es_index = c.es_index.split(".")[0] result = self.default_es.post("/" + es_index + "/_search", data={ "aggs": { c.names["."]: _counting_query(c) }, "size": 0 }) r = result.aggregations.values()[0] count = result.hits.total cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None) if cardinality == None: Log.error("logic error") query = Data(size=0) if cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): if DEBUG: Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: if DEBUG: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return elif len(c.nested_path) != 1: query.aggs[literal_field(c.names["."])] = { "nested": { "path": c.nested_path[0] }, "aggs": { "_nested": { "terms": { "field": c.es_column, "size": 0 } } } } else: query.aggs[literal_field(c.names["."])] = { "terms": { "field": c.es_column, "size": 0 } } result = self.default_es.post("/" + es_index + "/_search", data=query) aggs = result.aggregations.values()[0] if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) if DEBUG: Log.note("{{field}} has {{parts}}", field=c.name, parts=parts) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": parts, "last_updated": Date.now() }, "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) except Exception as e: if "IndexMissingException" in e and c.es_index.startswith( TEST_TABLE_PREFIX): with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": 0, "cardinality": 0, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "partitions", ], "where": { "eq": { "names.\\.": ".", 
"es_index": c.es_index, "es_column": c.es_column } } }) Log.warning( "Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
def apply_diff(text, diff, reverse=False, verify=True): """ SOME EXAMPLES OF diff #@@ -1 +1 @@ #-before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace. #+before china goes live (end January developer release, June general audience release) , the content team will have to manually update the settings for the china-ready apps currently in marketplace. @@ -0,0 +1,3 @@ +before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace. + +kward has the details. @@ -1 +1 @@ -before china goes live (end January developer release, June general audience release), the content team will have to manually update the settings for the china-ready apps currently in marketplace. +before china goes live , the content team will have to manually update the settings for the china-ready apps currently in marketplace. @@ -3 +3 ,6 @@ -kward has the details.+kward has the details. + +Target Release Dates : +https://mana.mozilla.org/wiki/display/PM/Firefox+OS+Wave+Launch+Cross+Functional+View + +Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40 """ if not diff: return text output = text hunks = [ (new_diff[start_hunk], new_diff[start_hunk + 1:end_hunk]) for new_diff in [[ d.lstrip() for d in diff if d.lstrip() and d != "\\ No newline at end of file" ] + ["@@"]] # ANOTHER REPAIR for start_hunk, end_hunk in pairwise( i for i, l in enumerate(new_diff) if l.startswith('@@')) ] for header, hunk_body in (reversed(hunks) if reverse else hunks): matches = DIFF_PREFIX.match(header.strip()) if not matches: if not _Log: _late_import() _Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff) removes = tuple(int(i.strip()) for i in matches.group(1).split( ",")) # EXPECTING start_line, length TO REMOVE remove = Data( start=removes[0], length=1 if len(removes) == 1 else removes[1]) # ASSUME FIRST LINE adds = tuple(int(i.strip()) for i in matches.group(2).split( ",")) # EXPECTING start_line, length TO ADD add = Data(start=adds[0], length=1 if len(adds) == 1 else adds[1]) if add.length == 0 and add.start == 0: add.start = remove.start def repair_hunk(hunk_body): # THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST # ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE # EXAMPLE: -kward has the details.+kward has the details. 
# DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF if reverse: last_lines = [ o for b, o in zip(reversed(hunk_body), reversed(output)) if b != "+" + o ] if not last_lines: return hunk_body last_line = last_lines[0] for problem_index, problem_line in enumerate(hunk_body): if problem_line.startswith('-') and problem_line.endswith( '+' + last_line): split_point = len(problem_line) - (len(last_line) + 1) break elif problem_line.startswith('+' + last_line + "-"): split_point = len(last_line) + 1 break else: return hunk_body else: if not output: return hunk_body last_line = output[-1] for problem_index, problem_line in enumerate(hunk_body): if problem_line.startswith('+') and problem_line.endswith( '-' + last_line): split_point = len(problem_line) - (len(last_line) + 1) break elif problem_line.startswith('-' + last_line + "+"): split_point = len(last_line) + 1 break else: return hunk_body new_hunk_body = ( hunk_body[:problem_index] + [problem_line[:split_point], problem_line[split_point:]] + hunk_body[problem_index + 1:]) return new_hunk_body hunk_body = repair_hunk(hunk_body) if reverse: new_output = (output[:add.start - 1] + [d[1:] for d in hunk_body if d and d[0] == '-'] + output[add.start + add.length - 1:]) else: new_output = (output[:add.start - 1] + [d[1:] for d in hunk_body if d and d[0] == '+'] + output[add.start + remove.length - 1:]) output = new_output if verify: original = apply_diff(output, diff, not reverse, False) if set(text) != set(original): # bugzilla-etl diffs are a jumble for t, o in zip_longest(text, original): if t in ['reports: https://goo.gl/70o6w6\r']: break # KNOWN INCONSISTENCIES if t != o: if not _Log: _late_import() _Log.error("logical verification check failed") break return output
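# Hedged usage sketch for apply_diff() above: `text` and `diff` are lists of lines, the
# hunk is taken from the function's own docstring examples, and the surrounding module
# (DIFF_PREFIX, pairwise, ...) is assumed to be importable.  verify=False skips the
# round-trip check for brevity.
text = []
diff = [
    "@@ -0,0 +1,3 @@",
    "+before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.",
    "+",
    "+kward has the details.",
]
new_text = apply_diff(text, diff, verify=False)                    # three lines are inserted
undone = apply_diff(new_text, diff, reverse=True, verify=False)    # back to the empty list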
def __getitem__(self, item): if item == "from": return self.frum return Data.__getitem__(self, item)
def _set_op(self, query, frum): # GET LIST OF COLUMNS frum_path = split_field(frum) primary_nested_path = join_field(frum_path[1:]) vars_ = UNION([s.value.vars() for s in listwrap(query.select)]) schema = self.sf.tables[primary_nested_path].schema nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } active_columns = {".": []} for cname, cols in schema.items(): if any(startswith_field(cname, v) for v in vars_): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) for nested_path, s in self.sf.tables.items(): for cname, cols in s.schema.items(): if not any(startswith_field(cname, c.names[c.nested_path[0]]) for n, cc in active_columns.items() for c in cc): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) # ANY VARS MENTIONED WITH NO COLUMNS? for v in vars_: if not any(startswith_field(cname, v) for cname in schema.keys()): active_columns["."].append(Column( names={".": v}, type="null", es_column=".", es_index=".", nested_path=["."] )) # EVERY COLUMN, AND THE INDEX IT TAKES UP index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) index_to_uid = {} # FROM NESTED PATH TO THE INDEX OF UID sql_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } sorts = [] if query.sort: for s in query.sort: col = s.value.to_sql(schema)[0] for t, sql in col.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) sql_selects.append(sql + " AS " + column_alias) if s.sort == -1: sorts.append(column_alias + " IS NOT NULL") sorts.append(column_alias + " DESC") else: sorts.append(column_alias + " IS NULL") sorts.append(column_alias) selects = [] primary_doc_details = Data() # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED for nested_path, sub_table in self.sf.tables.items(): nested_doc_details = { "sub_table": sub_table, "children": [], "index_to_column": {}, "nested_path": [nested_path] # fake the real nested path, we only look at [0] anyway } # INSERT INTO TREE if not primary_doc_details: primary_doc_details = nested_doc_details else: def place(parent_doc_details): if startswith_field(nested_path, parent_doc_details['nested_path'][0]): for c in parent_doc_details['children']: if place(c): return True parent_doc_details['children'].append(nested_doc_details) place(primary_doc_details) alias = nested_doc_details['alias'] = nest_to_alias[nested_path] if nested_path=="." and quoted_GUID in vars_: column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." 
+ quoted_GUID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name="_id", push_column_name="_id", push_column=0, push_child=".", sql=sql_select, pull=get_column(column_number), type="string", column_alias=_make_column_name(column_number), nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) query.select = [s for s in listwrap(query.select) if s.name!="_id"] # WE ALWAYS ADD THE UID AND ORDER column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." + quoted_UID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) if nested_path !=".": index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) column_number = len(sql_selects) sql_select = alias + "." + quote_table(ORDER) sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM if nested_path not in active_columns: continue if len(active_columns[nested_path]) != 0: # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE si = 0 for s in listwrap(query.select): try: column_number = len(sql_selects) s.pull = get_column(column_number) db_columns = s.value.to_sql(schema) if isinstance(s.value, LeavesOp): for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=literal_field(get_property_name(concat_field(s.name, column.name))), push_column_name=get_property_name(concat_field(s.name, column.name)), push_column=si, push_child=".", pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) si += 1 else: for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." 
+ unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=column.name, pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) finally: si += 1 elif startswith_field(nested_path, primary_nested_path): # ADD REQUIRED COLUMNS, FOR DEEP STUFF for ci, c in enumerate(active_columns[nested_path]): if c.type in STRUCT: continue column_number = len(sql_selects) nested_path = c.nested_path unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column) column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=relative_field(c.names["."], s.name), pull=get_column(column_number), sql=unsorted_sql, type=c.type, column_alias=column_alias, nested_path=nested_path ) where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b unsorted_sql = self._make_sql_for_one_nest_in_set_op( ".", sql_selects, where_clause, active_columns, index_to_column ) for n, _ in self.sf.tables.items(): sorts.append(COLUMN + text_type(index_to_uid[n])) ordered_sql = ( "SELECT * FROM (\n" + unsorted_sql + "\n)" + "\nORDER BY\n" + ",\n".join(sorts) + "\nLIMIT " + quote_value(query.limit) ) self.db.create_new_functions() #creating new functions: regexp result = self.db.query(ordered_sql) def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord): """ :param rows: REVERSED STACK OF ROWS (WITH push() AND pop()) :param row: CURRENT ROW BEING EXTRACTED :param nested_doc_details: { "nested_path": wrap_nested_path(nested_path), "index_to_column": map from column number to column details "children": all possible direct decedents' nested_doc_details } :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop) :param parent_id_coord: the column number for the parent id (so we ca extract from each row) :return: the nested property (usually an array) """ previous_doc_id = None doc = Null output = [] id_coord = nested_doc_details['id_coord'] while True: doc_id = row[id_coord] if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id): rows.append(row) # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc) return output if doc_id != previous_doc_id: previous_doc_id = doc_id doc = Null curr_nested_path = nested_doc_details['nested_path'][0] index_to_column = nested_doc_details['index_to_column'].items() if index_to_column: for i, c in index_to_column: value = row[i] if value == None: continue if value == '': continue if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=join_field([c.push_name]+split_field(c.push_child)) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path=c.push_child if relative_path == ".": doc = value elif doc is Null: doc = Data() doc[relative_path] = value else: doc[relative_path] = value for child_details in nested_doc_details['children']: # EACH NESTED TABLE MUST BE ASSEMBLED INTO 
A LIST OF OBJECTS child_id = row[child_details['id_coord']] if child_id is not None: nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord) if nested_value: push_name = child_details['nested_path'][0] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=relative_field(push_name, curr_nested_path) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path="." if relative_path == "." and doc is Null: doc = nested_value elif relative_path == ".": doc[push_name] = unwraplist([v[push_name] for v in nested_value]) elif doc is Null: doc = Data() doc[relative_path] = unwraplist(nested_value) else: doc[relative_path] = unwraplist(nested_value) output.append(doc) try: row = rows.pop() except IndexError: return output cols = tuple([i for i in index_to_column.values() if i.push_name != None]) rows = list(reversed(unwrap(result.data))) if rows: row = rows.pop() data = _accumulate_nested(rows, row, primary_doc_details, None, None) else: data = result.data if query.format == "cube": for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): num_rows = len(result.data) num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0 map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [[None]*num_rows for _ in range(num_cols)] for rownum, d in enumerate(result.data): for c in cols: if c.push_child == ".": temp_data[c.push_column][rownum] = c.pull(d) else: column = temp_data[c.push_column][rownum] if column is None: column = temp_data[c.push_column][rownum] = Data() column[c.push_child] = c.pull(d) output = Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) return output if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = Data() for rownum, d in enumerate(data): for k, v in d.items(): if temp_data[k] == None: temp_data[k] = [None] * num_rows temp_data[k][rownum] = v return Data( meta={"format": "cube"}, data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) else: num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [data] return Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": for f, _ in self.sf.tables.items(): if frum.endswith(f): num_column = MAX([c.push_column for c in cols])+1 header = [None]*num_column for c in cols: header[c.push_column] = c.push_column_name output_data = [] for d in result.data: row = [None] * num_column for c in cols: set_column(row, c.push_column, c.push_child, c.pull(d)) output_data.append(row) return Data( meta={"format": "table"}, header=header, data=output_data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) column_names= [None]*(max(c.push_column for c in cols) + 1) for c in cols: column_names[c.push_column] = c.push_column_name temp_data = [] for rownum, d in enumerate(data): row =[None] * len(column_names) for 
i, (k, v) in enumerate(sorted(d.items())): for c in cols: if k==c.push_name: row[c.push_column] = v temp_data.append(row) return Data( meta={"format": "table"}, header=column_names, data=temp_data ) else: column_names = listwrap(query.select).name return Data( meta={"format": "table"}, header=column_names, data=[[d] for d in data] ) else: for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): data = [] for d in result.data: row = Data() for c in cols: if c.push_child == ".": row[c.push_name] = c.pull(d) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(d) elif not isinstance(query.select, list): # select is value type row[c.push_child]=c.pull(d) else: row[c.push_name][c.push_child] = c.pull(d) data.append(row) return Data( meta={"format": "list"}, data=data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): temp_data=[] for rownum, d in enumerate(data): row = {} for k, v in d.items(): for c in cols: if c.push_name==c.push_column_name==k: row[c.push_column_name] = v elif c.push_name==k and c.push_column_name!=k: row[c.push_column_name] = v temp_data.append(row) return Data( meta={"format": "list"}, data=temp_data ) else: return Data( meta={"format": "list"}, data=data )
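# A sketch (values invented, not from the source) of the three response envelopes
# assembled above, to make the format branches easier to follow:
#
#   format == "cube":   {"meta": {"format": "cube"},
#                        "data": {"a": [1, 2], "b": ["x", "y"]},
#                        "edges": [{"name": "rownum",
#                                   "domain": {"type": "rownum", "min": 0, "max": 2, "interval": 1}}]}
#
#   format == "table":  {"meta": {"format": "table"},
#                        "header": ["a", "b"],
#                        "data": [[1, "x"], [2, "y"]]}
#
#   otherwise ("list"): {"meta": {"format": "list"},
#                        "data": [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]}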
def apply_diff(text, diff, reverse=False, verify=True):
    """
    SOME EXAMPLES OF diff
    #@@ -1 +1 @@
    #-before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    #+before china goes live (end January developer release, June general audience release) , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -0,0 +1,3 @@
    +before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +
    +kward has the details.
    @@ -1 +1 @@
    -before china goes live (end January developer release, June general audience release), the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +before china goes live , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -3 +3 ,6 @@
    -kward has the details.+kward has the details.
    +
    +Target Release Dates :
    +https://mana.mozilla.org/wiki/display/PM/Firefox+OS+Wave+Launch+Cross+Functional+View
    +
    +Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40
    """
    if not diff:
        return text
    output = text
    hunks = [
        (new_diff[start_hunk], new_diff[start_hunk + 1:end_hunk])
        for new_diff in [[d.lstrip() for d in diff if d.lstrip() and d != "\\ No newline at end of file"] + ["@@"]]  # ANOTHER REPAIR
        for start_hunk, end_hunk in pairwise(i for i, l in enumerate(new_diff) if l.startswith('@@'))
    ]
    for header, hunk_body in (reversed(hunks) if reverse else hunks):
        matches = DIFF_PREFIX.match(header.strip())
        if not matches:
            if not _Log:
                _late_import()
            _Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff)

        removes = tuple(int(i.strip()) for i in matches.group(1).split(","))  # EXPECTING start_line, length TO REMOVE
        remove = Data(start=removes[0], length=1 if len(removes) == 1 else removes[1])  # ASSUME FIRST LINE
        adds = tuple(int(i.strip()) for i in matches.group(2).split(","))  # EXPECTING start_line, length TO ADD
        add = Data(start=adds[0], length=1 if len(adds) == 1 else adds[1])

        if add.length == 0 and add.start == 0:
            add.start = remove.start

        def repair_hunk(hunk_body):
            # THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST
            # ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE
            # EXAMPLE: -kward has the details.+kward has the details.
            # DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF
            if reverse:
                last_lines = [
                    o
                    for b, o in zip(reversed(hunk_body), reversed(output))
                    if b != "+" + o
                ]
                if not last_lines:
                    return hunk_body

                last_line = last_lines[0]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('-') and problem_line.endswith('+' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('+' + last_line + "-"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body
            else:
                if not output:
                    return hunk_body
                last_line = output[-1]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('+') and problem_line.endswith('-' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('-' + last_line + "+"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body

            new_hunk_body = (
                hunk_body[:problem_index] +
                [problem_line[:split_point], problem_line[split_point:]] +
                hunk_body[problem_index + 1:]
            )
            return new_hunk_body

        hunk_body = repair_hunk(hunk_body)

        if reverse:
            new_output = (
                output[:add.start - 1] +
                [d[1:] for d in hunk_body if d and d[0] == '-'] +
                output[add.start + add.length - 1:]
            )
        else:
            new_output = (
                output[:add.start - 1] +
                [d[1:] for d in hunk_body if d and d[0] == '+'] +
                output[add.start + remove.length - 1:]
            )
        output = new_output

    if verify:
        original = apply_diff(output, diff, not reverse, False)
        if set(text) != set(original):  # bugzilla-etl diffs are a jumble
            for t, o in zip_longest(text, original):
                if t in ['reports: https://goo.gl/70o6w6\r']:
                    break  # KNOWN INCONSISTENCIES
                if t != o:
                    if not _Log:
                        _late_import()
                    _Log.error("logical verification check failed")
                    break

    return output
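# A minimal usage sketch (not from the source). It assumes both the document and
# the diff arrive as lists of lines, and that hunks carry no context lines, which
# matches the examples in the docstring above.
def _example_apply_diff():
    original = ["hello world", "second line"]
    diff = [
        "@@ -1 +1 @@",
        "-hello world",
        "+hello there world",
    ]
    patched = apply_diff(original, diff)            # ["hello there world", "second line"]
    restored = apply_diff(patched, diff, reverse=True)
    assert restored == original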
def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
    """
    :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
    :param row: CURRENT ROW BEING EXTRACTED
    :param nested_doc_details: {
        "nested_path": wrap_nested_path(nested_path),
        "index_to_column": map from column number to column details
        "children": all possible direct descendants' nested_doc_details
    }
    :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
    :param parent_id_coord: the column number for the parent id (so we can extract from each row)
    :return: the nested property (usually an array)
    """
    previous_doc_id = None
    doc = Null
    output = []
    id_coord = nested_doc_details['id_coord']

    while True:
        doc_id = row[id_coord]

        if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
            rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
            return output

        if doc_id != previous_doc_id:
            previous_doc_id = doc_id
            doc = Null
            curr_nested_path = nested_doc_details['nested_path'][0]
            index_to_column = nested_doc_details['index_to_column'].items()
            if index_to_column:
                for i, c in index_to_column:
                    value = row[i]
                    if value == None:
                        continue
                    if value == '':
                        continue
                    if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        relative_path = join_field([c.push_name] + split_field(c.push_child))
                    else:
                        # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                        relative_path = c.push_child

                    if relative_path == ".":
                        doc = value
                    elif doc is Null:
                        doc = Data()
                        doc[relative_path] = value
                    else:
                        doc[relative_path] = value

        for child_details in nested_doc_details['children']:
            # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
            child_id = row[child_details['id_coord']]
            if child_id is not None:
                nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                if nested_value:
                    push_name = child_details['nested_path'][0]
                    if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        relative_path = relative_field(push_name, curr_nested_path)
                    else:
                        # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                        relative_path = "."

                    if relative_path == "." and doc is Null:
                        doc = nested_value
                    elif relative_path == ".":
                        doc[push_name] = unwraplist([v[push_name] for v in nested_value])
                    elif doc is Null:
                        doc = Data()
                        doc[relative_path] = unwraplist(nested_value)
                    else:
                        doc[relative_path] = unwraplist(nested_value)

        output.append(doc)

        try:
            row = rows.pop()
        except IndexError:
            return output
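# Hypothetical shape of nested_doc_details (field values invented), matching the
# docstring above: one entry per table in the snowflake, with children linking to
# the directly nested tables.
#
#   {
#       "nested_path": ["."],                      # this table's nested path
#       "id_coord": 0,                             # column index holding this table's row id
#       "index_to_column": {2: col_a, 3: col_b},   # result-column index -> column details
#       "children": [                              # one entry per directly nested table
#           {"nested_path": ["a.b", "."], "id_coord": 5, "index_to_column": {...}, "children": []}
#       ]
#   }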
def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # loses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            if BRANCH_WHITELIST is not None:
                found = False
                for br in BRANCH_WHITELIST:
                    if br in str(detail.name):
                        found = True
                        break
                if not found:
                    continue

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output
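# Illustrative examples (paths invented) of how the locale markup above maps an
# hg.mozilla.org index path to (name, locale):
#
#   "/l10n-central/de/"                -> name="mozilla-central", locale="de"
#   "/releases/l10n/mozilla-beta/fr/"  -> name="mozilla-beta",    locale="fr"
#   "/releases/gaia-l10n/v2_1/es/"     -> name="gaia-2_1",        locale="es"
#   "/weave-l10n/pt-BR/"               -> name="weave",           locale="pt-BR"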
def __new__(cls, e=None, query=None, *args, **kwargs):
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, (text_type, binary_type)):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, LeavesOp):
            return object.__new__(ObjectDecoder, e)
        elif isinstance(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(isinstance(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")
            e.domain = Data(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder, e)
        elif isinstance(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder, e)
            if len(cols) != 1:
                return object.__new__(ObjectDecoder, e)
            col = cols[0]
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            if col.partitions != None:
                if col.multi > 1 and len(col.partitions) < 6:
                    return object.__new__(MultivalueDecoder)

                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.range:
        return object.__new__(GeneralRangeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
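# Rough summary of the dispatch above (illustrative only, derived from the branches;
# not an exhaustive restatement):
#
#   LeavesOp value                           -> ObjectDecoder
#   TupleOp of Variables                     -> DimFieldListDecoder
#   Variable, no matching columns            -> DefaultDecoder
#   Variable, more than one column           -> ObjectDecoder
#   Variable, small multi-valued partitions  -> MultivalueDecoder
#   Variable with known partitions           -> falls through with a SimpleSetDomain
#   domain type in PARTITION, or a Dimension -> SetDecoder
#   domain type "time"/"duration"/"range"    -> Time/Duration/RangeDecoder
#   e.range                                  -> GeneralRangeDecoder
#   no value, dimension.fields               -> DimFieldListDecoder
#   no value, partitions with where clauses  -> GeneralSetDecoder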
def selector(d):
    output = Data()
    for n, p in push_and_pull:
        output[n] = unwraplist(p(wrap(d)))
    return unwrap(output)
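# A usage sketch (not from the source): push_and_pull is assumed to be a list of
# (output_name, pull_function) pairs captured by the closure, where each pull
# function extracts one value from a wrapped row.
#
#   push_and_pull = [
#       ("id",   lambda row: row["_id"]),
#       ("name", lambda row: row["author"]["name"]),
#   ]
#   selector({"_id": 42, "author": {"name": "kyle"}})
#   # -> {"id": 42, "name": "kyle"}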
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) # [0] is a cheat; each es_column should be a dict of columns keyed on type, like in sqlite es_column_map = {v: frum.schema[v][0].es_column for v in query.vars()} es_query = Data() new_select = Data() #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: #TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_"+literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = s.value.map(es_column_map) if isinstance(abs_value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." 
+ aggregates1_4[s.aggregate] es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( { "nested": { "path": frum.query_path } }, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
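# A sketch (names invented, not from the source) of the aggregation body the code
# above builds for a hypothetical select {"value": "score", "aggregate": "median"}:
#
#   {"aggs": {"score percentile": {"percentiles": {"field": "score", "percents": [50]}}},
#    "size": 0}
#
# and the result is later read back from the path "score percentile.values.50\.0".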
def parse(json, query_path, expected_vars=NO_VARS): """ INTENDED TO TREAT JSON AS A STREAM; USING MINIMAL MEMORY WHILE IT ITERATES THROUGH THE STRUCTURE. ASSUMING THE JSON IS LARGE, AND HAS A HIGH LEVEL ARRAY STRUCTURE, IT WILL yield EACH OBJECT IN THAT ARRAY. NESTED ARRAYS ARE HANDLED BY REPEATING THE PARENT PROPERTIES FOR EACH MEMBER OF THE NESTED ARRAY. DEEPER NESTED PROPERTIES ARE TREATED AS PRIMITIVE VALUES; THE STANDARD JSON DECODER IS USED. LARGE MANY-PROPERTY OBJECTS CAN BE HANDLED BY `items()` :param json: SOME STRING-LIKE STRUCTURE THAT CAN ASSUME WE LOOK AT ONE CHARACTER AT A TIME, IN ORDER :param query_path: A DOT-SEPARATED STRING INDICATING THE PATH TO THE NESTED ARRAY OPTIONALLY, {"items":query_path} TO FURTHER ITERATE OVER PROPERTIES OF OBJECTS FOUND AT query_path :param expected_vars: REQUIRED PROPERTY NAMES, USED TO DETERMINE IF MORE-THAN-ONE PASS IS REQUIRED :return: RETURNS AN ITERATOR OVER ALL OBJECTS FROM ARRAY LOCATED AT query_path """ if hasattr(json, "read"): # ASSUME IT IS A STREAM temp = json def get_more(): return temp.read(MIN_READ_SIZE) json = List_usingStream(get_more) elif hasattr(json, "__call__"): json = List_usingStream(json) elif isinstance(json, GeneratorType): json = List_usingStream(json.next) else: Log.error( "Expecting json to be a stream, or a function that will return more bytes" ) def _iterate_list(index, c, parent_path, path, expected_vars): c, index = skip_whitespace(index) if c == b']': yield index return while True: if not path: index = _assign_token(index, c, expected_vars) c, index = skip_whitespace(index) if c == b']': yield index _done(parent_path) return elif c == b',': yield index c, index = skip_whitespace(index) else: for index in _decode_token(index, c, parent_path, path, expected_vars): c, index = skip_whitespace(index) if c == b']': yield index _done(parent_path) return elif c == b',': yield index c, index = skip_whitespace(index) def _done(parent_path): if len(parent_path) < len(done[0]): done[0] = parent_path def _decode_object(index, c, parent_path, query_path, expected_vars): if "." 
in expected_vars: if len(done[0]) <= len(parent_path) and all( d == p for d, p in zip(done[0], parent_path)): Log.error("Can not pick up more variables, iterator is done") if query_path: Log.error("Can not extract objects that contain the iteration", var=join_field(query_path)) index = _assign_token(index, c, expected_vars) # c, index = skip_whitespace(index) yield index return did_yield = False while True: c, index = skip_whitespace(index) if c == b',': continue elif c == b'"': name, index = simple_token(index, c) c, index = skip_whitespace(index) if c != b':': Log.error("Expecting colon") c, index = skip_whitespace(index) child_expected = needed(name, expected_vars) child_path = parent_path + [name] if any(child_expected): if not query_path: index = _assign_token(index, c, child_expected) elif query_path[0] == name: for index in _decode_token(index, c, child_path, query_path[1:], child_expected): did_yield = True yield index else: if len(done[0]) <= len(child_path): Log.error( "Can not pick up more variables, iterator over {{path}} is done", path=join_field(done[0])) index = _assign_token(index, c, child_expected) elif query_path and query_path[0] == name: for index in _decode_token(index, c, child_path, query_path[1:], child_expected): yield index else: index = jump_to_end(index, c) elif c == b"}": if not did_yield: yield index break def set_destination(expected_vars, value): for i, e in enumerate(expected_vars): if e is None: pass elif e == ".": destination[i] = value elif is_data(value): destination[i] = value[e] else: destination[i] = Null def _decode_object_items(index, c, parent_path, query_path, expected_vars): """ ITERATE THROUGH THE PROPERTIES OF AN OBJECT """ c, index = skip_whitespace(index) num_items = 0 while True: if c == b',': c, index = skip_whitespace(index) elif c == b'"': name, index = simple_token(index, c) if "name" in expected_vars: for i, e in enumerate(expected_vars): if e == "name": destination[i] = name c, index = skip_whitespace(index) if c != b':': Log.error("Expecting colon") c, index = skip_whitespace(index) child_expected = needed("value", expected_vars) index = _assign_token(index, c, child_expected) c, index = skip_whitespace(index) DEBUG and not num_items % 1000 and Log.note( "{{num}} items iterated", num=num_items) yield index num_items += 1 elif c == b"}": break def _decode_token(index, c, parent_path, query_path, expected_vars): if c == b'{': if query_path and query_path[0] == "$items": if any(expected_vars): for index in _decode_object_items(index, c, parent_path, query_path[1:], expected_vars): yield index else: index = jump_to_end(index, c) yield index elif not any(expected_vars): index = jump_to_end(index, c) yield index else: for index in _decode_object(index, c, parent_path, query_path, expected_vars): yield index elif c == b'[': for index in _iterate_list(index, c, parent_path, query_path, expected_vars): yield index else: index = _assign_token(index, c, expected_vars) yield index def _assign_token(index, c, expected_vars): if not any(expected_vars): return jump_to_end(index, c) value, index = simple_token(index, c) set_destination(expected_vars, value) return index def jump_to_end(index, c): """ DO NOT PROCESS THIS JSON OBJECT, JUST RETURN WHERE IT ENDS """ if c == b'"': while True: c = json[index] index += 1 if c == b'\\': index += 1 elif c == b'"': break return index elif c not in b"[{": while True: c = json[index] index += 1 if c in b',]}': break return index - 1 # OBJECTS AND ARRAYS ARE MORE INVOLVED stack = [None] * 1024 stack[0] = 
CLOSE[c] i = 0 # FOR INDEXING THE STACK while True: c = json[index] index += 1 if c == b'"': while True: c = json[index] index += 1 if c == b'\\': index += 1 elif c == b'"': break elif c in b'[{': i += 1 stack[i] = CLOSE[c] elif c == stack[i]: i -= 1 if i == -1: return index # FOUND THE MATCH! RETURN elif c in b']}': Log.error("expecting {{symbol}}", symbol=stack[i]) def simple_token(index, c): if c == b'"': json.mark(index - 1) while True: c = json[index] index += 1 if c == b"\\": index += 1 elif c == b'"': break return json_decoder(json.release(index).decode("utf8")), index elif c in b"{[": json.mark(index - 1) index = jump_to_end(index, c) value = wrap(json_decoder(json.release(index).decode("utf8"))) return value, index elif c == b"t" and json.slice(index, index + 3) == b"rue": return True, index + 3 elif c == b"n" and json.slice(index, index + 3) == b"ull": return None, index + 3 elif c == b"f" and json.slice(index, index + 4) == b"alse": return False, index + 4 else: json.mark(index - 1) while True: c = json[index] if c in b',]}': break index += 1 text = json.release(index) try: return float(text), index except Exception: Log.error("Not a known JSON primitive: {{text|quote}}", text=text) def skip_whitespace(index): """ RETURN NEXT NON-WHITESPACE CHAR, AND ITS INDEX """ c = json[index] while c in WHITESPACE: index += 1 c = json[index] return c, index + 1 if is_data(query_path) and query_path.get("items"): path_list = split_field(query_path.get("items")) + [ "$items" ] # INSERT A MARKER SO THAT OBJECT IS STREAM DECODED else: path_list = split_field(query_path) destination = [None] * len(expected_vars) c, index = skip_whitespace(0) done = [path_list + [None]] for _ in _decode_token(index, c, [], path_list, expected_vars): output = Data() for i, e in enumerate(expected_vars): output[e] = destination[i] yield output
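# A minimal usage sketch (filename, property names, and the query_path/expected_vars
# convention shown here are assumptions, not from the source): stream over a large
# JSON document shaped like {"builds": [{...}, {...}, ...]}, yielding one Data() per
# element of the "builds" array while re-using parent properties.
def _example_stream_parse(filename):
    with open(filename, "rb") as f:  # parse() consumes bytes, one character at a time
        for doc in parse(f, query_path="builds", expected_vars=["builds.name", "builds.result"]):
            Log.note("build {{name}}: {{result}}", name=doc["builds.name"], result=doc["builds.result"])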
def __init__(self, db): self.db = db self._snowflakes = Data( ) # MAP FROM BASE TABLE TO LIST OF NESTED PATH TUPLES self.columns = ColumnList(db)
def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] elif s.aggregate: formula.append(s) for canonical_name, many in new_select.items(): for s in many: columns = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_count") if column.jx_type == EXISTS: canonical_names.append(cn + ".doc_count") es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}} else: canonical_names.append(cn+ ".value") es_query.aggs[cn].value_count.field = column.es_column if len(canonical_names) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [percent] es_query.aggs[key].percentiles.tdigest.compression = 2 s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = column.es_column if len(columns) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = columns[0].es_column # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = columns[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for column in columns: script = {"scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', 'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v);', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', }} stats_name = encode_property(column.es_column) if column.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append(jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": script} } pulls.append(jx_expression_to_function(stats_name + "._nested.value")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION(p(row) for p in pulls) else: if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") elif len(columns) <1: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function({"null":{}}) else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" elif s.aggregate in ('max', 'maximum', 'min', 'minimum'): if s.aggregate in ('max', 'maximum'): dir = 1 op = "max" else: dir = -1 op = 'min' nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr selfy = s.value.partial_eval().to_es_script(schema).expr script = {"scripted_metric": { 'init_script': 'params._agg.best = ' + nully + ';', 'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";", 'combine_script': 'return params._agg.best', 'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()', }} if schema.query_path[0] == ".": es_query.aggs[canonical_name] = script s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") else: es_query.aggs[canonical_name] = { "nested": {"path": schema.query_path[0]}, "aggs": {"_nested": script} } s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value") else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif 
s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( {"nested": {"path": schema.query_path[0]}}, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = 
es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
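# A sketch (path name invented) of the request shape produced by the "terrible
# section" above when the fact table is nested (e.g. at "task.run") and part of
# the where clause applies at that depth:
#
#   {"aggs": {"_nested": {
#        "nested": {"path": "task.run"},
#        "aggs": {"_filter": {
#            "filter": { ...esfilter for the deep where... },
#            "aggs": { ...the metric aggregations built above... }
#        }}
#    }},
#    "size": 0}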
def request(method, url, zip=None, retry=None, **kwargs):
    """
    JUST LIKE requests.request() BUT WITH DEFAULT HEADERS AND FIXES
    DEMANDS data IS ONE OF:
    * A JSON-SERIALIZABLE STRUCTURE, OR
    * LIST OF JSON-SERIALIZABLE STRUCTURES, OR
    * None

    Parameters
    * zip - ZIP THE REQUEST BODY, IF BIG ENOUGH
    * json - JSON-SERIALIZABLE STRUCTURE
    * retry - {"times": x, "sleep": y} STRUCTURE

    THE BYTE_STRINGS (b"") ARE NECESSARY TO PREVENT httplib.py FROM **FREAKING OUT**
    IT APPEARS requests AND httplib.py SIMPLY CONCATENATE STRINGS BLINDLY, WHICH
    INCLUDES url AND headers
    """
    global _warning_sent
    if not default_headers and not _warning_sent:
        _warning_sent = True
        Log.warning(
            "The pyLibrary.env.http module was meant to add extra "
            "default headers to all requests, specifically the 'Referer' "
            "header with a URL to the project. Use the `pyLibrary.debug.constants.set()` "
            "function to set `pyLibrary.env.http.default_headers`"
        )

    if isinstance(url, list):
        # TRY MANY URLS
        failures = []
        for remaining, u in jx.countdown(url):
            try:
                response = request(method, u, zip=zip, retry=retry, **kwargs)
                if Math.round(response.status_code, decimal=-2) not in [400, 500]:
                    return response
                if not remaining:
                    return response
            except Exception as e:
                e = Except.wrap(e)
                failures.append(e)
        Log.error("Tried {{num}} urls", num=len(url), cause=failures)

    if b"session" in kwargs:
        session = kwargs[b"session"]
        del kwargs[b"session"]
    else:
        session = sessions.Session()
    session.headers.update(default_headers)

    if zip is None:
        zip = ZIP_REQUEST

    if isinstance(url, unicode):
        # httplib.py WILL **FREAK OUT** IF IT SEES ANY UNICODE
        url = url.encode("ascii")

    _to_ascii_dict(kwargs)
    timeout = kwargs[b'timeout'] = coalesce(kwargs.get(b'timeout'), default_timeout)

    if retry == None:
        retry = Data(times=1, sleep=0)
    elif isinstance(retry, Number):
        retry = Data(times=retry, sleep=1)
    else:
        retry = wrap(retry)
        if isinstance(retry.sleep, Duration):
            retry.sleep = retry.sleep.seconds
        set_default(retry, {"times": 1, "sleep": 0})

    if b'json' in kwargs:
        kwargs[b'data'] = convert.value2json(kwargs[b'json']).encode("utf8")
        del kwargs[b'json']

    try:
        headers = kwargs[b"headers"] = unwrap(coalesce(wrap(kwargs)[b"headers"], {}))
        set_default(headers, {b"accept-encoding": b"compress, gzip"})

        if zip and len(coalesce(kwargs.get(b"data"))) > 1000:
            compressed = convert.bytes2zip(kwargs[b"data"])
            headers[b'content-encoding'] = b'gzip'
            kwargs[b"data"] = compressed
            _to_ascii_dict(headers)
        else:
            _to_ascii_dict(headers)
    except Exception as e:
        Log.error("Request setup failure on {{url}}", url=url, cause=e)

    errors = []
    for r in range(retry.times):
        if r:
            Till(seconds=retry.sleep).wait()

        try:
            if DEBUG:
                Log.note("http {{method}} to {{url}}", method=method, url=url)
            return session.request(method=method, url=url, **kwargs)
        except Exception as e:
            errors.append(Except.wrap(e))

    if " Read timed out." in errors[0]:
        Log.error("Tried {{times}} times: Timeout failure (timeout was {{timeout}})", timeout=timeout, times=retry.times, cause=errors[0])
    else:
        Log.error("Tried {{times}} times: Request failure of {{url}}", url=url, times=retry.times, cause=errors[0])
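# A usage sketch (URL and payload invented): POST a JSON body with a simple retry
# policy; large bodies are gzipped automatically when `zip` is left enabled.
def _example_request():
    response = request(
        "post",
        "http://example.com/query",
        json={"from": "unittest", "limit": 10},
        retry={"times": 3, "sleep": 5},
    )
    return response.content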