def sort_table(result):
    """
    SORT ROWS IN TABLE, EVEN IF ELEMENTS ARE JSON
    """
    data = wrap([{unicode(i): v for i, v in enumerate(row) if v != None} for row in result.data])
    sort_columns = jx.sort(set(jx.get_columns(data, leaves=True).name))
    data = jx.sort(data, sort_columns)
    result.data = [tuple(row[unicode(i)] for i in range(len(result.header))) for row in data]
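# Hedged usage sketch (not from the source): sort_table operates on a
# "table"-format query result, where result.header names the columns and
# result.data is a list of row tuples. The import path for wrap is an
# assumption about this era of the codebase (pyLibrary.dot).
from pyLibrary.dot import wrap

result = wrap({
    "header": ["name", "count"],
    "data": [("b", 2), ("a", 1)],
})
sort_table(result)
# result.data is now in a canonical order, e.g. [("a", 1), ("b", 2)],
# so two tables can be compared without regard to row order.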
def compare_to_expected(query, result, expect):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = zip(*zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = zip(*unwrap(result.data))
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            sort_table(result)
            sort_table(expect)

    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception, _:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception, _:
                    pass
def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in columns]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with insert", e)
def _get_columns(self, table=None, metadata=None):
    # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    if not metadata:
        metadata = self.default_es.get_metadata(force=True)

    def parse_all(please_stop):
        for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            if meta.index != abs_index:
                continue

            for _, properties in meta.mappings.items():
                if please_stop:
                    return
                self._parse_properties(abs_index, properties, meta)

    if table:
        for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            if table == meta.index:
                for _, properties in meta.mappings.items():
                    self._parse_properties(abs_index, properties, meta)
                return
            if table == abs_index:
                self._get_columns(table=meta.index, metadata=metadata)
                return
    else:
        self.parser = Thread.run("parse properties", parse_all)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        short_name = join_field(split_field(table_name)[0:1])
        table = self.get_table(short_name)[0]

        if not table:
            table = Table(
                name=short_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=short_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=short_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(table_name, column_name)
        if columns:
            columns = jx.sort(columns, "name")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.table + "." + c.es_column for c in columns if not c.last_updated]
                    )
                Till(seconds=1).wait()
            return columns
    except Exception, e:
        Log.error("Not expected", cause=e)
def get_columns(self, table_name, column_name=None, fail_when_not_found=False):
    """
    RETURN METADATA COLUMNS
    """
    try:
        with self.meta.columns.locker:
            columns = [
                c
                for c in self.meta.columns.data
                if c.table == table_name and (column_name is None or c.name == column_name)
            ]
        if columns:
            columns = jx.sort(columns, "name")
            if fail_when_not_found:
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.table + "." + c.es_column for c in columns if not c.last_updated]
                    )
                    Thread.sleep(seconds=1)
                return columns
            elif all(columns.get("last_updated")):
                return columns
    except Exception, e:
        Log.error("Not expected", cause=e)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        short_name = join_field(split_field(table_name)[0:1])
        table = self.get_table(short_name)[0]

        if not table:
            table = Table(
                name=short_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=short_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=short_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(table_name, column_name)
        if columns:
            columns = jx.sort(columns, "name")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                Log.note(
                    "waiting for columns to update {{columns|json}}",
                    columns=[c.table + "." + c.es_column for c in columns if not c.last_updated]
                )
                Thread.sleep(seconds=1)
            return columns
    except Exception, e:
        Log.error("Not expected", cause=e)
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: LIST OF (keys, values) PAIRS, WHERE
        keys IS IN LEAF FORM (FOR USE WITH THE {"eq": terms} OPERATOR)
        values IS A GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        accessor = jx_expression_to_function(TupleOp("tuple", keys))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception, e:
        Log.error("Problem grouping", cause=e)
def parse_all(please_stop):
    for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
        if meta.index != abs_index:
            continue

        for _, properties in meta.mappings.items():
            if please_stop:
                return
            self._parse_properties(abs_index, properties, meta)
def done_count(self):
    columns = map(unicode, range(len(self.fields)))
    parts = wrap([{unicode(i): p for i, p in enumerate(part)} for part in set(self.parts)])
    self.parts = None
    sorted_parts = jx.sort(parts, columns)

    self.edge.domain = self.domain = SimpleSetDomain(
        key="value",
        partitions=[
            {"value": tuple(v[k] for k in columns), "dataIndex": i}
            for i, v in enumerate(sorted_parts)
        ]
    )
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: LIST OF (keys, values) PAIRS, WHERE
        keys IS IN LEAF FORM (FOR USE WITH THE {"eq": terms} OPERATOR)
        values IS A GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
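# Hedged usage sketch (data and import path are assumptions, not from the
# source): group plain dicts by a key and walk the (group, rows) pairs the
# generator yields. With contiguous=False the data is first sorted by the
# keys, so equal keys always end up adjacent.
from pyLibrary.queries import jx

tasks = [
    {"state": "done", "id": 1},
    {"state": "open", "id": 2},
    {"state": "done", "id": 3},
]
for group, rows in jx.groupby(tasks, keys="state"):
    # group is a Data object, e.g. {"state": "done"}; rows is the slice of
    # sorted records that share that key
    ids = [r["id"] for r in rows]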
def _es_terms2(es, mvel, query): """ WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value """ # REQUEST VALUES IN FIRST DIMENSION q1 = query.copy() q1.edges = query.edges[0:1:] values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value select = listwrap(query.select) FromES = build_es_query(query) for s in select: for i, v in enumerate(values1): FromES.facets[s.name + "," + str(i)] = { "terms": { "field": query.edges[1].value, "size": coalesce(query.limit, 200000) }, "facet_filter": simplify_esfilter({"and": [ query.where, {"term": {query.edges[0].value: v}} ]}) } data = es09.util.post(es, FromES, query.limit) # UNION ALL TERMS FROM SECOND DIMENSION values2 = set() for k, f in data.facets.items(): values2.update(f.terms.term) values2 = jx.sort(values2) term2index = {v: i for i, v in enumerate(values2)} query.edges[1].domain.partitions = DictList([{"name": v, "value": v} for v in values2]) # MAKE CUBE output = {} dims = [len(values1), len(values2)] for s in select: output[s.name] = Matrix(*dims) # FILL CUBE # EXPECTING ONLY SELECT CLAUSE FACETS for facetName, facet in data.facets.items(): coord = facetName.split(",") s = [s for s in select if s.name == coord[0]][0] i1 = int(coord[1]) for term in facet.terms: i2 = term2index[term.term] output[s.name][(i1, i2)] = term[aggregates[s.aggregate]] cube = Cube(query.select, query.edges, output) cube.query = query return cube
def _get_best(self, settings):
    from pyLibrary.queries import jx
    aliases = self.get_aliases()
    indexes = jx.sort([
        a
        for a in aliases
        if (a.alias == settings.index and settings.alias == None) or
        (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
        (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
    ], "index")
    return indexes.last()
def _get_best(self, settings):
    from pyLibrary.queries import jx
    aliases = self.get_aliases()
    indexes = jx.sort([
        a
        for a in aliases
        if (a.alias == settings.index and settings.alias == None) or
        (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
        (a.index == settings.index and (settings.alias == None or a.alias == None or a.alias == settings.alias))
    ], "index")
    return indexes.last()
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    abs_column_name = None if column_name == None else concat_field(query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = Table(
                name=es_index_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        if columns:
            columns = jx.sort(columns, "names.\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.es_index + "." + c.es_column for c in columns if not c.last_updated]
                    )
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def running_instances(self):
    # FIND THE BIGGEST, MOST EXPENSIVE REQUESTS
    instances = self._get_managed_instances()
    for r in instances:
        try:
            r.markup = self.price_lookup[r.instance_type, r.placement]
        except Exception as e:
            r.markup = self.price_lookup[r.instance_type, r.placement]
            Log.error("No pricing!!!", e)

    instances = jx.sort(instances, [
        {"value": "markup.type.utility", "sort": -1},
        {"value": "markup.estimated_value", "sort": 1}
    ])
    return instances
def json_schema_to_markdown(schema):
    from pyLibrary.queries import jx

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent + "* " + _md_code(full_name)
            if v.type:
                details += " - " + _md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent + " "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#" + schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##" + _md_code(full_name) + " Property")
            if v.description:
                lines.append(v.description)
            lines.append("")
            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, " "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + v.type + ")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
def json_schema_to_markdown(schema):
    from pyLibrary.queries import jx

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = join_field(split_field(parent_name) + [k])
            details = indent + "* " + _md_code(full_name)
            if v.type:
                details += " - " + _md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent + "  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#" + schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##" + _md_code(full_name) + " Property")
            if v.description:
                lines.append(v.description)
            lines.append("")
            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + v.type + ")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
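# Hedged usage sketch (the import path and the schema fields are assumptions):
# the function expects an attribute-accessible (wrap()'d) JSON-schema-like
# structure and returns one markdown string.
from pyLibrary.dot import wrap

schema = wrap({
    "title": "Build",
    "description": "A single build record",
    "properties": {
        "id": {"type": "integer", "description": "unique build id"},
        "job": {
            "type": "object",
            "description": "details of the job",
            "properties": {
                "name": {"type": "string", "description": "job name"}
            }
        }
    }
})
markdown = json_schema_to_markdown(schema)
# markdown now holds lines such as "#Build", "##`id` (integer)" and a bullet
# list for the nested `job` properties.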
def __init__( self, alias, # NAME OF THE ALIAS type=None, # SCHEMA NAME, WILL HUNT FOR ONE IF None explore_metadata=True, # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED debug=False, timeout=None, # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests) kwargs=None ): self.debug = debug if self.debug: Log.alert("Elasticsearch debugging on {{index|quote}} is on", index= kwargs.index) if alias == None: Log.error("Alias can not be None") self.settings = kwargs self.cluster = Cluster(kwargs) if type == None: if not explore_metadata: Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.") if not self.settings.alias or self.settings.alias==self.settings.index: alias_list = self.cluster.get("/_alias") candidates = ( [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()] + [(name, Null) for name, i in alias_list.items() if self.settings.index==name] ) full_name = jx.sort(candidates, 0).last()[0] if not full_name: Log.error("No index by name of {{name}}", name=self.settings.index) mappings = self.cluster.get("/" + full_name + "/_mapping")[full_name] else: mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index] # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE) max_prop = -1 for _type, mapping in mappings.mappings.items(): if _type == "_default_": continue num_prop = len(mapping.properties.keys()) if max_prop < num_prop: max_prop = num_prop self.settings.type = _type type = _type if type == None: Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index)) self.path = "/" + alias + "/" + type
def get_markup(self, branch, revision, task_id=None, buildername=None, timestamp=None): # TRY CACHE if not branch or not revision: Log.error("expecting branch and revision") if self.settings.use_cache: if task_id: _filter = {"term": {"task.id": task_id}} else: _filter = {"term": {"ref_data_name": buildername}} query = { "query": {"filtered": { "query": {"match_all": {}}, "filter": {"and": [ _filter, {"term": {"repo.branch": branch}}, {"prefix": {"repo.revision": revision}}, {"or": [ {"range": {"etl.timestamp": {"gte": (Date.now() - HOUR).unix}}}, {"range": {"job.timing.last_modified": {"lt": (Date.now() - DAY).unix}}} ]} ]} }}, "size": 10000 } try: docs = self.cache.search(query, timeout=120).hits.hits except Exception, e: docs = None Log.warning("Bad ES call, fall back to TH", cause=e) if not docs: pass elif len(docs) == 1: if DEBUG: Log.note("Used ES cache to get TH details on {{value|quote}}", value=coalesce(task_id, buildername)) return docs[0]._source elif timestamp == None: Log.error("timestamp required to find best match") else: # MISSING docs._source.job.timing.end WHEN A PLACEHOLDER WAS ADDED # TODO: SHOULD DELETE OVERAPPING PLACEHOLDER RECORDS timestamp = Date(timestamp).unix best_index = jx.sort([(i, abs(coalesce(e, 0) - timestamp)) for i, e in enumerate(docs._source.job.timing.end)], 1)[0][0] return docs[best_index]._source
def __init__( self, alias, # NAME OF THE ALIAS type=None, # SCHEMA NAME, WILL HUNT FOR ONE IF None explore_metadata=True, # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED debug=False, timeout=None, # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests) kwargs=None ): self.debug = debug if self.debug: Log.alert("Elasticsearch debugging on {{index|quote}} is on", index= kwargs.index) if alias == None: Log.error("Alias can not be None") self.settings = kwargs self.cluster = Cluster(kwargs) if type == None: if not explore_metadata: Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.") if not self.settings.alias or self.settings.alias==self.settings.index: alias_list = self.cluster.get("/_alias") candidates = ( [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()] + [(name, Null) for name, i in alias_list.items() if self.settings.index==name] ) full_name = jx.sort(candidates, 0).last()[0] mappings = self.cluster.get("/" + full_name + "/_mapping")[full_name] else: mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index] # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE) max_prop = -1 for _type, mapping in mappings.mappings.items(): if _type == "_default_": continue num_prop = len(mapping.properties.keys()) if max_prop < num_prop: max_prop = num_prop self.settings.type = _type type = _type if type == None: Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index)) self.path = "/" + alias + "/" + type
def __init__(self, edge, query, limit):
    AggsDecoder.__init__(self, edge, query, limit)
    self.domain = edge.domain

    # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
    self.sorted = None
    edge_var = edge.value.vars()
    for s in query.sort:
        if not edge_var - s.value.vars():
            self.sorted = {1: "asc", -1: "desc"}[s.sort]
            domain = self.domain
            key = self.domain.key
            domain.partitions = parts = jx.sort(domain.partitions, {"value": key, "sort": s.sort})
            domain.map = {i: p for i, p in enumerate(parts)}
def _figure_out_start_point(self):
    # RECOVER FROM THE QUEUE
    acc = []
    while True:
        d = self.temp_queue.pop(timeout=ZERO)
        if d:
            acc.append(d)
        else:
            break
    self.temp_queue.rollback()

    if acc:
        # WAS IN THE MIDDLE OF A BATCH, FIND count
        data = acc[-1]
        today_ = data[UID_PATH].split(".")[0]
        todays_batch_count = int(data[UID_PATH].split(".")[1])
        count = todays_batch_count * BATCH_SIZE + data.etl.id + 1
        if DEBUG:
            Log.note(
                "Next uid from queue is {{uid}}.{{count}}",
                count=count % BATCH_SIZE,
                uid=today_ + "." + unicode(todays_batch_count)
            )
        self.uid = UID(count)
        return

    # FIND LAST WHOLE BATCH FROM TODAY
    today_ = unicode(today())
    todays_keys = self.bucket.keys(prefix=unicode(today_))
    if not todays_keys:
        if DEBUG:
            Log.note("Next uid is {{uid}}.{{count}}", count=0, uid=today_ + ".0")
        self.uid = UID()
        return

    todays_batch_count = jx.sort(int(k.split(".")[1]) for k in todays_keys).last() + 1
    max_key = today_ + "." + unicode(todays_batch_count)

    if DEBUG:
        Log.note("Next uid is {{uid}}", uid=max_key)
    count = todays_batch_count * BATCH_SIZE
    self.uid = UID(count)
def get_metadata(self, force=False):
    if not self.settings.explore_metadata:
        Log.error("Metadata exploration has been disabled")

    if not self._metadata or force:
        response = self.get("/_cluster/state", retry={"times": 5}, timeout=3)
        with self.metadata_locker:
            self._metadata = wrap(response.metadata)
            # REPLICATE MAPPING OVER ALL ALIASES
            indices = self._metadata.indices
            for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}):
                m.index = i
                for a in m.aliases:
                    if not indices[a]:
                        indices[a] = {"index": i}
            self.cluster_state = wrap(self.get("/"))
            self.version = self.cluster_state.version.number
        return self._metadata

    return self._metadata
def get_index(self, alias):
    """
    RETURN THE INDEX USED BY THIS alias
    """
    alias_list = self.cluster.get_aliases()
    output = jx.sort(set([
        a.index
        for a in alias_list
        if a.alias == alias or
        a.index == alias or
        (re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)
    ]))

    if len(output) > 1:
        Log.error("only one index with given alias==\"{{alias}}\" expected", alias=alias)

    if not output:
        return Null

    return output.last()
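# Hedged usage sketch (index names are hypothetical): resolve the concrete,
# timestamped index behind an alias such as "unittest", assuming the cluster
# follows the "<alias>YYYYMMDD_HHMMSS" naming convention matched above.
index_name = es.get_index("unittest")
# e.g. "unittest20160104_120000", or Null when nothing matches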
def get_metadata(self, force=False):
    if not self.settings.explore_metadata:
        Log.error("Metadata exploration has been disabled")

    if not self._metadata or force:
        response = self.get("/_cluster/state", retry={"times": 3}, timeout=30)
        with self.metadata_locker:
            self._metadata = wrap(response.metadata)
            # REPLICATE MAPPING OVER ALL ALIASES
            indices = self._metadata.indices
            for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}):
                m.index = i
                for a in m.aliases:
                    if not indices[a]:
                        indices[a] = m
            self.cluster_state = wrap(self.get("/"))
            self.version = self.cluster_state.version.number
        return self._metadata

    return self._metadata
def insert_list(self, table_name, records):
    if not records:
        return

    keys = set()
    for r in records:
        keys |= set(r.keys())
    keys = jx.sort(keys)

    try:
        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in keys]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with record: {{record}}", record=records, cause=e)
def get_schema(self, retry=True): if self.settings.explore_metadata: indices = self.cluster.get_metadata().indices if not self.settings.alias or self.settings.alias==self.settings.index: #PARTIALLY DEFINED settings candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases] # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE index = "dummy value" schema = wrap({"_routing": {}, "properties": {}}) for _, ind in jx.sort(candidates, {"value": 0, "sort": -1}): mapping = ind.mappings[self.settings.type] set_default(schema._routing, mapping._routing) schema.properties = _merge_mapping(schema.properties, mapping.properties) else: #FULLY DEFINED settings index = indices[self.settings.index] schema = index.mappings[self.settings.type] if index == None and retry: #TRY AGAIN, JUST IN CASE self.cluster.cluster_state = None return self.get_schema(retry=False) #TODO: REMOVE THIS BUG CORRECTION if not schema and self.settings.type == "test_result": schema = index.mappings["test_results"] # DONE BUG CORRECTION if not schema: Log.error( "ElasticSearch index ({{index}}) does not have type ({{type}})", index=self.settings.index, type=self.settings.type ) return schema else: mapping = self.cluster.get(self.path + "/_mapping") if not mapping[self.settings.type]: Log.error("{{index}} does not have type {{type}}", self.settings) return wrap({"mappings": mapping[self.settings.type]})
def save_money(self, remaining_budget, net_new_utility): remove_spot_requests = wrap([]) # FIRST CANCEL THE PENDING REQUESTS if remaining_budget < 0: requests = self._get_managed_spot_requests() for r in requests: if r.status.code in PENDING_STATUS_CODES | PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN: remove_spot_requests.append(r.id) net_new_utility += self.settings.utility[r.launch_specification.instance_type].utility remaining_budget += r.price instances = jx.sort(self.running_instances(), "markup.estimated_value") remove_list = wrap([]) for s in instances: if remaining_budget >= 0: break remove_list.append(s) net_new_utility += coalesce(s.markup.type.utility, 0) remaining_budget += coalesce(s.request.bid_price, s.markup.price_80, s.markup.current_price) if not remove_list: return remaining_budget, net_new_utility # SEND SHUTDOWN TO EACH INSTANCE Log.warning("Shutdown {{instances}} to save money!", instances=remove_list.id) for i in remove_list: try: self.instance_manager.teardown(i) except Exception as e: Log.warning("Teardown of {{id}} failed", id=i.id, cause=e) remove_spot_requests.extend(remove_list.spot_instance_request_id) # TERMINATE INSTANCES self.ec2_conn.terminate_instances(instance_ids=remove_list.id) # TERMINATE SPOT REQUESTS self.ec2_conn.cancel_spot_instance_requests(request_ids=remove_spot_requests) return remaining_budget, net_new_utility
def loop_all_days(destination, please_stop):
    try:
        today = Date.today()

        # WHICH DAYS DO WE NEED TO CALCULATE
        # ALL BUILD DATES WITH ETL TIMESTAMP OF A WEEK AGO
        # ALL BUILD DATES THAT HAVE NOT BEEN PROCESSED YET
        build_dates = http.post_json(config.source.url, json={
            "from": "unittest",
            "edges": [
                {
                    "name": "date",
                    "value": "build.date",
                    "allowNulls": False,
                    "domain": {
                        "type": "time",
                        "min": "today-week",
                        "max": "eod",
                        "interval": "day"
                    }
                }
            ],
            "where": {"gte": {"etl.timestamp": (today - WEEK).unix}},
            "sort": {"value": "build.date", "sort": -1},
            "limit": 14,
            "format": "list"
        })

        build_dates.data = jx.sort(build_dates.data, {"value": "date", "sort": -1})

        for d in build_dates.data:
            if please_stop:
                return
            agg(Date(d.date), destination, please_stop=please_stop)
    finally:
        please_stop.go()
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = jx.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles
                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: jx.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: jx.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        raise Except("no packing possible")
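# Hedged illustration (field name and values are hypothetical): a dense run
# collapses into a range filter while the stragglers stay behind as a terms
# filter, with the thresholds set by DENSITY and MIN_RANGE above.
esfilter = int_list_packer("build.id", [1, 2, 3, 5, 100] + list(range(200, 260)))
# roughly:
# {"or": [
#     {"terms": {"build.id": [1, 2, 3, 5, 100]}},
#     {"or": [{"range": {"build.id": {"gte": 200, "lte": 259}}}]}
# ]}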
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop): for not_summarized in todo: if please_stop: return True # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION? Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name) dups = http.post_json(settings.url, json={ "from": "coverage", "select": [ {"name": "max_id", "value": "etl.source.id", "aggregate": "max"}, {"name": "min_id", "value": "etl.source.id", "aggregate": "min"} ], "where": {"and": [ {"missing": "source.method.name"}, {"eq": { "source.file.name": not_summarized.source.file.name, "build.revision12": not_summarized.build.revision12 }}, ]}, "groupby": [ "test.url" ], "limit": 100000, "format": "list" }) dups_found = False for d in dups.data: if d.max_id != d.min_id: dups_found = True Log.note( "removing dups {{details|json}}\n{{dups|json|indent}}", details={ "id": int(d.max_id), "test": d.test.url, "source": not_summarized.source.file.name, "revision": not_summarized.build.revision12 } ) # FIND ALL INDEXES all_indexes = [ p.index for p in coverage_index.cluster.get_aliases() if p.alias == coverage_index.settings.alias ] for index_name in all_indexes: elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [ {"not": {"term": {"etl.source.id": int(d.max_id)}}}, {"term": {"test.url": d.test.url}}, {"term": {"source.file.name": not_summarized.source.file.name}}, {"term": {"build.revision12": not_summarized.build.revision12}} ]}) if dups_found: continue # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED test_count = http.post_json(settings.url, json={ "from": "coverage.source.file.covered", "where": {"and": [ {"missing": "source.method.name"}, {"eq": { "source.file.name": not_summarized.source.file.name, "build.revision12": not_summarized.build.revision12 }}, ]}, "groupby": [ "test.url", "line" ], "limit": 100000, "format": "list" }) all_tests_covering_file = UNION(test_count.data.get("test.url")) num_tests = len(all_tests_covering_file) max_siblings = num_tests - 1 Log.note( "{{filename}} rev {{revision}} is covered by {{num}} tests", filename=not_summarized.source.file.name, num=num_tests, revision=not_summarized.build.revision12 ) line_summary = list( (k, unwrap(wrap(list(v)).get("test.url"))) for k, v in jx.groupby(test_count.data, keys="line") ) # PULL THE RAW RECORD FOR MODIFICATION file_level_coverage_records = http.post_json(settings.url, json={ "from": "coverage", "where": {"and": [ {"missing": "source.method.name"}, {"in": {"test.url": all_tests_covering_file}}, {"eq": { "source.file.name": not_summarized.source.file.name, "build.revision12": not_summarized.build.revision12 }} ]}, "limit": 100000, "format": "list" }) for test_name in all_tests_covering_file: siblings = [len(test_names)-1 for g, test_names in line_summary if test_name in test_names] min_siblings = MIN(siblings) coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name) if coverage_candidates: if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates): Log.warning( "Duplicate coverage\n{{cov|json|indent}}", cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates] ) # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE for coverage_record in coverage_candidates: coverage_record.source.file.max_test_siblings = max_siblings 
coverage_record.source.file.min_line_siblings = min_siblings coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1) else: example = http.post_json(settings.url, json={ "from": "coverage", "where": {"eq": { "test.url": test_name, "source.file.name": not_summarized.source.file.name, "build.revision12": not_summarized.build.revision12 }}, "limit": 1, "format": "list" }) Log.warning( "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}", test=test_name, file=not_summarized.source.file.name, revision=not_summarized.build.revision12, example=example.data[0] ) bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None] if bad_example: Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0]) rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data] coverage_summary_index.extend(rows) coverage_index.extend(rows) all_test_summary = [] for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"): cov = UNION(records.source.file.covered) uncov = UNION(records.source.file.uncovered) coverage = { "_id": "|".join([records[0].build.revision12, g["source.file.name"]]), # SOMETHING UNIQUE, IN CASE WE RECALCULATE "source": { "file": { "name": g["source.file.name"], "is_file": True, "covered": jx.sort(cov, "line"), "uncovered": jx.sort(uncov), "total_covered": len(cov), "total_uncovered": len(uncov), "min_line_siblings": 0 # PLACEHOLDER TO INDICATE DONE } }, "build": records[0].build, "repo": records[0].repo, "run": records[0].run, "etl": {"timestamp": Date.now()} } all_test_summary.append(coverage) sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary] coverage_summary_index.extend(sum_rows) if DEBUG: coverage_index.refresh() todo = http.post_json(settings.url, json={ "from": "coverage", "where": {"and": [ {"missing": "source.method.name"}, {"missing": "source.file.min_line_siblings"}, {"eq": {"source.file.name": not_summarized.source.file.name}}, {"eq": {"build.revision12": not_summarized.build.revision12}} ]}, "format": "list", "limit": 10 }) if todo.data: Log.error("Failure to update")
def sort(self, sort):
    return ListContainer(
        "from " + self.name,
        jx.sort(self.data, sort, already_normalized=True),
        self.schema
    )
def _edges_op(self, query, frum): index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [ ] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) tables = [] base_table = split_field(frum)[0] path = join_field(split_field(frum)[1:]) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.nested_tables.items()) } columns = self._get_sql_schema(frum) tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = join_field( [base_table] + split_field(tables[0].nest)) + " " + tables[0].alias previous = tables[0] for t in tables[1::]: from_sql += "\nLEFT JOIN\n" + join_field( [base_table] + split_field(t.nest) ) + " " + t.alias + " ON " + t.alias + "." + PARENT + " = " + previous.alias + "." + GUID # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] not_ons = ["__exists__ IS NULL"] groupby = [] not_groupby = [] orderby = [] domains = [] select_clause = [ "1 __exists__" # USED TO DISTINGUISH BETWEEN NULL-BECAUSE-LEFT-JOIN OR NULL-BECAUSE-NULL-VALUE ] for edge_index, query_edge in enumerate(query.edges): edge_alias = "e" + unicode(edge_index) if query_edge.value: edge_values = [ p for c in query_edge.value.to_sql(self).sql for p in c.items() ] elif not query_edge.value and any( query_edge.domain.partitions.where): case = "CASE " for pp, p in enumerate(query_edge.domain.partitions): w = p.where.to_sql(self)[0].sql.b t = quote_value(pp) case += " WHEN " + w + " THEN " + t case += " ELSE NULL END " edge_values = [("n", case)] elif query_edge.range: edge_values = query_edge.range.min.to_sql(self)[0].sql.items( ) + query_edge.range.max.to_sql(self)[0].sql.items()
def _es_terms2(es, mvel, query): """ WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value """ # REQUEST VALUES IN FIRST DIMENSION q1 = query.copy() q1.edges = query.edges[0:1:] values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value select = listwrap(query.select) FromES = build_es_query(query) for s in select: for i, v in enumerate(values1): FromES.facets[s.name + "," + str(i)] = { "terms": { "field": query.edges[1].value, "size": coalesce(query.limit, 200000) }, "facet_filter": simplify_esfilter({ "and": [query.where, { "term": { query.edges[0].value: v } }] }) } data = es09.util.post(es, FromES, query.limit) # UNION ALL TERMS FROM SECOND DIMENSION values2 = set() for k, f in data.facets.items(): values2.update(f.terms.term) values2 = jx.sort(values2) term2index = {v: i for i, v in enumerate(values2)} query.edges[1].domain.partitions = FlatList([{ "name": v, "value": v } for v in values2]) # MAKE CUBE output = {} dims = [len(values1), len(values2)] for s in select: output[s.name] = Matrix(*dims) # FILL CUBE # EXPECTING ONLY SELECT CLAUSE FACETS for facetName, facet in data.facets.items(): coord = facetName.split(",") s = [s for s in select if s.name == coord[0]][0] i1 = int(coord[1]) for term in facet.terms: i2 = term2index[term.term] output[s.name][(i1, i2)] = term[aggregates[s.aggregate]] cube = Cube(query.select, query.edges, output) cube.query = query return cube
except Exception, _: pass if isinstance(result.data, list): try: result.data = jx.sort(result.data, sort_order.name) except Exception, _: pass elif result.meta.format == "cube" and len( result.edges ) == 1 and result.edges[0].name == "rownum" and not query.sort: header = list(result.data.keys()) result.data = cube2list(result.data) result.data = jx.sort(result.data, header) result.data = list2cube(result.data, header) expect.data = cube2list(expect.data) expect.data = jx.sort(expect.data, header) expect.data = list2cube(expect.data, header) # CONFIRM MATCH assertAlmostEqual(result, expect, places=6) def cube2list(c): rows = zip(*[[(k, v) for v in a] for k, a in c.items()]) rows = [dict(r) for r in rows] return rows
def done_count(self):
    self.edge.domain = self.domain = SimpleSetDomain(
        partitions=jx.sort(set(self.parts))
    )
    self.parts = None
    self.computed_domain = True
def _update_cardinality(self, c): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if c.type in ["object", "nested"]: Log.error("not supported") try: if c.table == "meta.columns": with self.meta.columns.locker: partitions = jx.sort([ g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "last_updated": Date.now() }, "where": { "eq": { "table": c.table, "es_column": c.es_column } } }) return if c.table == "meta.tables": with self.meta.columns.locker: partitions = jx.sort([ g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "last_updated": Date.now() }, "where": { "eq": { "table": c.table, "name": c.name } } }) return es_index = c.table.split(".")[0] result = self.default_es.post("/" + es_index + "/_search", data={ "aggs": { c.name: _counting_query(c) }, "size": 0 }) r = result.aggregations.values()[0] count = result.hits.total cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None) if cardinality == None: Log.error("logic error") query = Dict(size=0) if cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) return elif c.nested_path: query.aggs[literal_field(c.name)] = { "nested": { "path": listwrap(c.nested_path)[0] }, "aggs": { "_nested": { "terms": { "field": c.es_column, "size": 0 } } } } else: query.aggs[literal_field(c.name)] = { "terms": { "field": c.es_column, "size": 0 } } result = self.default_es.post("/" + es_index + "/_search", data=query) aggs = result.aggregations.values()[0] if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) Log.note("{{field}} has {{parts}}", field=c.name, parts=parts) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": parts, "last_updated": Date.now() }, "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) except Exception, e: if "IndexMissingException" in e and c.table.startswith( TEST_TABLE_PREFIX): with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": 0, "cardinality": 0, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "partitions", ], "where": { "eq": { "table": c.table, "es_column": c.es_column } } }) Log.warning( "Could not get 
{{col.table}}.{{col.es_column}} info", col=c, cause=e)
def _update_cardinality(self, c): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if c.type in STRUCT: Log.error("not supported") try: if c.table == "meta.columns": with self.meta.columns.locker: partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"table": c.table, "es_column": c.es_column}} }) return if c.table == "meta.tables": with self.meta.columns.locker: partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"table": c.table, "name": c.name}} }) return es_index = c.table.split(".")[0] result = self.default_es.post("/" + es_index + "/_search", data={ "aggs": {c.name: _counting_query(c)}, "size": 0 }) r = result.aggregations.values()[0] count = result.hits.total cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None) if cardinality == None: Log.error("logic error") query = Dict(size=0) if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return elif c.nested_path: query.aggs[literal_field(c.name)] = { "nested": {"path": listwrap(c.nested_path)[0]}, "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}} } else: query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}} result = self.default_es.post("/" + es_index + "/_search", data=query) aggs = result.aggregations.values()[0] if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) Log.note("{{field}} has {{parts}}", field=c.name, parts=parts) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": parts, "last_updated": Date.now() }, "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) except Exception, e: if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX): with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": 0, "cardinality": 0, "last_updated": Date.now() }, "clear":[ "partitions" ], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "partitions", ], "where": {"eq": {"table": c.table, "es_column": c.es_column}} }) Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
def loop(source, coverage_summary_index, settings, please_stop): try: cluster = elasticsearch.Cluster(source) aliases = cluster.get_aliases() candidates = [] for pairs in aliases: if pairs.alias == source.index: candidates.append(pairs.index) candidates = jx.sort(candidates, {".": "desc"}) for index_name in candidates: coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source) push_date_filter = unicode2Date(coverage_index.settings.index[-15::], elasticsearch.INDEX_DATE_FORMAT) while not please_stop: # IDENTIFY NEW WORK Log.note("Working on index {{index}}", index=index_name) coverage_index.refresh() todo = http.post_json(settings.url, json={ "from": "coverage", "groupby": ["source.file.name", "build.revision12"], "where": {"and": [ {"missing": "source.method.name"}, {"missing": "source.file.min_line_siblings"}, {"gte": {"repo.push.date": push_date_filter}} ]}, "format": "list", "limit": coalesce(settings.batch_size, 100) }) if not todo.data: break queue = Queue("pending source files to review") queue.extend(todo.data[0:coalesce(settings.batch_size, 100):]) threads = [ Thread.run( "processor" + unicode(i), process_batch, queue, coverage_index, coverage_summary_index, settings, please_stop=please_stop ) for i in range(NUM_THREAD) ] # ADD STOP MESSAGE queue.add(Thread.STOP) # WAIT FOR THEM TO COMPLETE for t in threads: t.join() please_stop.go() return except Exception, e: Log.warning("Problem processing", cause=e)
if isinstance(expect.data, list): try: expect.data = jx.sort(expect.data, sort_order.name) except Exception, _: pass if isinstance(result.data, list): try: result.data = jx.sort(result.data, sort_order.name) except Exception, _: pass elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort: result_data, result_header = cube2list(result.data) result_data = unwrap(jx.sort(result_data, result_header)) result.data = list2cube(result_data, result_header) expect_data, expect_header = cube2list(expect.data) expect_data = jx.sort(expect_data, expect_header) expect.data = list2cube(expect_data, expect_header) # CONFIRM MATCH assertAlmostEqual(result, expect, places=places) def cube2list(cube): """ RETURNS header SO THAT THE ORIGINAL CUBE CAN BE RECREATED :param cube: A dict WITH VALUES BEING A MULTIDIMENSIONAL ARRAY OF UNIFORM VALUES :return: (rows, header) TUPLE
def query(self, query):
    """
    :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return:
    """
    if not startswith_field(query['from'], self.name):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    query = QueryOp.wrap(query, self.columns)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING TYPE-SPECIFIC QUERY NORMALIZATION
    # vars_ = query.vars(exclude_select=True)
    # type_map = {
    #     v: c.es_column
    #     for v in vars_
    #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
    #     for c in self.columns[v]
    #     if c.type != "nested"
    # }
    #
    # sql_query = query.map(type_map)

    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    if query.groupby:
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        op = self._set_op(query, frum)
        return op

    if query.sort:
        command += "\nORDER BY " + ",\n".join(
            "(" + sql[t] + ") IS NULL" + (" DESC" if s.sort == -1 else "") +
            ",\n" + sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(self)[0].sql) for s in query.sort]
            for t in "bns"
            if sql[t]
        )

    result = self.db.query(command)

    column_names = query.edges.name + query.groupby.name + listwrap(query.select).name
    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        if len(query.edges) == 0 and len(query.groupby) == 0:
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            return Data(data=unwrap(data), meta={"format": "cube"})

        if not result.data:
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

            zeros = [
                0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                if not columns:
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}
                domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

        zeros = [
            0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
            for si, s in enumerate(listwrap(query.select))
        ]
        data_cubes = {
            s.name: Matrix(dims=dims, zeros=zeros[si])
            for si, s in enumerate(listwrap(query.select))
        }
        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)
            for i, s in enumerate(index_to_columns.values()):
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        data[c.push_name] = c.pull(result.data[0])
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    data[s.push_child] = s.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for rownum in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)
                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
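# The cube branch above relies on index_to_coordinate(dims): because the SQL result
# is ordered to match the edge domains, row N of the result maps directly to a cube
# coordinate.  Below is a standalone sketch of that mapping (an assumption about its
# behaviour -- row-major order, last dimension varying fastest -- not the project's
# implementation):

def _index_to_coordinate(dims):
    # RETURN A FUNCTION MAPPING A FLAT ROW NUMBER TO AN (i0, i1, ...) COORDINATE
    def to_coord(rownum):
        coord = []
        for d in reversed(dims):
            coord.append(rownum % d)
            rownum //= d
        return tuple(reversed(coord))
    return to_coord


# FOR A 2x3 CUBE, ROW NUMBERS 0..5 WALK THE COORDINATES IN ROW-MAJOR ORDER
_r2c = _index_to_coordinate([2, 3])
assert [_r2c(n) for n in range(6)] == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]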
def done_count(self):
    self.edge.domain = self.domain = SimpleSetDomain(partitions=jx.sort(set(self.parts)))
    self.parts = None
    self.computed_domain = True
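# done_count() freezes the accumulated edge parts into a domain: deduplicate, sort,
# and use the result as the partition list.  A plain-Python illustration of that step
# (SimpleSetDomain itself is not reproduced here):
_parts = ["b", "a", "b", "c", "a"]
_partitions = sorted(set(_parts))  # STABLE, DUPLICATE-FREE PARTITION ORDER
assert _partitions == ["a", "b", "c"]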
def unquote(value):
    # COERCE CSV TEXT TO int, THEN float, OTHERWISE RETURN THE ORIGINAL STRING
    if not value:
        return Null
    try:
        return int(value)
    except Exception:
        pass
    try:
        return float(value)
    except Exception:
        pass
    return value


tab_data = File("resources/EC2.csv").read()
lines = map(strings.trim, tab_data.split("\n"))
header = lines[0].split(",")
rows = [r.split(",") for r in lines[1:] if r]
data = wrap([{h: unquote(r[c]) for c, h in enumerate(header)} for r in rows])

for d in data:
    d.utility = Math.min(d.memory, d.storage / 50, 60)
    d.drives["$ref"] = "#" + unicode(d.num_drives) + "_ephemeral_drives"
    d.discount = 0

Log.note("{{data|json(False)}}", data=[d for d in data if d.utility])
Log.note(
    "{{data|json}}",
    data={
        d.instance_type: {"num": d.num_drives, "size": d.storage}
        for d in jx.sort(data, "instance_type")
    }
)
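# Worked example of the coercion helper and the utility rule above, using made-up
# values (illustrative only, not real EC2 specs): text becomes int or float where it
# can, and utility is capped by the scarcest resource.
assert unquote("800") == 800 and unquote("30.5") == 30.5 and unquote("m3.xlarge") == "m3.xlarge"
# utility = min(memory, storage/50, 60):  min(30.5, 800/50, 60) == 16.0
assert min(30.5, 800 / 50.0, 60) == 16.0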
def get_markup(self, branch, revision, task_id=None, buildername=None, timestamp=None):
    # TRY CACHE
    if not branch or not revision:
        Log.error("expecting branch and revision")

    if self.settings.use_cache:
        if task_id:
            _filter = {"term": {"task.id": task_id}}
        else:
            _filter = {"term": {"ref_data_name": buildername}}

        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    _filter,
                    {"term": {"repo.branch": branch}},
                    {"prefix": {"repo.revision": revision}},
                    {"or": [
                        {"range": {"etl.timestamp": {"gte": (Date.now() - HOUR).unix}}},
                        {"range": {"job.timing.last_modified": {"lt": (Date.now() - DAY).unix}}}
                    ]}
                ]}
            }},
            "size": 10000
        }

        try:
            docs = self.cache.search(query, timeout=120).hits.hits
        except Exception, e:
            docs = None
            Log.warning("Bad ES call, fall back to TH", cause=e)

        if not docs:
            pass
        elif len(docs) == 1:
            if DEBUG:
                Log.note("Used ES cache to get TH details on {{value|quote}}", value=coalesce(task_id, buildername))
            return docs[0]._source
        elif timestamp == None:
            Log.error("timestamp required to find best match")
        else:
            # MISSING docs._source.job.timing.end WHEN A PLACEHOLDER WAS ADDED
            # TODO: SHOULD DELETE OVERLAPPING PLACEHOLDER RECORDS
            timestamp = Date(timestamp).unix
            best_index = jx.sort([(i, abs(coalesce(e, 0) - timestamp)) for i, e in enumerate(docs._source.job.timing.end)], 1)[0][0]
            return docs[best_index]._source
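# The fallback branch above picks the cached record whose job end time is closest to
# the requested timestamp, treating a missing end time as 0 (as coalesce(e, 0) does).
# The same selection in plain Python; end times and target are made-up values, while
# the real code pulls them from the ES hits:
_end_times = [1500000000, None, 1500003600]  # None: PLACEHOLDER RECORD WITH NO END TIME
_target = 1500003000
_best_index = min(
    range(len(_end_times)),
    key=lambda i: abs((_end_times[i] or 0) - _target)
)
assert _best_index == 2  # 1500003600 IS THE NEAREST END TIME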