def get_columns(data, leaves=False):
    # TODO: Split this into two functions
    if not leaves:
        return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
    else:
        return wrap([
            {"name": leaf}
            for leaf in set(leaf for row in data for leaf, _ in row.leaves())
        ])
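
# A minimal sketch of the non-leaf case, assuming data is a list of plain
# dicts (the leaves=True branch expects rows exposing a .leaves() method, as
# pyLibrary wrapped Data objects do):
#
#     get_columns([{"a": 1, "b": 2}, {"b": 3}])
#     # -> [{"name": "a"}, {"name": "b"}]   (order not guaranteed; names come from a set)
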
def vars(self, exclude_where=False, exclude_select=False):
    """
    :return: variables in query
    """
    def edges_get_all_vars(e):
        output = set()
        if isinstance(e.value, basestring):
            output.add(e.value)
        if e.domain.key:
            output.add(e.domain.key)
        if e.domain.where:
            output |= jx_expression(e.domain.where).vars()
        if e.range:
            output |= jx_expression(e.range.min).vars()
            output |= jx_expression(e.range.max).vars()
        if e.domain.partitions:
            for p in e.domain.partitions:
                if p.where:
                    output |= p.where.vars()
        return output

    output = set()
    try:
        output |= self.frum.vars()
    except Exception:
        pass

    if not exclude_select:
        for s in listwrap(self.select):
            output |= s.value.vars()
    for s in listwrap(self.edges):
        output |= edges_get_all_vars(s)
    for s in listwrap(self.groupby):
        output |= edges_get_all_vars(s)
    if not exclude_where:
        output |= self.where.vars()
    for s in listwrap(self.sort):
        output |= s.value.vars()

    try:
        output |= UNION(e.vars() for e in self.window)
    except Exception:
        pass

    return output
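
# Illustrative sketch (not from the original source): for a normalized query
# roughly like {"from": c, "select": {"value": "a"}, "where": {"eq": {"b": 1}}},
# query.vars() would return a set containing "a" and "b", while
# query.vars(exclude_where=True) would omit "b" if it only appears in the
# where clause. The exact contents depend on how jx_expression parses each
# clause, so treat this only as a rough guide.
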
def extract_rows(es, es_query, source, select, query):
    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    for i, s in enumerate(select.copy()):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            try:
                column_names = set(
                    c.name
                    for c in query.frum.get_columns()
                    if (c.type not in ["object"] or c.useSource) and not c.depth
                )
            except Exception as e:
                Log.warning("can not get columns", e)
                column_names = UNION(*[[k for k, v in row.items()] for row in T.select(source)])
            column_names -= set(select.name)

            select = select[:i] + [{"name": n, "value": n} for n in column_names] + select[i + 1:]
            break
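
# Assumed behaviour of the "*" expansion above (illustrative only): with
# select = [{"name": "*", "value": "*"}, {"name": "x", "value": "x"}] and
# source columns {"x", "y", "z"}, the "*" entry is replaced in place by one
# clause per remaining column, e.g.
# [{"name": "y", "value": "y"}, {"name": "z", "value": "z"}, {"name": "x", "value": "x"}].
# Columns already named in select are removed first, and ordering is not
# guaranteed because column_names is a set.
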
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}\n{{dups|json|indent}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:
                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
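
# Worked example for the source.file.score formula above, assuming true
# division (under Python 2 the enclosing module would need
# "from __future__ import division" for this to hold): with max_siblings = 9
# and min_siblings = 2, score = (9 - 2) / (9 + 2 + 1) = 7 / 12 ~= 0.58. A test
# whose every covered line is also covered by all other tests has
# min_siblings == max_siblings and scores 0, while a test that uniquely covers
# some line (min_siblings == 0) scores max_siblings / (max_siblings + 1).
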
def list_aggs(frum, query):
    frum = wrap(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        else:
            pass

    s_accessors = [(ss.name, compile_expression(ss.value.to_python())) for ss in select]

    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s)
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(wrap(query.edges).name) - UNION(e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)
            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)
            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from pyLibrary.queries.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
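
# Hypothetical usage sketch for list_aggs() (names and query shape assumed,
# not taken from the original source): given
#     frum = [{"a": "x", "v": 1}, {"a": "y", "v": 2}, {"a": "x", "v": 3}]
# and a normalized query with edges=["a"] and
# select={"name": "v", "value": "v", "aggregate": "sum"}, the returned Cube
# would hold 4 in the "x" partition and 2 in the "y" partition of the "v"
# matrix. The query object must already be normalized (e.g. by the surrounding
# jx machinery), so this is illustrative rather than a guaranteed API.
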
def get_columns(data):
    return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])