Example #1
File: jx.py  Project: davehunt/ActiveData
def get_columns(data, leaves=False):
    # TODO: Split this into two functions
    if not leaves:
        # TOP-LEVEL KEYS ONLY: UNION THE KEY SETS OF ALL ROWS
        return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
    else:
        # FULL LEAF PATHS: ONE COLUMN PER DISTINCT DOTTED PATH SEEN IN ANY ROW
        return wrap([{
            "name": leaf
        } for leaf in set(leaf for row in data for leaf, _ in row.leaves())])
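A minimal stand-in sketch of the non-leaf branch, assuming UNION folds set-union over its argument and wrap merely adds attribute access (the helper and sample data below are hypothetical, not part of the project):

def union_sets(sets):
    # FOLD SET-UNION, AS UNION APPEARS TO DO ABOVE
    out = set()
    for s in sets:
        out |= s
    return out

rows = [{"a": 1, "b": 2}, {"b": 3, "c": 4}]
columns = [{"name": n} for n in union_sets(set(d.keys()) for d in rows)]
# columns NAMES a, b AND c EXACTLY ONCE EACH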
Example #2
    def vars(self, exclude_where=False, exclude_select=False):
        """
        :return: set of variable names used anywhere in this query
        """
        def edges_get_all_vars(e):
            output = set()
            if isinstance(e.value, basestring):
                output.add(e.value)
            if e.domain.key:
                output.add(e.domain.key)
            if e.domain.where:
                output |= jx_expression(e.domain.where).vars()
            if e.range:
                output |= jx_expression(e.range.min).vars()
                output |= jx_expression(e.range.max).vars()
            if e.domain.partitions:
                for p in e.domain.partitions:
                    if p.where:
                        output |= p.where.vars()
            return output

        output = set()
        try:
            output |= self.frum.vars()
        except Exception:
            # frum MAY BE A PLAIN TABLE NAME WITH NO vars() TO CONTRIBUTE
            pass

        if not exclude_select:
            for s in listwrap(self.select):
                output |= s.value.vars()
        for s in listwrap(self.edges):
            output |= edges_get_all_vars(s)
        for s in listwrap(self.groupby):
            output |= edges_get_all_vars(s)
        if not exclude_where:
            output |= self.where.vars()
        for s in listwrap(self.sort):
            output |= s.value.vars()

        try:
            output |= UNION(e.vars() for e in self.window)
        except Exception:
            # window CLAUSE IS OPTIONAL; IGNORE WHEN ABSENT
            pass

        return output
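The method is one fold: every clause contributes a set of variable names, merged with |=. A self-contained miniature of the same pattern on plain dicts (all names below are hypothetical):

def query_vars(query):
    # UNION THE VARIABLE SETS CONTRIBUTED BY EACH CLAUSE
    output = set()
    for clause in ("select", "where", "sort"):
        for item in query.get(clause, []):
            output |= set(item.get("vars", []))
    return output

q = {"select": [{"vars": ["a.b"]}], "where": [{"vars": ["a.b", "c"]}]}
assert query_vars(q) == {"a.b", "c"}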
Example #3
def extract_rows(es, es_query, source, select, query):
    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    for i, s in enumerate(select.copy()):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            try:
                column_names = set(c.name for c in query.frum.get_columns()
                                   if (c.type not in ["object"] or c.useSource)
                                   and not c.depth)
            except Exception as e:
                Log.warning("can not get columns", e)
                column_names = UNION(*[[k for k, v in row.items()]
                                       for row in T.select(source)])
            column_names -= set(select.name)
            select = select[:i] + [{
                "name": n,
                "value": n
            } for n in column_names] + select[i + 1:]
            break
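A self-contained sketch of that "*" expansion without the mo-dots wrappers (function and sample data are hypothetical): the "*" clause is replaced by one {"name": n, "value": n} clause per column not already selected.

def expand_star(select, all_columns):
    for i, s in enumerate(select):
        if s["value"] == "*":
            extra = set(all_columns) - set(x["name"] for x in select)
            return select[:i] + [{"name": n, "value": n} for n in sorted(extra)] + select[i + 1:]
    return select

sel = [{"name": "a", "value": "a"}, {"name": "*", "value": "*"}]
print(expand_star(sel, ["a", "b", "c"]))  # "*" BECOMES b AND c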
Example #4
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
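                # KEEP ONLY THE RECORD WITH THE HIGHEST etl.source.id FOR THIS test/file/revision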
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
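            # DUPS WERE JUST DELETED; LEAVE SUMMARIZING THIS FILE/REVISION FOR A LATER PASS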
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
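        # MAP EACH COVERED LINE TO THE TEST URLS THAT COVER IT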
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:

                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
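                    # SCORE IN [0, 1): NEAR 1 WHEN THIS TEST COVERS LINES FEW OTHER TESTS TOUCH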
                    coverage_record.source.file.score = float(max_siblings - min_siblings) / (max_siblings + min_siblings + 1)  # float() GUARDS AGAINST PYTHON 2 INTEGER DIVISION
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

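        # WRITE THE UPDATED FILE-LEVEL RECORDS BACK TO BOTH INDEXES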
        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

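        # ROLL UP ONE SUMMARY RECORD PER SOURCE FILE, MERGING COVERED/UNCOVERED LINES ACROSS TESTS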
        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
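A toy walk-through of the sibling bookkeeping above, with made-up numbers:

num_tests = 4                          # TESTS COVERING THE FILE
max_siblings = num_tests - 1           # 3
min_siblings = 1                       # RAREST LINE THIS TEST COVERS IS SHARED WITH 1 OTHER TEST
score = float(max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
print(score)                           # 0.4: HIGHER MEANS THE TEST COVERS RARER LINES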
Example #5
def list_aggs(frum, query):
    frum = wrap(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        # NON-DefaultDomain DOMAINS ARE ASSUMED TO HAVE THEIR PARTITIONS ALREADY; NOTHING TO DO

    s_accessors = [(ss.name, compile_expression(ss.value.to_python())) for ss in select]

    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=lambda s=s: windows.name2accumulator.get(s.aggregate)(**s)  # BIND s NOW; A BARE LAMBDA WOULD SEE ONLY THE LAST s IF CALLED LATE
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None]*len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(wrap(query.edges).name) - UNION(e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from pyLibrary.queries.containers.cube import Cube  # DELAYED IMPORT, PRESUMABLY TO DODGE A CIRCULAR DEPENDENCY

    output = Cube(select, query.edges, result)
    return output
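The s=s default argument above matters because closures created in a comprehension share the loop variable; if zeros() runs after the comprehension finishes, every lambda would otherwise see the last select clause. A standalone demonstration:

late = [lambda: i for i in range(3)]
bound = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2]
print([f() for f in bound])  # [0, 1, 2]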
Example #6
def get_columns(data):
    return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])