Example #1
    def filter(self, where):
        if len(self.edges) == 1 and self.edges[0].domain.type == "index":
            # USE THE STANDARD LIST FILTER
            from pyLibrary.queries import jx
            return jx.filter(self.data.values()[0].cube, where)
        else:
            # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
            Log.unexpected("Incomplete")
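The method above delegates one-dimensional cubes to the standard list filter. A minimal sketch of that standalone form, assuming pyLibrary's jx module; the rows and the where-expression below are hypothetical, but the call shape matches the other examples on this page:

from pyLibrary.queries import jx

# Hypothetical rows; the where-expression uses the same {"gt": {...}} form
# seen in the examples below.
data = [{"name": "bb", "count": 2}, {"name": "tc", "count": 0}]
matched = jx.filter(data, {"gt": {"count": 1}})  # keeps only the "bb" row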
Example #2
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}',
                        '{"code": "bb"}').replace('{"id": "tc"}',
                                                  '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = convert.json2value(suite_json)
                suite = convert.value2json(suite.name)
                line = line.replace(suite_json, suite)

    if rownum == 0:
        value = convert.json2value(line)
        if len(line) > 100000:
            value.result.subtests = [
                s for s in value.result.subtests if s.ok is False
            ]
            value.result.missing_subtests = True

        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(
                int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter(
                    [value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > 100000:
        value = convert.json2value(line)
        value.result.subtests = [
            s for s in value.result.subtests if s.ok is False
        ]
        value.result.missing_subtests = True
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find("\"resource_usage\":") != -1:
        value = convert.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
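fix() returns a (row, done) pair; the second value is True only when the sampling filter matched on row 0. A hedged driver sketch, where "lines", "source", and the filter arguments are all hypothetical:

rows = []
for rownum, line in enumerate(lines):  # "lines": an iterable of JSON strings
    row, done = fix(rownum, line, source, sample_only_filter, sample_size)
    rows.append(row)
    if done:  # the sample filter matched on the first row; stop early
        break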
Example #3
    def update(self, command):
        try:
            command = wrap(command)
            eq = command.where.eq
            if eq.es_index:
                columns = self.find(eq.es_index, eq.name)
                columns = [c for c in columns if all(get_attr(c, k) == v for k, v in eq.items())]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            for col in columns:
                for k in command["clear"]:
                    col[k] = None

                for k, v in command.set.items():
                    col[k] = v
        except Exception as e:
            Log.error("sould not happen", cause=e)
Example #4
def agg(today, destination, debug_filter=None, please_stop=None):
    """
    :param today:  The day we are performing the calculation for
    :param destination: The ES index where we put the results
    :param debug_filter: Some extra limitation to go faster, and focus, for testing
    :param please_stop: Signal for stopping early
    :return: nothing
    """

    # GET LIST OF ALL TESTS, BY PLATFORM, TYPE, SUITE
    for suite in SUITES:
        domain = {"and": [
            {"prefix": {"run.suite": suite}},
            {"gt": {"build.date": (today - 3 * DAY).unix}},
            {"lt": {"build.date": (today + 4 * DAY).unix}},
            {"exists": "build.platform"},
            {"not": {"in": {"build.platform": EXCLUDE_PLATFORMS}}},
            {"not": {"in": {"build.branch": EXCLUDE_BRANCHES}}}
        ]}
        if debug_filter:
            domain['and'].append(debug_filter)

        _ = convert.value2json("\"\"")

        # WE CAN NOT PULL ALL TESTS, THERE ARE TOO MANY, SO DO ONE SUITE AT A TIME
        Log.note("Get summary of failures in {{suite}} for date {{date}}", suite=suite, date=today)
        suite_summary = http.post_json(config.source.url, json={
            "from": "unittest",
            "groupby": [
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                domain,
                {"eq": {"result.ok": False}}
            ]},
            "format": "list",
            "limit": 100000
        })

        often_fail = jx.filter(suite_summary.data, {"gt": {"count": 1}})

        for g, tests in jx.groupby(often_fail, size=100):
            tests = wrap(tests)
            if please_stop:
                return

            Log.note("Collect stats on {{num}} tests", num=len(tests))
            tests_summary = http.post_json(config.source.url, json={
                "from": "unittest",
                "groupby": [
                    "run.suite",
                    {"name": "test", "value": "result.test"},
                    "build.platform",
                    "build.product",
                    "build.type",
                    "run.type"
                ],
                "select": [
                    {
                        "name": "date_fails",
                        "value": {
                            "mult": [
                                {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                                {"when": "result.ok", "then": 0, "else": 1}
                            ]
                        },
                        "aggregate": "stats"
                    },
                    {
                        "name": "date",
                        "value": {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                        "aggregate": "stats"
                    },
                    {
                        "name": "fails",
                        "value": {"when": "result.ok", "then": 0, "else": 1},
                        "aggregate": "stats"
                    }
                ],
                "where": {"and": [
                    domain,
                    {"in": {"result.test": tests}}
                ]},
                "format": "list",
                "limit": 100000
            })

            # FOR EACH TEST, CALCULATE THE "RECENTLY BAD" STATISTIC (linear regression slope)
            # THIS IS ONLY A ROUGH CALC FOR TESTING THE UI
            for t in tests_summary.data:
                try:
                    t._id = "-".join([
                        coalesce(t.build.product, ""),
                        t.build.platform,
                        coalesce(t.build.type, ""),
                        coalesce(t.run.type, ""),
                        t.run.suite,
                        t.test,
                        unicode(today.unix)
                    ])
                except Exception as e:
                    Log.error("text join problem", cause=e)
                t.timestamp = today
                t.average = t.fails.avg
                if t.date.var == 0:
                    t.slope = 0
                else:
                    t.slope = (t.date_fails.avg - t.date.avg * t.fails.avg) / t.date.var
                t.etl.timestamp = Date.now()

            # PUSH STATS TO ES
            docs = [{"id": t._id, "value": t} for t in tests_summary.data if t.fails.sum > 0]
            Log.note("Adding {{num}} test summaries", num=len(docs))
            destination.extend(docs)
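The slope computed above is the ordinary least-squares slope of fails against date: (E[date*fails] - E[date]*E[fails]) / Var(date), taken from the aggregated "stats" values. A plain-Python check with made-up numbers:

dates = [0.5, 1.5, 2.5, 3.5]   # days relative to "today" (hypothetical)
fails = [0, 0, 1, 1]           # 1 where result.ok is False
n = float(len(dates))          # float() keeps Python 2 division honest
avg_d = sum(dates) / n
avg_f = sum(fails) / n
avg_df = sum(d * f for d, f in zip(dates, fails)) / n
var_d = sum((d - avg_d) ** 2 for d in dates) / n
slope = (avg_df - avg_d * avg_f) / var_d  # 0.4: failures are trending up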
Example #5
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names)-1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:

                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
Example #6
    def _get_spot_prices_from_aws(self):
        with Timer("Read pricing file"):
            try:
                content = File(self.settings.price_file).read()
                cache = convert.json2value(content, flexible=False, leaves=False)
            except Exception as e:
                cache = FlatList()

        most_recents = jx.run({
            "from": cache,
            "edges": ["instance_type", "availability_zone"],
            "select": {"value": "timestamp", "aggregate": "max"}
        })

        zones = self._get_valid_availability_zones()
        prices = set(cache)
        with Timer("Get pricing from AWS"):
            for instance_type in self.settings.utility.keys():
                for zone in zones:
                    if cache:
                        most_recent = most_recents[{
                            "instance_type": instance_type,
                            "availability_zone": zone
                        }].timestamp
                        start_at = MAX([Date(most_recent), Date.today() - WEEK])
                    else:
                        start_at = Date.today() - WEEK

                    if DEBUG_PRICING:
                        Log.note("get pricing for {{instance_type}} starting at {{start_at}}",
                            instance_type=instance_type,
                            start_at=start_at
                        )

                    next_token = None
                    while True:
                        resultset = self.ec2_conn.get_spot_price_history(
                            product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                            instance_type=instance_type,
                            availability_zone=zone,
                            start_time=start_at.format(ISO8601),
                            next_token=next_token
                        )
                        next_token = resultset.next_token

                        for p in resultset:
                            prices.add(wrap({
                                "availability_zone": p.availability_zone,
                                "instance_type": p.instance_type,
                                "price": p.price,
                                "product_description": p.product_description,
                                "region": p.region.name,
                                "timestamp": Date(p.timestamp).unix
                            }))

                        if not next_token:
                            break

        with Timer("Save prices to file"):
            new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})
            def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
                prefix = "[\n"
                for p in new_prices:
                    yield prefix
                    yield convert.value2json(p)
                    prefix = ",\n"
                yield "]"
            File(self.settings.price_file).write(stream())

        return prices
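stream() above emits one price per iteration instead of building the whole JSON document in memory, so writing stays flat in memory however many prices there are. The same pattern with the standard json module, with a hypothetical file name and data:

import json

def stream(items):  # yields a valid JSON array, one element at a time
    prefix = "[\n"
    for item in items:
        yield prefix
        yield json.dumps(item)
        prefix = ",\n"
    yield "]"

with open("prices.json", "w") as f:
    f.writelines(stream([{"price": 0.013}, {"price": 0.017}]))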
Example #7
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = convert.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if rownum == 0:
        value = convert.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0/coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        value = convert.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = convert.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False