Example #1
    def get_pushes(
        self,
        apply_filters: bool = False
    ) -> Tuple[List[Dict[str, List[str]]], int]:
        pushes = []
        for revs, test_datas in test_scheduling.get_test_scheduling_history(
                self.granularity):
            failures = []
            passes = []

            for test_data in test_datas:
                name = test_data["name"]

                if self.granularity == "label" and not name.startswith(
                        "test-"):
                    continue

                if (test_data["is_likely_regression"]
                        or test_data["is_possible_regression"]):
                    failures.append(name)
                else:
                    passes.append(name)

            if apply_filters:
                if self.failures_skip and len(failures) > self.failures_skip:
                    continue

            pushes.append({
                "revs": revs,
                "failures": failures,
                "passes": passes,
            })

        return pushes, math.floor(0.9 * len(pushes))
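The second return value is a chronological cut-off: callers split `pushes` into the oldest 90% for training and the newest 10% for validation, without shuffling. A minimal sketch of such a caller, assuming a `model` object exposing the method above (hypothetical name):

    # Hypothetical caller: split pushes chronologically at the returned cut-off.
    pushes, train_push_len = model.get_pushes(apply_filters=True)

    train_pushes = pushes[:train_push_len]       # oldest 90% of pushes
    validation_pushes = pushes[train_push_len:]  # newest 10% of pushes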
Example #2
    def items_gen(self, classes):
        commit_map = {}

        for commit in repository.get_commits():
            commit_map[commit["node"]] = commit

        assert len(commit_map) > 0

        for test_data in test_scheduling.get_test_scheduling_history(
                self.granularity):
            revs = test_data["revs"]
            name = test_data["name"]

            if (revs[0], name) not in classes:
                continue

            commits = tuple(commit_map[revision]
                            for revision in test_data["revs"]
                            if revision in commit_map)
            if len(commits) == 0:
                continue

            commit_data = commit_features.merge_commits(commits)
            commit_data["test_job"] = test_data
            yield commit_data, classes[(revs[0], name)]
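Each yielded pair couples the merged commit features (plus the raw `test_job` record) with the label previously computed for `(first_rev, name)`. A hypothetical consumer that materializes the stream:

    # Hypothetical consumer of the generator above.
    X, y = [], []
    for commit_data, label in model.items_gen(classes):
        X.append(commit_data)  # merged commit features plus the "test_job" record
        y.append(label)        # 1 = regression on this push, 0 = pass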
Example #3
    def get_pushes(self):
        if self.use_subset:
            random.seed(0)

        pushes = []
        for revs, test_datas in test_scheduling.get_test_scheduling_history(
                self.granularity):
            failures = []
            passes = []

            for test_data in test_datas:
                name = test_data["name"]

                if self.granularity == "label" and not name.startswith(
                        "test-"):
                    continue

                if (test_data["is_likely_regression"]
                        or test_data["is_possible_regression"]):
                    failures.append(name)
                else:
                    passes.append(name)

            if self.use_subset:
                passes = random.sample(passes, math.ceil(len(passes) / 10))

            pushes.append({
                "revs": revs,
                "failures": failures,
                "passes": passes,
            })

        return pushes, math.floor(0.9 * len(pushes))
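Seeding the RNG with `random.seed(0)` before sampling makes the ~10% subset of passing runnables reproducible across runs. A small self-contained illustration of the same pattern:

    import math
    import random

    random.seed(0)  # fixed seed: the same subset is drawn on every run

    passes = [f"test-{i}" for i in range(100)]
    subset = random.sample(passes, math.ceil(len(passes) / 10))

    assert len(subset) == 10  # ceil(100 / 10)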
Example #4
    def items_gen(self, classes):
        commit_map = {}

        for commit in repository.get_commits():
            commit_map[commit["node"]] = commit

        assert len(commit_map) > 0

        done = set()
        for test_data in test_scheduling.get_test_scheduling_history("label"):
            revs = test_data["revs"]

            if revs[0] in done:
                continue

            if revs[0] not in classes:
                continue

            done.add(revs[0])

            commits = tuple(commit_map[revision] for revision in revs
                            if revision in commit_map)
            if len(commits) == 0:
                continue

            commit_data = commit_features.merge_commits(commits)
            yield commit_data, classes[revs[0]]
Example #5
    def get_pushes(
        self, apply_filters: bool = False
    ) -> tuple[list[dict[str, Any]], int]:
        pushes = []
        for revs, test_datas in test_scheduling.get_test_scheduling_history(
            self.granularity
        ):
            failures = []
            passes = []

            for test_data in test_datas:
                name = test_data["name"]

                if (
                    test_data["is_likely_regression"]
                    or test_data["is_possible_regression"]
                ):
                    failures.append(name)
                else:
                    passes.append(name)

            if apply_filters:
                if self.failures_skip and len(failures) > self.failures_skip:
                    continue

            pushes.append(
                {
                    "revs": revs,
                    "failures": failures,
                    "passes": passes,
                }
            )

        return pushes, math.floor(0.9 * len(pushes))
Example #6
    def get_labels(self):
        classes = {}

        for revs, test_datas in test_scheduling.get_test_scheduling_history("label"):
            rev = revs[0]

            if any(
                test_data["is_likely_regression"] or test_data["is_possible_regression"]
                for test_data in test_datas
            ):
                classes[rev] = 1
            else:
                classes[rev] = 0

        print(
            "{} commits failed".format(
                sum(1 for label in classes.values() if label == 1)
            )
        )
        print(
            "{} commits did not fail".format(
                sum(1 for label in classes.values() if label == 0)
            )
        )

        return classes, [0, 1]
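A push is labelled 1 as soon as any of its `test_datas` looks like a regression; `any()` over a generator short-circuits on the first hit. In miniature:

    test_datas = [
        {"is_likely_regression": False, "is_possible_regression": False},
        {"is_likely_regression": True, "is_possible_regression": False},
    ]

    failed = any(
        test_data["is_likely_regression"] or test_data["is_possible_regression"]
        for test_data in test_datas
    )
    assert failed  # the second entry triggers the label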
Example #7
    def get_labels(self):
        classes = {}
        classes_by_rev = defaultdict(dict)

        random.seed(0)

        revs = set()
        for test_data in test_scheduling.get_test_scheduling_history(
                self.granularity):
            rev = test_data["revs"][0]
            name = test_data["name"]

            revs.add(rev)

            if self.granularity == "label" and not name.startswith("test-"):
                continue

            if test_data["is_likely_regression"] or test_data[
                    "is_possible_regression"]:
                classes_by_rev[rev][name] = 1
            else:
                classes_by_rev[rev][name] = 0

        if self.use_subset:
            for rev, by_name in classes_by_rev.items():
                passing_names = [
                    name for name, val in by_name.items() if val == 0
                ]
                if len(passing_names) == 0:
                    continue

                chosen_passing_names = random.sample(
                    passing_names, math.ceil(len(passing_names) / 10))
                assert len(chosen_passing_names) > 0

                to_delete = set(passing_names) - set(chosen_passing_names)
                for name in to_delete:
                    del by_name[name]

        classes = {(rev, name): val
                   for rev, by_name in classes_by_rev.items()
                   for name, val in by_name.items()}

        print("{} pushes considered".format(len(classes_by_rev)))
        print("{} push/jobs failed".format(
            sum(1 for label in classes.values() if label == 1)))
        print("{} push/jobs did not fail".format(
            sum(1 for label in classes.values() if label == 0)))

        return classes, [0, 1]
Example #8
    def get_labels(self):
        classes = {}
        pushes = {}

        for test_data in test_scheduling.get_test_scheduling_history(
                self.granularity):
            rev = test_data["revs"][0]
            name = test_data["name"]

            if self.granularity == "label" and not name.startswith("test-"):
                continue

            if rev not in pushes:
                pushes[rev] = {
                    "failures": [],
                    "passes": [],
                }

            if test_data["is_likely_regression"] or test_data[
                    "is_possible_regression"]:
                pushes[rev]["failures"].append(name)
            else:
                pushes[rev]["passes"].append(name)

        if self.use_subset:
            random.seed(0)

            for rev, push in pushes.items():
                push["passes"] = random.sample(
                    push["passes"], math.ceil(len(push["passes"]) / 10))

        for rev, push in pushes.items():
            for name in push["failures"]:
                classes[(rev, name)] = 1

            for name in push["passes"]:
                classes[(rev, name)] = 0

        print("{} pushes considered".format(len(pushes)))
        print("{} pushes with at least one failure".format(
            sum(1 for push in pushes.values() if len(push["failures"]) > 0)))
        print("{} push/jobs failed".format(
            sum(1 for label in classes.values() if label == 1)))
        print("{} push/jobs did not fail".format(
            sum(1 for label in classes.values() if label == 0)))

        return classes, [0, 1]
Example #9
    def get_labels(self):
        classes = {}

        for test_data in test_scheduling.get_test_scheduling_history():
            rev = test_data["revs"][0]

            if test_data["is_likely_regression"] or test_data[
                    "is_possible_regression"]:
                classes[rev] = 1
            elif rev not in classes:
                classes[rev] = 0

        print("{} commits failed".format(
            sum(1 for label in classes.values() if label == 1)))
        print("{} commits did not fail".format(
            sum(1 for label in classes.values() if label == 0)))

        return classes, [0, 1]
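The `elif rev not in classes` guard makes failure labels sticky: once any job on a revision regresses, a later passing job for the same revision cannot flip the label back to 0. In miniature:

    classes = {}
    rows = [("rev_a", True), ("rev_a", False)]  # a failing job, then a passing one

    for rev, failed in rows:
        if failed:
            classes[rev] = 1
        elif rev not in classes:
            classes[rev] = 0

    assert classes["rev_a"] == 1  # the earlier failure wins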
Example #10
    def items_gen(self, classes):
        commit_map = get_commit_map()

        for revs, test_datas in test_scheduling.get_test_scheduling_history(
                self.granularity):
            commits = tuple(commit_map[revision] for revision in revs
                            if revision in commit_map)
            if len(commits) == 0:
                continue

            for test_data in test_datas:
                name = test_data["name"]

                if (revs[0], name) not in classes:
                    continue

                commit_data = commit_features.merge_commits(commits)
                commit_data["test_job"] = test_data
                yield commit_data, classes[(revs[0], name)]
Example #11
    def items_gen(self, classes):
        commit_map = {}

        for commit in repository.get_commits():
            commit_map[commit["node"]] = commit

        assert len(commit_map) > 0

        for revs, test_datas in test_scheduling.get_test_scheduling_history("label"):
            if revs[0] not in classes:
                continue

            commits = tuple(
                commit_map[revision] for revision in revs if revision in commit_map
            )
            if len(commits) == 0:
                continue

            commit_data = commit_features.merge_commits(commits)
            yield commit_data, classes[revs[0]]
Example #12
    def train_test_split(self, X, y):
        pushes = OrderedDict()
        for test_data in test_scheduling.get_test_scheduling_history(
                self.granularity):
            rev = test_data["revs"][0]
            name = test_data["name"]

            if self.granularity == "label" and not name.startswith("test-"):
                continue

            if rev in pushes:
                pushes[rev] += 1
            else:
                pushes[rev] = 1

        train_push_len = math.floor(0.9 * len(pushes))
        train_pushes = list(pushes.values())[:train_push_len]
        train_len = sum(train_pushes)
        print(
            f"{train_push_len} pushes in the training set (corresponding to {train_len} push/jobs)"
        )
        return X[:train_len], X[train_len:], y[:train_len], y[train_len:]
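Splitting on a push boundary keeps every push/job row of a given push on one side of the split; since `OrderedDict` preserves insertion order, the first 90% of pushes map to a contiguous prefix of X and y. A tiny worked example with made-up counts:

    import math
    from collections import OrderedDict

    # Hypothetical per-push job counts, in chronological order.
    pushes = OrderedDict([("rev_a", 3), ("rev_b", 2), ("rev_c", 5), ("rev_d", 1)])

    train_push_len = math.floor(0.9 * len(pushes))           # 3 pushes
    train_len = sum(list(pushes.values())[:train_push_len])  # 3 + 2 + 5 = 10 rows
    # Rows [0, 10) train; the remaining row (from rev_d) is held out.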
Example #13
    def get_labels(self):
        classes = {}

        for test_data in test_scheduling.get_test_scheduling_history():
            rev = test_data["revs"][0]
            name = test_data["name"]

            if not name.startswith("test-"):
                continue

            if test_data["is_likely_regression"] or test_data[
                    "is_possible_regression"]:
                classes[(rev, name)] = 1
            else:
                classes[(rev, name)] = 0

        print("{} commit/jobs failed".format(
            sum(1 for label in classes.values() if label == 1)))
        print("{} commit/jobs did not fail".format(
            sum(1 for label in classes.values() if label == 0)))

        return classes, [0, 1]
Example #14
    def items_gen(self, classes):
        commit_map = {}

        for commit in repository.get_commits():
            commit_map[commit["node"]] = commit

        assert len(commit_map) > 0

        # TODO: Data from multiple commits in the same push should be merged.
        for test_data in test_scheduling.get_test_scheduling_history():
            rev = test_data["revs"][0]
            name = test_data["name"]

            if (rev, name) not in classes:
                continue

            if rev not in commit_map:
                continue

            commit_data = commit_map[rev]
            commit_data["test_job"] = test_data
            yield commit_data, classes[(rev, name)]
Example #15
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            # Exhaust the history iterator so that test_data ends up holding
            # the last entry, i.e. the most recently processed revision.
            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["revs"][0]
        else:
            last_node = None

        past_failures = shelve.Shelf(
            LMDBDict("data/past_failures.lmdb"),
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 20:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for task in tasks_to_consider:
                    is_regression = (task in possible_regressions
                                     or task in likely_regressions)

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     merged_commits["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     merged_commits["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        merged_commits["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        merged_commits["components"],
                        push_num,
                        is_regression,
                    )

                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)

                        yield {
                            "revs": revisions,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in possible_regressions,
                            "is_likely_regression": task in likely_regressions,
                        }

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
Example #16
    def generate_test_scheduling_history(self, granularity):
        push_data_path = f"push_data_{granularity}.json"
        updated = download_check_etag(
            test_scheduling.PUSH_DATA_URL.format(granularity=granularity))
        if updated:
            zstd_decompress(push_data_path)
        assert os.path.exists(
            push_data_path), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity])

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB)
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB)
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB)

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
        for test_data in test_scheduling.get_test_scheduling_history(
                granularity):
            last_node = test_data["revs"][0]

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open(push_data_path, "r") as f:
                push_data = json.load(f)

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [
                    (
                        revisions,
                        rename_tasks(push_tasks),
                        rename_tasks(possible_regressions),
                        rename_tasks(likely_regressions),
                    )
                    for revisions, push_tasks, possible_regressions, likely_regressions in push_data
                ]

            # In the last 28 pushes, we definitely run all possible runnables.
            all_runnables_set = set(
                sum((push_runnables
                     for _, push_runnables, _, _ in push_data[-28:]), []))
            # Filter runnables we don't need.
            all_runnables = filter_runnables(list(all_runnables_set),
                                             all_runnables_set, granularity)
            all_runnables_set = set(all_runnables)
            logger.info(
                f"{len(all_runnables_set)} runnables run in the last 28 pushes"
            )

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # So we consider only the runnables which run in this push, and the possible and likely regressions
                # from this push.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions +
                        likely_regressions))
                runnables_to_consider = filter_runnables(
                    runnables_to_consider, all_runnables_set, granularity)

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                for data in test_scheduling.generate_data(
                        past_failures,
                        merged_commits,
                        push_num,
                        runnables_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revs"] = revisions
                        yield data

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(
                f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)

        with open_tar_zst(past_failures_db) as tar:
            tar.add(past_failures_db[:-len(".tar.zst")])

        if granularity == "group":
            with open_tar_zst(touched_together_db) as tar:
                tar.add(touched_together_db[:-len(".tar.zst")])
Example #17
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        try:
            with open("data/past_failures.pickle", "rb") as f:
                past_failures, push_num = pickle.load(f)
        except FileNotFoundError:
            past_failures = {}
            push_num = 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            if type_ not in past_failures:
                past_failures[type_] = {}

            if task not in past_failures[type_]:
                past_failures[type_][task] = {}

            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            for item in items:
                if item not in past_failures[type_][task]:
                    past_failures[type_][task][item] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )

                value = past_failures[type_][task][item][push_num]

                values_total.append(value)
                values_prev_7.append(
                    value - past_failures[type_][task][item][push_num - 7]
                )
                values_prev_14.append(
                    value - past_failures[type_][task][item][push_num - 14]
                )
                values_prev_28.append(
                    value - past_failures[type_][task][item][push_num - 28]
                )
                values_prev_56.append(
                    value - past_failures[type_][task][item][push_num - 56]
                )

                if is_regression:
                    past_failures[type_][task][item][push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

        zstd_compress("data/past_failures.pickle")
Example #18
    def generate_test_scheduling_history(self, granularity):
        push_data_path = f"push_data_{granularity}.json"
        updated = download_check_etag(
            test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
        )
        if updated:
            zstd_decompress(push_data_path)
            os.remove(f"{push_data_path}.zst")
        assert os.path.exists(push_data_path), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity]
        )

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB
            )
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
            )
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB
            )
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB
            )

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
        for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
            last_node = revs[0]

        def generate_failing_together_probabilities(push_data):
            # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
            # `task2 failure -> task1 failure` separately, as they could be different.

            count_runs = collections.Counter()
            count_single_failures = collections.Counter()
            count_both_failures = collections.Counter()

            for revisions, tasks, possible_regressions, likely_regressions in tqdm(
                push_data
            ):
                failures = set(possible_regressions + likely_regressions)
                all_tasks = list(set(tasks) | failures)

                for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
                    count_runs[(task1, task2)] += 1

                    if task1 in failures:
                        if task2 in failures:
                            count_both_failures[(task1, task2)] += 1
                        else:
                            count_single_failures[(task1, task2)] += 1
                    elif task2 in failures:
                        count_single_failures[(task1, task2)] += 1

            stats = {}

            skipped = 0

            for couple, run_count in count_runs.most_common():
                failure_count = count_both_failures[couple]
                support = failure_count / run_count

                if support < 1 / 700:
                    skipped += 1
                    continue

                if failure_count != 0:
                    confidence = failure_count / (
                        count_single_failures[couple] + failure_count
                    )
                else:
                    confidence = 0.0

                stats[couple] = (support, confidence)

            logger.info(f"{skipped} couples skipped because their support was too low")

            logger.info("Redundancies with the highest support and confidence:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], -k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            logger.info("Redundancies with the highest confidence and lowest support:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            failing_together = test_scheduling.get_failing_together_db()
            count_redundancies = collections.Counter()
            for couple, (support, confidence) in stats.items():
                if confidence == 1.0:
                    count_redundancies["==100%"] += 1
                if confidence > 0.9:
                    count_redundancies[">=90%"] += 1
                if confidence > 0.8:
                    count_redundancies[">=80%"] += 1
                if confidence > 0.7:
                    count_redundancies[">=70%"] += 1

                if confidence < 0.7:
                    continue

                failing_together[
                    f"{couple[0]}${couple[1]}".encode("utf-8")
                ] = struct.pack("ff", support, confidence)

            for percentage, count in count_redundancies.most_common():
                logger.info(f"{count} with {percentage} confidence")

            test_scheduling.close_failing_together_db()

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open(push_data_path, "r") as f:
                push_data = json.load(f)

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [
                    (
                        revisions,
                        rename_tasks(push_tasks),
                        rename_tasks(possible_regressions),
                        rename_tasks(likely_regressions),
                    )
                    for revisions, push_tasks, possible_regressions, likely_regressions in push_data
                ]

            # In the last 28 pushes, we definitely run all possible runnables.
            all_runnables_set = set(
                sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
            )
            # Filter runnables we don't need.
            all_runnables = filter_runnables(
                list(all_runnables_set), all_runnables_set, granularity
            )
            all_runnables_set = set(all_runnables)
            logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

            push_data = [
                (
                    revisions,
                    filter_runnables(push_tasks, all_runnables_set, granularity),
                    filter_runnables(
                        possible_regressions, all_runnables_set, granularity
                    ),
                    filter_runnables(
                        likely_regressions, all_runnables_set, granularity
                    ),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

            if granularity == "label":
                generate_failing_together_probabilities(push_data)

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions + likely_regressions)
                )

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                result = {
                    "revs": revisions,
                    "data": [],
                }
                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    runnables_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result["data"].append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield result

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)

        with open_tar_zst(past_failures_db) as tar:
            tar.add(past_failures_db[: -len(".tar.zst")])

        if granularity == "group":
            with open_tar_zst(touched_together_db) as tar:
                tar.add(touched_together_db[: -len(".tar.zst")])

        if granularity == "label":
            with open_tar_zst(failing_together_db) as tar:
                tar.add(failing_together_db[: -len(".tar.zst")])
Example #19
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL)
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        last_node = None
        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["revs"][0]

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures()

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])
            )
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_tasks"] = all_tasks
            # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            for i in tqdm(range(len(push_data))):
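                # pop(0) below releases each push's data as soon as it has
                # been consumed, keeping memory usage bounded.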
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks that ran in this push, plus the
                # possible and likely regressions from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions + likely_regressions)
                )
                tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync the DB every 250 pushes so we clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    tasks_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revs"] = revisions
                        yield data

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
Example #20
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file doesn't exist"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        HISTORICAL_TIMESPAN = 56
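        # (56 pushes is the longest look-back window used by the failure
        # features below: 7, 14, 28 and 56 pushes.)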

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            # Find the last revision stored in the previous run; stays None if
            # the existing history is empty.
            last_node = None
            for test_data in test_scheduling.get_test_scheduling_history():
                last_node = test_data["rev"]
        else:
            last_node = None

        past_failures = shelve.open(
            "data/past_failures.shelve",
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
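            # For each item (e.g. each file touched by the commit), read the
            # cumulative failure count of `task` at the current push and at
            # 7/14/28/56 pushes back; the windowed features are the differences.
            # When the push regressed the task, bump the cumulative count.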
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                # Sync the DB every 1000 commits so we clean up the shelve cache (we'd run out of memory otherwise!).
                if len(commits_with_data) % 1000 == 0:
                    past_failures.sync()

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                # We no longer need the push data for this node, so we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        zstd_compress("data/past_failures.shelve")