Example #1
def plot_graphs(granularity: str) -> None:
    push_data_db = (
        test_scheduling.PUSH_DATA_GROUP_DB
        if granularity == "group"
        else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
    )
    assert db.download(push_data_db)

    regressions_by_rev = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        push_data_db
    ):
        regressions_by_rev[revisions[0]] = get_regressions(
            granularity, likely_regressions, possible_regressions
        )

    scheduled_data = []
    caught_data = []

    for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB):
        if len(scheduler_stat["schedulers"]) == 0:
            continue

        if scheduler_stat["id"] not in regressions_by_rev:
            continue

        obj: dict[str, Any] = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
        }

        for scheduler in scheduler_stat["schedulers"]:
            obj[scheduler["name"]] = len(get_scheduled(granularity, scheduler))

        scheduled_data.append(obj)

        regressions = regressions_by_rev[scheduler_stat["id"]]

        obj = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
            "regressions": len(regressions),
        }

        for scheduler in scheduler_stat["schedulers"]:
            scheduled = get_scheduled(granularity, scheduler)

            obj[scheduler["name"]] = len(regressions & scheduled)

        caught_data.append(obj)

    scheduled_df = DataFrame(scheduled_data)
    scheduled_df.index = scheduled_df["date"]
    del scheduled_df["date"]

    caught_df = DataFrame(caught_data)
    caught_df.index = caught_df["date"]
    del caught_df["date"]

    df = scheduled_df.resample("W").mean()

    plot_graph(
        df,
        f"Average number of scheduled {granularity}s",
        f"average_{granularity}_scheduled.svg",
    )

    df = (
        caught_df[caught_df.regressions > 0]
        .drop(columns=["regressions"])
        .clip(0, 1)
        .resample("W")
        .mean()
    )

    plot_graph(
        df,
        "Percentage of regressing pushes where we caught at least one regression",
        f"percentage_{granularity}_caught_at_least_one.svg",
    )

    plot_graph(
        caught_df.drop(columns=["regressions"])
        .div(caught_df.regressions, axis=0)
        .resample("W")
        .mean(),
        "Percentage of regressions we caught",
        f"percentage_{granularity}_caught.svg",
    )
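A minimal driver sketch for the function above; it assumes the surrounding script's imports are in place and that "config_group" is the other granularity accepted by the DB selection at the top of the function:

if __name__ == "__main__":
    # Hypothetical driver: produce the plots for both granularities.
    for granularity in ("group", "config_group"):
        plot_graphs(granularity)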
Example #2
def go(months: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    known_scheduler_stats = {
        scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    }
    logger.info(
        f"Already gathered statistics for {len(known_scheduler_stats)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(months=months)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [push for push in pushes if push.rev not in known_scheduler_stats]
    logger.info(f"{len(pushes)} left to analyze")

    def compress_and_upload() -> None:
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = tuple(
            executor.submit(
                analyze_shadow_schedulers,
                push,
            )
            for push in pushes
        )
        del pushes

        def results():
            start_time = time.monotonic()

            try:
                for future in tqdm(
                    concurrent.futures.as_completed(futures),
                    total=len(futures),
                ):
                    try:
                        yield future.result()
                    except Exception:
                        traceback.print_exc()

                    # Upload every 10 minutes.
                    if time.monotonic() - start_time >= 600:
                        compress_and_upload()
                        start_time = time.monotonic()
            except Exception:
                for f in futures:
                    f.cancel()

                raise

        db.append(SHADOW_SCHEDULER_STATS_DB, results())

    compress_and_upload()
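A hedged entry-point sketch for go(); the real script presumably wires this up with argparse, and the "--months" flag name and its default are assumptions:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Gather shadow scheduler statistics")
    # The default of 3 months is assumed for illustration only.
    parser.add_argument("--months", type=int, default=3)
    go(parser.parse_args().months)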
Example #3
    def retrieve_push_data(self):
        # Download previous cache.
        db.download(ADR_CACHE_DB)
        self.generate_push_data("label")
        self.generate_push_data("group")
        self.upload_adr_cache()
Example #4
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier)

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization,
                                args.cleanup_urls)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} '
            )

            if model.calculate_importance:
                probas, importance = model.classify(bug,
                                                    probabilities=True,
                                                    importances=True)

                feature_names = model.get_human_readable_feature_names()

                model.print_feature_importances(importance["importances"],
                                                feature_names,
                                                class_probabilities=probas)
            else:
                probas = model.classify(bug,
                                        probabilities=True,
                                        importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (args.token is not None
                ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bug_ids = bugzilla.get_ids_between(a_week_ago, today)
        bugs = bugzilla.get(bug_ids)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs.values():
            p = model.classify(bug, probabilities=True)
            rows.append([
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ])

        os.makedirs("sheets", exist_ok=True)
        with open(
                os.path.join(
                    "sheets",
                    f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
                ),
                "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
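A hedged sketch of the argument parsing main() appears to expect; the flag names mirror the attributes read above, while help texts and defaults are assumptions:

def parse_args(argv=None):
    import argparse

    parser = argparse.ArgumentParser(description="Train or apply a bugbug model")
    parser.add_argument("goal", help="e.g. defect, component, duplicate, regression")
    parser.add_argument("--classifier", default="default")
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--classify", action="store_true")
    parser.add_argument("--generate-sheet", action="store_true")
    parser.add_argument("--token", help="Bugzilla API token")
    parser.add_argument("--lemmatization", action="store_true")
    parser.add_argument("--historical", action="store_true")
    parser.add_argument("--training-set-size", type=int)
    parser.add_argument("--cleanup-urls", action="store_true")
    return parser.parse_args(argv)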
Example #5
File: run.py Project: sbeesm/bugbug
        model_class = TrackingModel
    elif args.goal == 'qaneeded':
        from bugbug.models.qaneeded import QANeededModel
        model_class = QANeededModel
    elif args.goal == 'uplift':
        from bugbug.models.uplift import UpliftModel
        model_class = UpliftModel
    elif args.goal == 'component':
        from bugbug.models.component import ComponentModel
        model_class = ComponentModel
    elif args.goal == 'devdocneeded':
        from bugbug.models.devdocneeded import DevDocNeededModel
        model_class = DevDocNeededModel

    if args.train:
        db.download()

        model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} ')
            probas, importances = model.classify(bug, probabilities=True, importances=True)

            feature_names = model.get_feature_names()
            for i, (index, is_positive, contrib) in enumerate(importances[:20]):
                print(f'{i + 1}. \'{feature_names[index]}\' ({"+" if is_positive else "-"}{contrib})')
Example #6
    def find_bug_fixing_commits(self):
        logger.info("Downloading commits database...")
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        logger.info("Downloading bugs database...")
        if db.is_old_version(
                bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB, force=True)

        logger.info("Download previous classifications...")
        if db.is_old_version(
                BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB):
            db.download(BUG_FIXING_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in prev_bug_fixing_commits)
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        download_model("defectenhancementtask")
        defect_model = DefectEnhancementTaskModel.load(
            "defectenhancementtaskmodel")

        logger.info("Downloading regression model...")
        download_model("regression")
        regression_model = RegressionModel.load("regressionmodel")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        bug_fixing_commits = []

        def append_bug_fixing_commits(bug_id, type_):
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            # If we know the label already, we don't need to apply the model.
            if (bug["id"] in known_regression_labels
                    and known_regression_labels[bug["id"]] == 1):
                append_bug_fixing_commits(bug["id"], "r")
                continue

            if bug["id"] in known_defect_labels:
                if known_defect_labels[bug["id"]] == "defect":
                    append_bug_fixing_commits(bug["id"], "d")
                else:
                    append_bug_fixing_commits(bug["id"], "e")
                continue

            if defect_model.classify(bug)[0] == "defect":
                if regression_model.classify(bug)[0] == 1:
                    append_bug_fixing_commits(bug["id"], "r")
                else:
                    append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)

        bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
        return [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ["r", "d"]
        ]
Example #7
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(
            f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(
            model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(bug,
                                                probabilities=True,
                                                importances=True)

            model.print_feature_importances(importance["importances"],
                                            class_probabilities=probas)
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        # Multi-class models expose a label encoder to map the index back to a class
        # name; binary models simply treat index 1 as the positive class.
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
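A hedged usage sketch: classifying a single bug with the default classifier (the bug id is only a placeholder):

if __name__ == "__main__":
    # Passing a falsy bug_id would instead iterate over the whole local bugs DB.
    classify_bugs("defect", "default", bug_id=1234567)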
Example #8
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        db.download(db_path)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines(
                "{}\n".format(mercurial_to_git(commit["rev"]))
                for commit in commits_to_ignore
                if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
            )

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            thread_local.git = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": git_to_mercurial(
                                    bug_introducing_hash
                                ),
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

        bug_fixing_commits_queue = bug_fixing_commits.copy()

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
        ) as executor:

            def results():
                num_analyzed = 0

                # Analyze up to 500 commits at a time, to avoid the task running out of time.
                while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                    bug_introducing_commit_futures = []
                    for _ in range(
                        min(500 - num_analyzed, len(bug_fixing_commits_queue))
                    ):
                        bug_introducing_commit_futures.append(
                            executor.submit(find_bic, bug_fixing_commits_queue.pop())
                        )

                    logger.info(
                        f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                    )

                    for future in tqdm(
                            concurrent.futures.as_completed(
                                bug_introducing_commit_futures),
                            total=len(bug_introducing_commit_futures),
                    ):
                        result = future.result()
                        if result is not None:
                            num_analyzed += 1
                            yield from result

            db.append(db_path, results())

        zstd_compress(db_path)

        return len(bug_fixing_commits_queue) == 0
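The boolean return value tells the caller whether the queue of bug-fixing commits was fully drained (the 500-commit cap suggests the task is meant to be re-run until everything has been analyzed). A hedged caller sketch, with "finder" as a hypothetical instance of the surrounding class:

done = finder.find_bug_introducing_commits(bug_fixing_commits, commits_to_ignore, tokenized=False)
if not done:
    logger.info("Some commits are still pending; the task should be retriggered.")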
Example #9
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        db.download(IGNORED_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

        # When we already have some analyzed commits, re-analyze the last 3500 to make sure
        # we didn't miss back-outs that happened since the last analysis.
        if len(prev_commits_to_ignore) > 0:
            first_commit_to_reanalyze = (
                -3500 if len(prev_commits_to_ignore) >= 3500 else 0
            )
            rev_start = "children({})".format(
                prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
            )
        else:
            rev_start = 0

        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        # Drop commits which are not yet present in the mercurial <-> git mapping.
        while len(revs) > 0:
            try:
                vcs_map.mercurial_to_git(revs[-1].decode("ascii"))
                break
            except Exception as e:
                if not str(e).startswith("Missing mercurial commit in the VCS map"):
                    raise

                revs.pop()

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

        chosen_commits = set()
        commits_to_ignore = []
        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append(
                    {
                        "rev": commit.node,
                        "type": "backedout" if commit.backedoutby else "",
                    }
                )
                chosen_commits.add(commit.node)

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        for prev_commit in prev_commits_to_ignore[::-1]:
            if prev_commit["rev"] not in chosen_commits:
                commits_to_ignore.append(prev_commit)
                chosen_commits.add(prev_commit["rev"])

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        logger.info(
            "...of which {} are backed-out".format(
                sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
            )
        )

        db.write(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)
        db.upload(IGNORED_COMMITS_DB)
Example #10
    def generate_test_scheduling_history(self, granularity):
        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity]
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB
            )
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
            )
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB
            )
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB
            )

        assert db.download(push_data_db)

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
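        # Walk the existing test scheduling history to find the last revision processed
        # in the previous run, so we can resume from it.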
        for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
            last_node = revs[0]

        def generate_failing_together_probabilities(push_data):
            # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
            # `task2 failure -> task1 failure` separately, as they could be different.

            count_runs = collections.Counter()
            count_single_failures = collections.Counter()
            count_both_failures = collections.Counter()

            for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
                push_data
            ):
                failures = set(likely_regressions + candidate_regressions)
                all_tasks = list(set(tasks) | failures)

                for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
                    count_runs[(task1, task2)] += 1

                    if task1 in failures:
                        if task2 in failures:
                            count_both_failures[(task1, task2)] += 1
                        else:
                            count_single_failures[(task1, task2)] += 1
                    elif task2 in failures:
                        count_single_failures[(task1, task2)] += 1

            stats = {}

            skipped = 0

            for couple, run_count in count_runs.most_common():
                failure_count = count_both_failures[couple]
                support = failure_count / run_count

                # Skip pairs that fail together too rarely (low support) for the
                # redundancy estimate to be meaningful.
                if support < 1 / 700:
                    skipped += 1
                    continue

                if failure_count != 0:
                    confidence = failure_count / (
                        count_single_failures[couple] + failure_count
                    )
                else:
                    confidence = 0.0

                stats[couple] = (support, confidence)

            logger.info(f"{skipped} couples skipped because their support was too low")

            logger.info("Redundancies with the highest support and confidence:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], -k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            logger.info("Redundancies with the highest confidence and lowest support:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            failing_together = test_scheduling.get_failing_together_db()
            count_redundancies = collections.Counter()
            for couple, (support, confidence) in stats.items():
                if confidence == 1.0:
                    count_redundancies["==100%"] += 1
                if confidence > 0.9:
                    count_redundancies[">=90%"] += 1
                if confidence > 0.8:
                    count_redundancies[">=80%"] += 1
                if confidence > 0.7:
                    count_redundancies[">=70%"] += 1

                if confidence < 0.7:
                    continue

                failing_together[
                    f"{couple[0]}${couple[1]}".encode("utf-8")
                ] = struct.pack("ff", support, confidence)

            for percentage, count in count_redundancies.most_common():
                logger.info(f"{count} with {percentage} confidence")

            test_scheduling.close_failing_together_db()

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            push_data = list(db.read(push_data_db))

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [
                    (
                        revisions,
                        rename_tasks(push_tasks),
                        rename_tasks(possible_regressions),
                        rename_tasks(likely_regressions),
                    )
                    for revisions, push_tasks, possible_regressions, likely_regressions in push_data
                ]

            # In the last 14 pushes, we definitely run all possible runnables.
            all_runnables_set = set(
                sum((push_runnables for _, push_runnables, _, _ in push_data[-14:]), [])
            )
            # Filter runnables we don't need.
            all_runnables = filter_runnables(
                list(all_runnables_set), all_runnables_set, granularity
            )
            all_runnables_set = set(all_runnables)
            logger.info(f"{len(all_runnables_set)} runnables run in the last 14 pushes")

            push_data = [
                (
                    revisions,
                    filter_runnables(push_tasks, all_runnables_set, granularity),
                    filter_runnables(
                        possible_regressions, all_runnables_set, granularity
                    ),
                    filter_runnables(
                        likely_regressions, all_runnables_set, granularity
                    ),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

            if granularity == "label":
                generate_failing_together_probabilities(push_data)

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together()
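                # Advance the generator to its first yield so that later send() calls
                # can feed it commit nodes.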
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions + likely_regressions)
                )

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                result = {
                    "revs": revisions,
                    "data": [],
                }
                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    runnables_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result["data"].append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield result

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)
        create_tar_zst(past_failures_db)

        if granularity == "group":
            create_tar_zst(touched_together_db)

        if granularity == "label":
            create_tar_zst(failing_together_db)
Example #11
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)
Example #12
    def generate_test_scheduling_history(self):
        updated = download_check_etag(PUSH_DATA_LABEL_URL)
        if updated:
            zstd_decompress("push_data_label.json")
        assert os.path.exists(
            "push_data_label.json"
        ), "Decompressed push data file doesn't exist"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        last_node = None
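        # Find the last revision already stored in the test scheduling DB, so we can
        # resume from it.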
        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["revs"][0]

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures()

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data_label.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            push_data = [(
                revisions,
                rename_tasks(push_tasks),
                rename_tasks(possible_regressions),
                rename_tasks(likely_regressions),
            ) for revisions, push_tasks, possible_regressions,
                         likely_regressions in push_data]

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_tasks"] = all_tasks
            # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for data in test_scheduling.generate_data(
                        past_failures,
                        merged_commits,
                        push_num,
                        tasks_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revs"] = revisions
                        yield data

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
Example #13
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (
                parse_risk_band(risk_band)
                for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
            ),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
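        # Exhaust the commits generator so that "commit" ends up bound to the most
        # recent commit already in the local DB.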
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor"))
        )

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )

        self.path_to_component = repository.get_component_mapping()

        self.past_regressions_by = {}
        self.past_fixed_bugs_by = {}
        self.past_regression_blocked_bugs_by = {}
        self.past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            self.past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
            )
            self.past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
            )
            self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
            )
            self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
            )
Example #14
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file doesn't exist"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        past_failures = shelve.open(
            "data/past_failures.shelve",
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if len(commits_with_data) % 1000 == 0:
                    past_failures.sync()

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory", task, commit_data["directories"], push_num, is_regression
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component", task, commit_data["components"], push_num, is_regression
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                # We no longer need the push data for this node, so we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        zstd_compress("data/past_failures.shelve")
    def generate_test_scheduling_history(self, granularity: str) -> None:
        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity]
        )

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB
            )
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
            )
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB
            )
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB
            )
        elif granularity == "config_group":
            test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB
            )
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB
            )

        push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
            granularity
        )

        def generate_all_data() -> Generator[Dict[str, Any], None, None]:
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            if granularity in ("label", "config_group"):
                test_scheduling.generate_failing_together_probabilities(
                    granularity, push_data_iter(), push_data_count
                )

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                commit_map[commit_data["node"]] = commit_data

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            if granularity in ("group", "config_group"):
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for (
                i,
                (revisions, push_runnables, possible_regressions, likely_regressions),
            ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions + likely_regressions)
                )

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync the DB every 250 pushes to clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity in ("group", "config_group"):
                    update_touched_together_gen.send(commits[0]["node"])

                result_data = []
                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    runnables_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result_data.append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield {
                        "revs": revisions,
                        "data": result_data,
                    }

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)
        create_tar_zst(past_failures_db)

        if granularity == "group":
            create_tar_zst(touched_together_db)

        if granularity == "label":
            create_tar_zst(failing_together_db)
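
Note that update_touched_together above is consumed as a coroutine-style generator: it is primed with next(), receives the first revision of each push through send(), and is shut down by sending None (hence the StopIteration handling). Below is a minimal sketch of that driving protocol with made-up bookkeeping; it is not the real touched-together computation.

# Sketch of the generator-as-coroutine protocol used for
# update_touched_together (the bookkeeping inside is a placeholder).
def touched_together_tracker():
    seen = []
    node = yield  # primed with next(); the first send() delivers a revision
    while node is not None:
        seen.append(node)  # placeholder for updating touched-together stats
        node = yield
    # Sending None exits the loop; the generator returns and the caller
    # observes StopIteration.


gen = touched_together_tracker()
next(gen)  # prime the generator
gen.send("rev1")
gen.send("rev2")
try:
    gen.send(None)  # flush/terminate, as in the code above
except StopIteration:
    pass
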
    def retrieve_test_scheduling_history(self):
        os.makedirs("data", exist_ok=True)

        # Download previous cache.
        cache_path = os.path.abspath("data/adr_cache")
        if not os.path.exists(cache_path):
            try:
                download_check_etag(URL, "adr_cache.tar.xz")
                with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                    tar.extractall()
                assert os.path.exists(
                    "data/adr_cache"), "Decompressed adr cache exists"
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet")

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
            """)

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # We'll use the past 3 months only for training the model, but we use 6 months to calculate
        # the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                "today-6month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
        )

        HISTORY_DATE_START = datetime.now() - relativedelta(months=3)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        past_failures = {}

        def get_past_failures(task, push_num):
            if task not in past_failures:
                past_failures[task] = repository.exp_queue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0)

            return past_failures[task][push_num]

        def generate_data():
            commits_with_data = set()
            saved_nodes = set()

            push_num = 0
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if any(task.startswith(j) for j in JOBS_TO_SKIP):
                        continue

                    total_failures = get_past_failures(task, push_num)
                    past_7_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 7)
                    past_14_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 14)
                    past_28_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 28)
                    past_56_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 56)

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                    if task in commit_push_data[1] or task in commit_push_data[
                            2]:
                        past_failures[task][push_num] = total_failures + 1

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")
Exemplo n.º 17
0
    def find_bug_introducing_commits(self, repo_dir, tokenized):
        from pydriller import GitRepository
        from pydriller.domain.commit import ModificationType

        logger.info("Download commits to ignore...")
        assert db.download(IGNORED_COMMITS_DB)
        commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))

        logger.info("Download bug-fixing classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
            if bug_fixing_commit["type"] in ["r", "d"]
        ]

        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB

        def git_to_mercurial(revs):
            if tokenized:
                return (self.tokenized_git_to_mercurial[rev] for rev in revs)
            else:
                yield from vcs_map.git_to_mercurial(repo_dir, revs)

        def mercurial_to_git(revs):
            if tokenized:
                return (self.mercurial_to_tokenized_git[rev] for rev in revs)
            else:
                yield from vcs_map.mercurial_to_git(repo_dir, revs)

        logger.info("Download previously found bug-introducing commits...")
        db.download(db_path)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            git_hashes = mercurial_to_git(
                commit["rev"] for commit in tqdm(commits_to_ignore)
                if not tokenized
                or commit["rev"] in self.mercurial_to_tokenized_git)
            f.writelines("{}\n".format(git_hash) for git_hash in git_hashes)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        git_init_lock = threading.Lock()

        def _init(git_repo_dir):
            with git_init_lock:
                thread_local.git = GitRepository(git_repo_dir)
                # Call get_head in order to make pydriller initialize the repository.
                thread_local.git.get_head()

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = tuple(
                mercurial_to_git([bug_fixing_commit["rev"]]))[0]

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, as we'll likely get them wrong.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            def get_modification_path(mod):
                path = mod.new_path
                if (mod.change_type == ModificationType.RENAME
                        or mod.change_type == ModificationType.DELETE):
                    path = mod.old_path
                return path

            bug_introducing_modifications = {}
            for modification in commit.modifications:
                path = get_modification_path(modification)

                if path == "testing/web-platform/meta/MANIFEST.json":
                    continue

                # Don't try to find the bug-introducing commit for modifications
                # in the bug-fixing commit to non-source code files.
                if repository.get_type(
                        path) not in repository.SOURCE_CODE_TYPES_TO_EXT:
                    continue

                bug_introducing_modifications.update(
                    thread_local.git.get_commits_last_modified_lines(
                        commit,
                        modification=modification,
                        hashes_to_ignore_path=os.path.realpath(
                            "git_hashes_to_ignore"),
                    ))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values(
            ):
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev":
                            bug_fixing_commit["rev"],
                            "bug_introducing_rev":
                            tuple(git_to_mercurial([bug_introducing_hash]))[0],
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't analyze this commit again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev":
                    bug_fixing_commit["rev"],
                    "bug_introducing_rev":
                    "",
                })

            return bug_introducing_commits

        def compress_and_upload():
            zstd_compress(db_path)
            db.upload(db_path)

        workers = os.cpu_count() + 1
        logger.info(
            f"Analyzing {len(bug_fixing_commits)} commits using {workers} workers..."
        )

        with concurrent.futures.ThreadPoolExecutor(
                initializer=_init, initargs=(repo_dir, ),
                max_workers=workers) as executor:

            def results():
                start_time = time.monotonic()

                futures = {
                    executor.submit(find_bic, bug_fixing_commit):
                    bug_fixing_commit["rev"]
                    for bug_fixing_commit in bug_fixing_commits
                }

                for future in tqdm(
                        concurrent.futures.as_completed(futures),
                        total=len(futures),
                ):
                    exc = future.exception()
                    if exc is not None:
                        logger.info(
                            f"Exception {exc} while analyzing {futures[future]}"
                        )
                        for f in futures:
                            f.cancel()

                    result = future.result()
                    if result is not None:
                        yield from result

                    if time.monotonic() - start_time >= 3600:
                        compress_and_upload()
                        start_time = time.monotonic()

            db.append(db_path, results())

        compress_and_upload()
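
Since analyzing thousands of bug-fixing commits can take hours, the results generator above streams completed futures out of the thread pool and uploads partial progress roughly once per hour. A self-contained sketch of that checkpointing pattern, with stand-in analyze/checkpoint helpers instead of the real find_bic and compress_and_upload:

import concurrent.futures
import time


def analyze(item):
    # Stand-in for find_bic: return a list of result records (or None).
    return [{"item": item}]


def checkpoint():
    # Stand-in for compress_and_upload.
    print("uploading partial results...")


def results(items, executor):
    start_time = time.monotonic()
    futures = {executor.submit(analyze, item): item for item in items}
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result is not None:
            yield from result
        # Periodically persist what we have, so a killed task loses little.
        if time.monotonic() - start_time >= 3600:
            checkpoint()
            start_time = time.monotonic()


with concurrent.futures.ThreadPoolExecutor() as executor:
    collected = list(results(range(5), executor))
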
Exemplo n.º 18
0
def get_push_data(
    granularity: str,
) -> Tuple[Callable[[], Iterator[PushResult]], int, Tuple[Runnable, ...]]:
    if granularity == "label":
        push_data_db = PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = PUSH_DATA_CONFIG_GROUP_DB

    assert db.download(push_data_db)

    # In the last 28 pushes, we definitely run all possible runnables.
    push_data_count = 0
    push_data_queue: Deque[PushResult] = collections.deque(maxlen=28)
    for elem in db.read(push_data_db):
        push_data_count += 1

        push_data_queue.append(elem)

    logger.info(f"push data nodes: {push_data_count}")

    push_data = [(
        revisions,
        fix_revision,
        rename_runnables(granularity, push_tasks),
        rename_runnables(granularity, possible_regressions),
        rename_runnables(granularity, likely_regressions),
    ) for revisions, fix_revision, push_tasks, possible_regressions,
                 likely_regressions in push_data_queue]

    if granularity == "config_group":
        all_groups_set = set(
            sum(
                ([Group(r[1]) for r in push_runnables]
                 for _, _, push_runnables, _, _ in push_data),
                [],
            ))
        # Filter runnables we don't need.
        all_groups = filter_runnables(tuple(all_groups_set),
                                      cast(Set[Runnable], all_groups_set),
                                      "group")
        all_groups_set = set(all_groups)
        logger.info(
            f"{len(all_groups_set)} manifests run in the last 28 pushes")

    all_runnables_set = set(
        sum((list(push_runnables) for _, _, push_runnables, _, _ in push_data),
            []))
    # Filter runnables we don't need.
    all_runnables = filter_runnables(tuple(all_runnables_set),
                                     all_runnables_set, granularity)
    all_runnables_set = set(all_runnables)
    logger.info(
        f"{len(all_runnables_set)} runnables run in the last 28 pushes")

    def push_data_iter() -> Iterator[PushResult]:
        return ((
            revisions,
            fix_revision,
            filter_runnables(
                rename_runnables(granularity, push_tasks),
                all_runnables_set,
                granularity,
            ),
            filter_runnables(
                rename_runnables(granularity, possible_regressions),
                all_runnables_set,
                granularity,
            ),
            filter_runnables(
                rename_runnables(granularity, likely_regressions),
                all_runnables_set,
                granularity,
            ),
        ) for revisions, fix_revision, push_tasks, possible_regressions,
                likely_regressions in db.read(push_data_db))

    if granularity == "config_group":
        manifest_combinations = sum(
            sum(1 for _ in itertools.combinations(sorted(group_tasks), 2))
            for manifest, group_tasks in itertools.groupby(
                sorted(all_runnables, key=lambda x: x[1]), key=lambda x: x[1]))

        print(
            f"{manifest_combinations} possible combinations of manifests on configurations"
        )

    return push_data_iter, push_data_count, all_runnables
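
The single pass over db.read(push_data_db) at the top of get_push_data both counts every push and keeps only the most recent 28 of them, thanks to a bounded deque. In isolation:

# Counting all rows while retaining only the most recent 28.
import collections

rows = range(100)  # stand-in for db.read(push_data_db)
count = 0
last_28 = collections.deque(maxlen=28)
for row in rows:
    count += 1
    last_28.append(row)

assert count == 100
assert list(last_28) == list(range(72, 100))
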
Exemplo n.º 19
0
def evaluate(bug_introducing_commits):
    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Building bug -> commits map...")
    bug_to_commits_map = defaultdict(list)
    for commit in tqdm(repository.get_commits()):
        bug_to_commits_map[commit["bug_id"]].append(commit["node"])

    logger.info("Loading known regressors using regressed-by information...")
    known_regressors = {}
    for bug in tqdm(bugzilla.get_bugs()):
        if bug["regressed_by"]:
            known_regressors[bug["id"]] = bug["regressed_by"]
    logger.info(f"Loaded {len(known_regressors)} known regressors")

    fix_to_regressors_map = defaultdict(list)
    for bug_introducing_commit in bug_introducing_commits:
        if not bug_introducing_commit["bug_introducing_rev"]:
            continue

        fix_to_regressors_map[bug_introducing_commit["bug_fixing_rev"]].append(
            bug_introducing_commit["bug_introducing_rev"])

    logger.info(f"{len(fix_to_regressors_map)} fixes linked to regressors")
    logger.info(
        f"{sum(len(regressors) for regressors in fix_to_regressors_map.values())} regressors linked to fixes"
    )

    logger.info(
        "Measuring how many known regressors SZZ was able to find correctly..."
    )
    all_regressors = 0
    perfect_regressors = 0
    found_regressors = 0
    misassigned_regressors = 0
    for bug_id, regressor_bugs in tqdm(known_regressors.items()):
        # Get all commits which fixed the bug.
        fix_commits = bug_to_commits_map[
            bug_id] if bug_id in bug_to_commits_map else []
        if len(fix_commits) == 0:
            continue

        # Skip the bug/regressor when we didn't analyze the commits that fixed the bug
        # (as we certainly can't have found the regressor in that case).
        if not any(fix_commit in fix_to_regressors_map
                   for fix_commit in fix_commits):
            continue

        # Get all commits linked to the regressor bug.
        regressor_commits = []
        for regressor_bug in regressor_bugs:
            if regressor_bug not in bug_to_commits_map:
                continue

            regressor_commits += (
                commit for commit in bug_to_commits_map[regressor_bug])

        if len(regressor_commits) == 0:
            continue

        found_good = False
        found_bad = False
        for fix_commit in fix_commits:
            # Check if we found at least one correct regressor.
            if any(regressor_commit in regressor_commits
                   for regressor_commit in fix_to_regressors_map[fix_commit]):
                found_good = True

            # Check if we found at least one wrong regressor.
            if any(regressor_commit not in regressor_commits
                   for regressor_commit in fix_to_regressors_map[fix_commit]):
                found_bad = True

        all_regressors += 1

        if found_good and not found_bad:
            perfect_regressors += 1
        if found_good:
            found_regressors += 1
        if found_bad:
            misassigned_regressors += 1

    logger.info(
        f"Perfectly found {perfect_regressors} regressors out of {all_regressors}"
    )
    logger.info(f"Found {found_regressors} regressors out of {all_regressors}")
    logger.info(
        f"Misassigned {misassigned_regressors} regressors out of {all_regressors}"
    )
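
A toy example of the three evaluation buckets above: suppose the regressor bug's commits are {"r1"} and SZZ blamed {"r1", "x9"} for the fix commit. The regressor is then counted as found and as misassigned, but not as perfect.

regressor_commits = {"r1"}
szz_blamed = {"r1", "x9"}

found_good = any(c in regressor_commits for c in szz_blamed)
found_bad = any(c not in regressor_commits for c in szz_blamed)

assert found_good and found_bad  # found and misassigned, but not perfect
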
Exemplo n.º 20
0
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        # Analyze up to 500 commits at a time, to avoid the task running out of time.
        done = True
        if len(bug_fixing_commits) > 500:
            bug_fixing_commits = bug_fixing_commits[-500:]
            done = False

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        def _init(git_repo_dir):
            global GIT_REPO
            GIT_REPO = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            logger.info(f"Analyzing {git_fix_revision}...")

            commit = GIT_REPO.get_commit(git_fix_revision)

            # Skip huge changes, as we'll likely get them wrong.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                return [None]

            bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))
            logger.info(bug_introducing_modifications)

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values(
            ):
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev":
                            bug_fixing_commit["rev"],
                            "bug_introducing_rev":
                            git_to_mercurial(bug_introducing_hash),
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't analyze this commit again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev":
                    bug_fixing_commit["rev"],
                    "bug_introducing_rev":
                    "",
                })

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(initializer=_init,
                                                   initargs=(repo_dir, ),
                                                   max_workers=os.cpu_count() +
                                                   1) as executor:
            bug_introducing_commits = executor.map(find_bic,
                                                   bug_fixing_commits)
            bug_introducing_commits = tqdm(bug_introducing_commits,
                                           total=len(bug_fixing_commits))
            bug_introducing_commits = list(
                itertools.chain.from_iterable(bug_introducing_commits))

        total_results_num = len(bug_introducing_commits)
        bug_introducing_commits = list(filter(None, bug_introducing_commits))
        logger.info(
            f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
        )

        db.append(db_path, bug_introducing_commits)
        compress_file(db_path)
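
The batching logic above caps each task run at 500 commits and records whether anything is left in a "done" flag file, so an external scheduler can re-trigger the task until the backlog is drained. A condensed sketch of that idea (the helper name is hypothetical):

def take_batch(pending, batch_size=500):
    # Analyze at most `batch_size` items per run; record whether we are done.
    done = len(pending) <= batch_size
    batch = pending if done else pending[-batch_size:]
    with open("done", "w") as f:
        f.write("1" if done else "0")
    return batch


remaining = [f"commit{i}" for i in range(1200)]
batch = take_batch(remaining)
assert len(batch) == 500
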
Exemplo n.º 21
0
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["revs"][0]
        else:
            last_node = None

        past_failures = shelve.Shelf(
            LMDBDict("data/past_failures.lmdb"),
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures[
            "push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 20:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync the DB every 250 pushes to clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for task in tasks_to_consider:
                    is_regression = (task in possible_regressions
                                     or task in likely_regressions)

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     merged_commits["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     merged_commits["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        merged_commits["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        merged_commits["components"],
                        push_num,
                        is_regression,
                    )

                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)

                        yield {
                            "revs": revisions,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types":
                            past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types":
                            past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types":
                            past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types":
                            past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files":
                            past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files":
                            past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files":
                            past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files":
                            past_56_pushes_files_failures,
                            "failures_in_directories":
                            total_directories_failures,
                            "failures_past_7_pushes_in_directories":
                            past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories":
                            past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories":
                            past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories":
                            past_56_pushes_directories_failures,
                            "failures_in_components":
                            total_components_failures,
                            "failures_past_7_pushes_in_components":
                            past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components":
                            past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components":
                            past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components":
                            past_56_pushes_components_failures,
                            "is_possible_regression": task
                            in possible_regressions,
                            "is_likely_regression": task in likely_regressions,
                        }

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
Exemplo n.º 22
0
        model_class = DevDocNeededModel
    elif args.goal == "assignee":
        from bugbug.models.assignee import AssigneeModel

        model_class = AssigneeModel
    elif args.goal == "backout":
        from bugbug.models.backout import BackoutModel

        model_class = BackoutModel
    elif args.goal == "bugtype":
        from bugbug.models.bugtype import BugTypeModel

        model_class = BugTypeModel

    if args.train:
        db.download()

        if args.historical:
            model = model_class(args.lemmatization, args.historical)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} '
            )
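
The goal-to-model selection above is a long chain of lazy imports. A hypothetical table-driven equivalent (not the actual bugbug CLI code) could map each goal to a module path and class name and import on demand:

import importlib

# Module/class names taken from the imports in the snippet above.
MODELS = {
    "assignee": ("bugbug.models.assignee", "AssigneeModel"),
    "backout": ("bugbug.models.backout", "BackoutModel"),
    "bugtype": ("bugbug.models.bugtype", "BugTypeModel"),
}


def get_model_class(goal):
    module_name, class_name = MODELS[goal]
    return getattr(importlib.import_module(module_name), class_name)
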
Exemplo n.º 23
0
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        try:
            with open("data/past_failures.pickle", "rb") as f:
                past_failures, push_num = pickle.load(f)
        except FileNotFoundError:
            past_failures = {}
            push_num = 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            if type_ not in past_failures:
                past_failures[type_] = {}

            if task not in past_failures[type_]:
                past_failures[type_][task] = {}

            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            for item in items:
                if item not in past_failures[type_][task]:
                    past_failures[type_][task][item] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )

                value = past_failures[type_][task][item][push_num]

                values_total.append(value)
                values_prev_7.append(
                    value - past_failures[type_][task][item][push_num - 7]
                )
                values_prev_14.append(
                    value - past_failures[type_][task][item][push_num - 14]
                )
                values_prev_28.append(
                    value - past_failures[type_][task][item][push_num - 28]
                )
                values_prev_56.append(
                    value - past_failures[type_][task][item][push_num - 56]
                )

                if is_regression:
                    past_failures[type_][task][item][push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory", task, commit_data["directories"], push_num, is_regression
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component", task, commit_data["components"], push_num, is_regression
                    )

                    # Counters are updated for every push, but training samples are
                    # only emitted for pushes newer than HISTORY_DATE_START.
                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

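                        # One training sample per (commit, task) pair: failure history
                        # features plus the regression labels for this push.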
                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                # Advance the push counter once per commit that has push data.
                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        # Stream the generated samples directly into the test scheduling DB.
        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        # Persist the counters and the current push number so a later run can
        # resume incrementally from where this one stopped.
        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

        zstd_compress("data/past_failures.pickle")