Example #1
    def __init__(self, repo_dir: str) -> None:
        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = download_and_load_model("regressor")

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example #2
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        with hglib.open(self.repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        chunk_size = 70000

        for i in range(0, len(revs), chunk_size):
            repository.download_commits(self.repo_dir,
                                        revs=revs[i:(i + chunk_size)])

        logger.info("commit data extracted from repository")

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        zstd_compress(repository.COMMITS_DB)
        create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
Example #3
    def retrieve_commits(self):
        shared_dir = self.repo_dir + "-shared"
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6
        )
        repository.download_commits(self.repo_dir, two_years_and_six_months_ago)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
Example #4
    def retrieve_commits(self):
        shared_dir = self.repo_dir + '-shared'
        cmd = hglib.util.cmdbuilder('robustcheckout',
                                    'https://hg.mozilla.org/mozilla-central',
                                    self.repo_dir,
                                    purge=True,
                                    sharebase=shared_dir,
                                    networkattempts=7,
                                    branch=b'tip')

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info('mozilla-central cloned')

        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        repository.download_commits(self.repo_dir,
                                    two_years_and_six_months_ago)

        logger.info('commit data extracted from repository')

        self.compress_file('data/commits.json')
Example #5
    def __init__(self, repo_dir: str) -> None:
        if not os.path.exists(repo_dir):
            repository.clone(repo_dir)
        else:
            repository.pull(repo_dir, "mozilla-central", "tip")

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example #6
File: boot.py Project: ahal/bugbug
def boot_worker():
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get("BUGBUG_REPO_DIR",
                              os.path.join(tempfile.gettempdir(), "bugbug-hg"))
    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB,
                                        support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
Example #7
    def update_commit_db(self):
        repository.clone(self.repo_dir, update=True)

        assert db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        repository.download_commits(self.repo_dir,
                                    rev_start="children({})".format(
                                        commit["node"]))
Example #8
    def classify(
        self,
        revision=None,
        phabricator_deployment=None,
        diff_id=None,
        runnable_jobs_path=None,
    ):
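        # A raw revision and a Phabricator (deployment, diff_id) pair are
        # mutually exclusive ways of selecting what to classify.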
        if revision is not None:
            assert phabricator_deployment is None
            assert diff_id is None

        if diff_id is not None:
            assert phabricator_deployment is not None
            assert revision is None

        self.update_commit_db()

        if phabricator_deployment is not None and diff_id is not None:
            with hglib.open(self.repo_dir) as hg:
                self.apply_phab(hg, phabricator_deployment, diff_id)

                revision = hg.log(
                    revrange="not public()")[0].node.decode("utf-8")

            commits = repository.download_commits(
                self.repo_dir,
                rev_start=revision,
                save=False,
                use_single_process=self.use_single_process,
            )
        else:
            commits = []

            for commit in repository.get_commits():
                if commit["node"] == revision:
                    commits.append(commit)
                    break

            # The commit to analyze was not in our DB, let's mine it.
            if len(commits) == 0:
                commits = repository.download_commits(
                    self.repo_dir,
                    revs=[revision],
                    save=False,
                    use_single_process=self.use_single_process,
                )

        assert len(commits) > 0, "There are no commits to analyze"

        if not self.use_test_history:
            self.classify_regressor(commits)
        else:
            self.classify_test_select(commits, runnable_jobs_path)
Example #9
File: boot.py Project: mvkski/bugbug
def boot_worker():
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")
    assert (
        db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_GROUP_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        repository.download_commits(REPO_DIR, rev_start)

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
Example #10
    def update_commit_db(self):
        repository.clone(self.repo_dir)

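        # Force a fresh download when the local commits DB is missing or its
        # schema version is outdated.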
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        repository.download_commits(self.repo_dir, rev_start)
Example #11
    def retrieve_commits(self):
        shared_dir = self.repo_dir + "-shared"
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

        # Pull and update, to make sure the pushlog is generated.
        hg = hglib.open(self.repo_dir)
        hg.pull(update=True)
        hg.close()

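        # Reuse the existing commits DB only if its schema version is still
        # current; otherwise mine the repository from scratch (rev 0).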
        db.download_version(repository.COMMITS_DB)
        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
Example #12
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False)

        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
        # This way, we can see the features which are most important in differentiating
        # the current commit from the "clean" (or "dirty") commits.

        if not self.use_test_history:
            probs, importance = self.model.classify(
                commits[-1],
                probabilities=True,
                importances=True,
                background_dataset=lambda v: self.X[self.y != v],
                importance_cutoff=0.05,
            )

            self.generate_feature_importance_data(probs, importance)

            with open("probs.json", "w") as f:
                json.dump(probs[0].tolist(), f)

            if self.model_name == "regressor" and self.method_defect_predictor_dir:
                self.classify_methods()
        else:
            # TODO: Should we consider a merge of the commits of the stack?
            commit = commits[-1]

            push_num = self.past_failures_data["push_num"]

            # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
            # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
            all_tasks = self.past_failures_data["all_tasks"]

            selected_tasks = []
            # TODO: Classify multiple commit/test at the same time.
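            # generate_data pairs the commit with each candidate task; only
            # "test-" tasks are kept and classified one at a time.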
            for data in test_scheduling.generate_data(self.past_failures_data,
                                                      commit, push_num,
                                                      all_tasks, [], []):
                if not data["name"].startswith("test-"):
                    continue

                commit["test_job"] = data

                probs = self.model.classify(commit, probabilities=True)

                if probs[0][1] > 0.9:
                    selected_tasks.append(data["name"])

            with open("selected_tasks", "w") as f:
                f.writelines(f"{selected_task}\n"
                             for selected_task in selected_tasks)
Example #13
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir,
                rev_start=patch_rev.decode("utf-8"),
                ret=True,
                save=False)

        probs, importance = self.model.classify(commits[-1],
                                                probabilities=True,
                                                importances=True)

        feature_names = self.model.get_human_readable_feature_names()

        features = []
        for i, (val, feature_index,
                is_positive) in enumerate(importance["importances"]):
            features.append([
                i + 1,
                feature_names[int(feature_index)],
                f'({"+" if (is_positive) else "-"}{val})',
            ])

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        with open("importance.html", "w") as f:
            f.write(importance["html"])
Example #14
    def get_bugs(self, date="today", bug_ids=[]):
        self.query_url = ""

        # Ignore already analyzed commits.
        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"

        commits = repository.download_commits(self.repo_dir,
                                              rev_start,
                                              ret=True)

        commits = [
            commit for commit in commits if not commit["ever_backedout"]
        ]

        probs = self.model.classify(commits, True)
        indexes = probs.argmax(axis=-1)

        result = {}
        for commit, prob, index in zip(commits, probs, indexes):
            result[commit["node"]] = {
                "id": commit["node"],
                "summary": commit["desc"].split("\n", 1)[0],
                "result": "Risky" if prob[1] > 0.5 else "Not risky",
                "confidence": nice_round(prob[index]),
            }

        return result
Example #15
    def classify(
        self,
        revision=None,
        phabricator_deployment=None,
        diff_id=None,
        runnable_jobs_path=None,
    ):
        if revision is not None:
            assert phabricator_deployment is None
            assert diff_id is None

        if diff_id is not None:
            assert phabricator_deployment is not None
            assert revision is None

        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            if phabricator_deployment is not None and diff_id is not None:
                self.apply_phab(hg, phabricator_deployment, diff_id)

                revision = hg.log(
                    revrange="not public()")[0].node.decode("utf-8")

            # Analyze patch.
            commits = repository.download_commits(self.repo_dir,
                                                  rev_start=revision,
                                                  save=False)

        if not self.use_test_history:
            self.classify_regressor(commits)
        else:
            self.classify_test_select(commits, runnable_jobs_path)
Example #16
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        repository.download_commits(self.repo_dir, rev_start=rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress(repository.COMMITS_DB)
        create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
Example #17
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress("data/commits.json")
        zstd_compress("data/commit_experiences.pickle")
Example #18
    def retrieve_commits(self):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
Example #19
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (
                parse_risk_band(risk_band)
                for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
            ),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor"))
        )

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )
Example #20
    def retrieve_commits(self):
        shared_dir = self.repo_dir + "-shared"
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

        # Pull and update, to make sure the pushlog is generated.
        hg = hglib.open(self.repo_dir)
        hg.pull(update=True)
        hg.close()

        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6
        )
        repository.download_commits(self.repo_dir, two_years_and_six_months_ago)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
Example #21
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB) and not limit:
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            if limit:
                # Mercurial revset supports negative integers starting from tip
                rev_start = -1 * limit
            else:
                rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress("data/commits.json")
        zstd_compress("data/commit_experiences.pickle")
Example #22
def schedule_tests(branch, rev):
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.debug(f"Processing {job}")

    # Load the full stack of patches leading to that revision
    try:
        stack = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    # Apply the stack on the local repository
    try:
        revs = repository.apply_stack(REPO_DIR, stack, branch)
    except Exception as e:
        LOGGER.warning(f"Failed to apply stack {branch} @ {rev}: {e}")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.3))

    # Analyze patches.
    commits = repository.download_commits(REPO_DIR,
                                          revs=revs,
                                          save=False,
                                          use_single_process=True)

    tasks = MODEL_CACHE.get("testlabelselect").select_tests(
        commits, test_selection_threshold)

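    # reduce() is expected to shrink the high-confidence selection (>= 0.7)
    # into a smaller set of tasks that should give equivalent coverage.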
    reduced = MODEL_CACHE.get("testlabelselect").reduce(
        set(t for t, c in tasks.items() if c >= 0.7), 1.0)

    data = {
        "tasks": tasks,
        "groups": MODEL_CACHE.get("testgroupselect").select_tests(
            commits, test_selection_threshold),
        "reduced_tasks": {t: c for t, c in tasks.items() if t in reduced},
    }
    setkey(job.result_key, orjson.dumps(data))

    return "OK"
Example #23
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False)

        probs, importance = self.model.classify(
            commits[-1],
            probabilities=True,
            importances=True,
            background_dataset=self.background_dataset,
        )

        features = []
        for i, (val, feature_index, is_positive) in enumerate(
                importance["importances"]["classes"][1][0]):
            features.append([
                i + 1,
                importance["feature_legend"][str(i + 1)],
                f'{"+" if (is_positive) else "-"}{val}',
            ])

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        with open("importances.json", "w") as f:
            json.dump(features, f)

        with open("importance.html", "w") as f:
            f.write(importance["html"])
Example #24
def test_download_commits(fake_hg_repo):
    hg, local, remote = fake_hg_repo

    responses.add(
        responses.HEAD,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        status=200,
        headers={"ETag": "123"},
    )

    responses.add(
        responses.GET,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        status=200,
        json={
            "file1": ["Firefox", "Menus"],
            "file2": ["Firefox", "General"],
            "file3": ["Core", "General"],
        },
    )

    # Remove the mock DB generated by the mock_data fixture.
    os.remove("data/commits.json")

    with open(os.path.join(local, ".hg-annotate-ignore-revs"), "w") as f:
        f.write("not_existing_hash\n")

    add_file(hg, local, "file1", "1\n2\n3\n4\n5\n6\n7\n")
    commit(hg, date=datetime(1991, 4, 16, tzinfo=timezone.utc))
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local)
    assert len(commits) == 0
    commits = list(repository.get_commits())
    assert len(commits) == 0

    # Wait one second, to have a different pushdate.
    time.sleep(1)

    add_file(hg, local, "file2", "1\n2\n3\n4\n5\n6\n7\n")
    revision2 = commit(hg, "Bug 123 - Prova. r=moz,rev2")
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local)
    assert len(commits) == 1
    commits = list(repository.get_commits())
    assert len(commits) == 1
    assert commits[0]["node"] == revision2
    assert commits[0]["touched_prev_total_author_sum"] == 0
    assert commits[0]["seniority_author"] > 0

    # Wait one second, to have a different pushdate.
    time.sleep(1)

    add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n")
    revision3 = commit(hg, "Bug 456 - Prova. r=moz")
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local, revision3)
    assert len(commits) == 1
    commits = list(repository.get_commits())
    assert len(commits) == 2
    assert commits[0]["node"] == revision2
    assert commits[0]["touched_prev_total_author_sum"] == 0
    assert commits[0]["seniority_author"] > 0
    assert commits[1]["node"] == revision3
    assert commits[1]["touched_prev_total_author_sum"] == 1
    assert commits[1]["seniority_author"] > commits[0]["seniority_author"]

    os.remove("data/commits.json")
    os.remove("data/commit_experiences.pickle")
    commits = repository.download_commits(local, f"children({revision2})")
    assert len(commits) == 1
    assert len(list(repository.get_commits())) == 1

    os.remove("data/commits.json")
    os.remove("data/commit_experiences.pickle")
    commits = repository.download_commits(local)
    assert len(list(repository.get_commits())) == 2
Example #25
def schedule_tests(branch: str, rev: str) -> str:
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.info(f"Processing {job}...")

    # Pull the revision to the local repository
    LOGGER.info("Pulling commits from the remote repository...")
    repository.pull(REPO_DIR, branch, rev)

    # Load the full stack of patches leading to that revision
    LOGGER.info("Loading commits to analyze using automationrelevance...")
    try:
        revs = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.5)
    )

    # Analyze patches.
    commits = repository.download_commits(
        REPO_DIR, revs=revs, save=False, use_single_process=True, include_no_bug=True
    )

    if len(commits) > 0:
        testlabelselect_model = MODEL_CACHE.get("testlabelselect")
        testgroupselect_model = MODEL_CACHE.get("testgroupselect")

        tasks = testlabelselect_model.select_tests(commits, test_selection_threshold)

        reduced = testlabelselect_model.reduce(
            set(t for t, c in tasks.items() if c >= 0.8), 1.0
        )

        reduced_higher = testlabelselect_model.reduce(
            set(t for t, c in tasks.items() if c >= 0.9), 1.0
        )

        groups = testgroupselect_model.select_tests(commits, test_selection_threshold)

        config_groups = testgroupselect_model.select_configs(groups.keys(), 1.0)
    else:
        tasks = {}
        reduced = {}
        groups = {}
        config_groups = {}

    data = {
        "tasks": tasks,
        "groups": groups,
        "config_groups": config_groups,
        "reduced_tasks": {t: c for t, c in tasks.items() if t in reduced},
        "reduced_tasks_higher": {t: c for t, c in tasks.items() if t in reduced_higher},
        "known_tasks": get_known_tasks(),
    }
    setkey(job.result_key, orjson.dumps(data), compress=True)

    return "OK"
Example #26
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (parse_risk_band(risk_band)
             for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor")))

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))

        self.path_to_component = repository.get_component_mapping()

        self.past_regressions_by = {}
        self.past_fixed_bugs_by = {}
        self.past_regression_blocked_bugs_by = {}
        self.past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            self.past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
            self.past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
            self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
            self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
Example #27
def boot_worker():
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR,
                         "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB))
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together():
        try:
            utils.extract_file(
                os.path.join("data",
                             test_scheduling.FAILING_TOGETHER_LABEL_DB))
            logger.info("Failing together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB))
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB))
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits():
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            logger.info(
                "Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences():
        try:
            utils.extract_file(
                os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )
            assert ALLOW_MISSING_MODELS

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks():
        # Store in a file the list of tasks in the latest autoland push.
        r = requests.get(
            "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.latest.taskgraph.decision/artifacts/public/target-tasks.json"
        )
        r.raise_for_status()
        with open("known_tasks", "w") as f:
            f.write("\n".join(r.json()))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        retrieve_schedulable_tasks_future = executor.submit(
            retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait for the repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            rev_start = "children({})".format(commit["node"])
            logger.info("Updating commits DB...")
            commits = repository.download_commits(REPO_DIR,
                                                  rev_start,
                                                  use_single_process=True)
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait for the list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")
Example #28
def schedule_tests(branch, rev):
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.debug(f"Processing {job}")

    # Load the full stack of patches leading to that revision
    try:
        stack = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    # Apply the stack on the local repository
    try:
        revs = repository.apply_stack(REPO_DIR, stack, branch)
    except Exception as e:
        LOGGER.warning(f"Failed to apply stack {branch} @ {rev}: {e}")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.3))

    # Analyze patches.
    commits = repository.download_commits(REPO_DIR,
                                          revs=revs,
                                          save=False,
                                          use_single_process=True)

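    # Collapse the whole stack of commits into a single aggregated set of
    # commit features before test selection.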
    commit_data = commit_features.merge_commits(commits)

    def get_runnables(granularity):
        past_failures_data = test_scheduling.get_past_failures(granularity)

        push_num = past_failures_data["push_num"]
        all_runnables = past_failures_data["all_runnables"]

        commit_tests = []
        for data in test_scheduling.generate_data(past_failures_data,
                                                  commit_data, push_num,
                                                  all_runnables, [], []):
            if granularity == "label" and not data["name"].startswith("test-"):
                continue

            commit_test = commit_data.copy()
            commit_test["test_job"] = data
            commit_tests.append(commit_test)

        probs = MODEL_CACHE.get(f"test{granularity}select").classify(
            commit_tests, probabilities=True)
        selected_indexes = np.argwhere(
            probs[:, 1] > test_selection_threshold)[:, 0]
        return {
            commit_tests[i]["test_job"]["name"]: math.floor(probs[i, 1] * 100) / 100
            for i in selected_indexes
        }

    data = {
        "tasks": get_runnables("label"),
        "groups": get_runnables("group"),
    }
    setkey(job.result_key, orjson.dumps(data))

    return "OK"
Example #29
def boot_worker():
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR,
                         "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_LABEL_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS

        logger.info("Label-level past failures DB extracted.")

    def extract_past_failures_group():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_GROUP_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS

        logger.info("Group-level past failures DB extracted.")

    def extract_touched_together():
        try:
            utils.extract_file(test_scheduling.TOUCHED_TOGETHER_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS

        logger.info("Touched together DB extracted.")

    def extract_commits():
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            return False

        logger.info("Commits DB extracted.")
        return True

    def extract_commit_experiences():
        try:
            utils.extract_file(repository.COMMIT_EXPERIENCES_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS

        logger.info("Commit experiences DB extracted.")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait for the repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            rev_start = "children({})".format(commit["node"])
            logger.info("Updating commits DB...")
            commits = repository.download_commits(REPO_DIR,
                                                  rev_start,
                                                  use_single_process=True)
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

    logger.info("Worker boot done")
Example #30
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False)

        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
        # This way, we can see the features which are most important in differentiating
        # the current commit from the "clean" (or "dirty") commits.

        probs, importance = self.model.classify(
            commits[-1],
            probabilities=True,
            importances=True,
            background_dataset=lambda v: self.X[self.y != v],
            importance_cutoff=0.05,
        )

        pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]

        features = []
        for i, (val, feature_index, is_positive) in enumerate(
                importance["importances"]["classes"][pred_class][0]):
            value = importance["importances"]["values"][0, int(feature_index)]

            X = self.X[:, int(feature_index)]
            y = self.y[X != 0]
            X = X[X != 0]
            spearman = spearmanr(X, y)

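            # Compare this feature's distribution between bug-introducing
            # (y == 1) and clean (y == 0) commits in the training data.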
            buggy_X = X[y == 1]
            clean_X = X[y == 0]
            median = np.median(X)
            median_clean = np.median(clean_X)
            median_buggy = np.median(buggy_X)

            perc_buggy_values_higher_than_median = (
                buggy_X >= median).sum() / buggy_X.shape[0]
            perc_buggy_values_lower_than_median = (
                buggy_X < median).sum() / buggy_X.shape[0]
            perc_clean_values_higher_than_median = (
                clean_X > median).sum() / clean_X.shape[0]
            perc_clean_values_lower_than_median = (
                clean_X <= median).sum() / clean_X.shape[0]

            logger.info("Feature: {}".format(
                importance["feature_legend"][str(i + 1)]))
            logger.info("Shap value: {}{}".format(
                "+" if (is_positive) else "-", val))
            logger.info(f"spearman:  {spearman}")
            logger.info(f"value: {value}")
            logger.info(f"overall mean: {np.mean(X)}")
            logger.info(f"overall median: {np.median(X)}")
            logger.info(f"mean for y == 0: {np.mean(clean_X)}")
            logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
            logger.info(f"median for y == 0: {np.median(clean_X)}")
            logger.info(f"median for y == 1: {np.median(buggy_X)}")
            logger.info(
                f"perc_buggy_values_higher_than_median: {perc_buggy_values_higher_than_median}"
            )
            logger.info(
                f"perc_buggy_values_lower_than_median: {perc_buggy_values_lower_than_median}"
            )
            logger.info(
                f"perc_clean_values_higher_than_median: {perc_clean_values_higher_than_median}"
            )
            logger.info(
                f"perc_clean_values_lower_than_median: {perc_clean_values_lower_than_median}"
            )

            features.append({
                "index": i + 1,
                "name": importance["feature_legend"][str(i + 1)],
                "shap": float(f'{"+" if (is_positive) else "-"}{val}'),
                "value": importance["importances"]["values"][0, int(feature_index)],
                "spearman": spearman,
                "median": median,
                "median_bug_introducing": median_buggy,
                "median_clean": median_clean,
                "perc_buggy_values_higher_than_median": perc_buggy_values_higher_than_median,
                "perc_buggy_values_lower_than_median": perc_buggy_values_lower_than_median,
                "perc_clean_values_higher_than_median": perc_clean_values_higher_than_median,
                "perc_clean_values_lower_than_median": perc_clean_values_lower_than_median,
            })

        # Group together features that are very similar to each other, so we can simplify the explanation
        # to users.
        attributes = ["Total", "Maximum", "Minimum", "Average"]
        already_added = set()
        feature_groups = []
        for i1, f1 in enumerate(features):
            if i1 in already_added:
                continue

            feature_groups.append([f1])

            for j, f2 in enumerate(features[i1 + 1:]):
                i2 = j + i1 + 1

                f1_name = f1["name"]
                for attribute in attributes:
                    if f1_name.startswith(attribute):
                        f1_name = f1_name[len(attribute) + 1:]
                        break

                f2_name = f2["name"]
                for attribute in attributes:
                    if f2_name.startswith(attribute):
                        f2_name = f2_name[len(attribute) + 1:]
                        break

                if f1_name != f2_name:
                    continue

                already_added.add(i2)
                feature_groups[-1].append(f2)

        # Pick a representative example from each group.
        features = []
        for feature_group in feature_groups:
            shap = sum(f["shap"] for f in feature_group)

            # Only select easily explainable features from the group.
            selected = [
                f for f in feature_group
                if (f["shap"] > 0
                    and abs(f["value"] - f["median_bug_introducing"])
                    < abs(f["value"] - f["median_clean"]))
                or (f["shap"] < 0
                    and abs(f["value"] - f["median_clean"])
                    < abs(f["value"] - f["median_bug_introducing"]))
            ]

            # If there are no easily explainable features in the group, select all features of the group.
            if len(selected) == 0:
                selected = feature_group

            def feature_sort_key(f):
                if f["shap"] > 0 and f["spearman"][0] > 0:
                    return f["perc_buggy_values_higher_than_median"]
                elif f["shap"] > 0 and f["spearman"][0] < 0:
                    return f["perc_buggy_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] > 0:
                    return f["perc_clean_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] < 0:
                    return f["perc_clean_values_higher_than_median"]

            feature = max(selected, key=feature_sort_key)
            feature["shap"] = shap

            for attribute in attributes:
                if feature["name"].startswith(attribute):
                    feature["name"] = feature["name"][len(attribute) +
                                                      1:].capitalize()
                    break

            features.append(feature)

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        with open("importances.json", "w") as f:
            json.dump(features, f)

        # Get commit hash from 4 months before the analysis time.
        # The method-level analyzer needs 4 months of history.
        four_months_ago = datetime.utcnow() - relativedelta(months=4)
        p = subprocess.run(
            [
                "git",
                "rev-list",
                "-n",
                "1",
                "--until={}".format(four_months_ago.strftime("%Y-%m-%d")),
                "HEAD",
            ],
            check=True,
            capture_output=True,
            cwd=self.git_repo_dir,
        )

        stop_hash = p.stdout.decode().strip()

        # Run the method-level analyzer.
        subprocess.run(
            [
                "python3",
                "tester.py",
                "--repo",
                self.git_repo_dir,
                "--start",
                "HEAD",
                "--stop",
                stop_hash,
                "--output",
                os.path.abspath("method_level.csv"),
            ],
            check=True,
            cwd=self.method_defect_predictor_dir,
        )

        method_level_results = []
        try:
            with open("method_level.csv", "r") as f:
                reader = csv.DictReader(f)
                for item in reader:
                    method_level_results.append(item)
        except FileNotFoundError:
            # No methods were classified.
            pass

        with open("method_level.json", "w") as f:
            json.dump(method_level_results, f)
Example #31
File: boot.py Project: rock420/bugbug
def boot_worker() -> None:
    # Clone autoland
    def clone_autoland() -> None:
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB)
            )
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
            )
            logger.info("Failing together label DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together label DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_config_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB)
            )
            logger.info("Failing together config/group DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together config/group DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB)
            )
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB)
            )
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits() -> bool:
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            logger.info("Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences() -> None:
        try:
            utils.extract_file(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks() -> None:
        r = requests.get(
            "https://hg.mozilla.org/integration/autoland/json-pushes?version=2&tipsonly=1"
        )
        r.raise_for_status()
        revs = [
            push_obj["changesets"][0]
            for push_id, push_obj in r.json()["pushes"].items()
        ]

        logger.info(f"Retrieving known tasks from {revs}")

        # Store in a file the list of tasks in the latest autoland pushes.
        # We use more than one to protect ourselves from broken decision tasks.
        known_tasks = set()
        for rev in revs:
            r = requests.get(
                f"https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.revision.{rev}.taskgraph.decision/artifacts/public/target-tasks.json"
            )
            if r.ok:
                known_tasks.update(r.json())

        logger.info(f"Retrieved {len(known_tasks)} tasks")

        assert len(known_tasks) > 0

        with open("known_tasks", "w") as f:
            f.write("\n".join(known_tasks))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        retrieve_schedulable_tasks_future = executor.submit(retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together_label()
        extract_failing_together_config_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            nodes = collections.deque(
                (commit["node"] for commit in repository.get_commits()), maxlen=4096
            )
            nodes.reverse()
            logger.info("All commits browsed.")

            # Wait for the repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            with hglib.open(REPO_DIR) as hg:
                # Try using nodes backwards, in case we have some node that was on central at the time
                # we mined commits, but is not yet on autoland.
                for node in nodes:
                    try:
                        revs = repository.get_revs(hg, rev_start=f"children({node})")
                        break
                    except hglib.error.CommandError as e:
                        if b"abort: unknown revision" not in e.err:
                            raise

            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, revs=revs, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait for the list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")