Exemplo n.º 1
0
    def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
        """Load the requested model and fetch the auxiliary data it needs.

        Args:
            model_name: Name passed to ``download_and_load_model``. The names
                ``"regressor"`` and ``"testlabelselect"`` trigger the extra
                downloads below.
            repo_dir: Stored as ``self.repo_dir``; not otherwise used here.
            git_repo_dir: If truthy, passed to ``self.clone_git_repo`` together
                with the gecko-dev URL.
            method_defect_predictor_dir: If truthy, passed to
                ``self.clone_git_repo`` together with the MethodDefectPredictor
                URL and a pinned revision hash.

        Raises:
            AssertionError: If ``download_and_load_model`` returns ``None``, a
                dataset file is missing after download, or the support file
                download reports failure.
        """
        self.model_name = model_name
        self.repo_dir = repo_dir

        self.model = download_and_load_model(model_name)
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "8cc47f47ffb686a29324435a0151b5fabd37f865",
            )

        # NOTE(review): ``use_test_history`` is only assigned for the two model
        # names handled below; confirm no other model name reaches code that
        # reads the attribute.

        if model_name == "regressor":
            self.use_test_history = False

            # The X and y datasets are handled identically: fetch the ".zst"
            # file, decompress only when download_check_etag reports an update,
            # then make sure the decompressed file is present.
            dataset_paths = {}
            for dataset in ("X", "y"):
                dataset_path = f"{model_name}model_data_{dataset}"
                updated = download_check_etag(
                    URL.format(model_name=model_name, file_name=f"{dataset_path}.zst")
                )
                if updated:
                    zstd_decompress(dataset_path)
                assert os.path.exists(
                    dataset_path
                ), f"Decompressed {dataset} dataset exists"
                dataset_paths[dataset] = dataset_path

            self.X = to_array(joblib.load(dataset_paths["X"]))
            self.y = to_array(joblib.load(dataset_paths["y"]))

            past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
            download_check_etag(
                PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
            )
            zstd_decompress(past_bugs_by_function_path)
            assert os.path.exists(past_bugs_by_function_path)
            # SECURITY: unpickling can execute arbitrary code. This is only
            # safe as long as PAST_BUGS_BY_FUNCTION_URL is a trusted source.
            with open(past_bugs_by_function_path, "rb") as f:
                self.past_bugs_by_function = pickle.load(f)

        if model_name == "testlabelselect":
            self.use_test_history = True

            # Perform the download outside of an ``assert`` statement: asserts
            # are stripped under ``python -O``, which would silently skip the
            # download itself.
            downloaded = db.download_support_file(
                test_scheduling.TEST_LABEL_SCHEDULING_DB,
                test_scheduling.PAST_FAILURES_LABEL_DB,
            )
            if not downloaded:
                raise AssertionError(
                    f"Failed to download support file {test_scheduling.PAST_FAILURES_LABEL_DB}"
                )
            self.past_failures_data = test_scheduling.get_past_failures("label")

            self.testfailure_model = download_and_load_model("testfailure")
            assert self.testfailure_model is not None
Exemplo n.º 2
0
    def __init__(self, repo_dir: str) -> None:
        """Prepare the repository, databases, model and API credentials.

        Clones the repository into ``repo_dir``, downloads the commits,
        revisions, bugs and bug-fixing-commits databases, fetches the commits
        that follow the last one already stored, loads the "regressor" model
        and configures the Bugzilla and Phabricator clients from secrets.

        Args:
            repo_dir: Directory passed to ``repository.clone`` and
                ``repository.download_commits``.

        Raises:
            AssertionError: If any of the database downloads reports failure.
            RuntimeError: If the commits database contains no commit, so there
                is no revision to resume from.
        """
        repository.clone(repo_dir)

        # The downloads are performed outside of ``assert`` statements: asserts
        # are stripped under ``python -O``, which would silently skip the
        # downloads themselves.
        logger.info("Downloading commits database...")
        if not db.download(repository.COMMITS_DB, support_files_too=True):
            raise AssertionError("Failed to download the commits database")

        logger.info("Updating commits DB...")
        # Walk the whole database only to find its last entry.
        last_commit = None
        for last_commit in repository.get_commits():
            pass

        # Previously an empty database surfaced as an UnboundLocalError on the
        # loop variable; fail with an explicit message instead.
        if last_commit is None:
            raise RuntimeError(
                "The commits database is empty, cannot determine where to resume from"
            )

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(last_commit["node"]),
        )

        logger.info("Downloading revisions database...")
        if not db.download(phabricator.REVISIONS_DB):
            raise AssertionError("Failed to download the revisions database")

        logger.info("Downloading bugs database...")
        if not db.download(bugzilla.BUGS_DB):
            raise AssertionError("Failed to download the bugs database")

        logger.info("Download commit classifications...")
        if not db.download(BUG_FIXING_COMMITS_DB):
            raise AssertionError("Failed to download the commit classifications")

        self.regressor_model = download_and_load_model("regressor")

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )
Exemplo n.º 3
0
    def go(self, model_name):
        """Load the named model and run its ``check`` method.

        Logs a warning and exits the process with status 1 when the check
        returns a falsy value; otherwise returns ``None``.
        """
        loaded_model = download_and_load_model(model_name)

        # Nothing more to do when the model's self-check passes.
        if loaded_model.check():
            return

        logger.warning(
            f"Check of model {loaded_model.__class__!r} failed, check the output for reasons why"
        )
        sys.exit(1)
Exemplo n.º 4
0
    def find_bug_fixing_commits(self):
        """Label not-yet-classified commits by the type of their bug and upload the result.

        Commits whose push date falls between ``now - RELATIVE_START_DATE`` and
        ``now - RELATIVE_END_DATE`` and which are not already present in
        BUG_FIXING_COMMITS_DB are grouped by bug ID. Each bug found in the bugs
        database then gets one of three type codes, which is recorded for all
        of its commits:

        * ``"r"``: known regression label equal to 1, or classified as a
          defect by the defect model and as 1 by the regression model;
        * ``"d"``: labelled or classified as "defect" (and not ``"r"``);
        * ``"e"``: everything else.

        The new entries are appended to BUG_FIXING_COMMITS_DB, which is then
        compressed and uploaded.

        Raises:
            AssertionError: If a required database download reports failure or
                no commit falls inside the time window.
        """
        # The required downloads are performed outside of ``assert``
        # statements: asserts are stripped under ``python -O``, which would
        # silently skip the downloads themselves.
        logger.info("Downloading commits database...")
        if not db.download(repository.COMMITS_DB):
            raise AssertionError("Failed to download the commits database")

        logger.info("Downloading bugs database...")
        if not db.download(bugzilla.BUGS_DB):
            raise AssertionError("Failed to download the bugs database")

        logger.info("Download previous classifications...")
        # Unlike the downloads above, the result is not checked here
        # (presumably because no previous classification may exist yet).
        db.download(BUG_FIXING_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = {
            bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits
        }
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        defect_model = download_and_load_model("defectenhancementtask")

        logger.info("Downloading regression model...")
        regression_model = download_and_load_model("regression")

        # Sample the clock once so both bounds are relative to the same instant.
        now = datetime.now()
        start_date = now - RELATIVE_START_DATE
        end_date = now - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        # Maps a bug ID to the nodes of the commits associated with it.
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            # Only bugs linked to the commits we care about. A membership test
            # on a defaultdict does not create the key.
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        # First pass over the bugs only counts them, so the progress bar below
        # has a total without holding all bugs in memory.
        bug_count = sum(1 for _ in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        def get_bug_type(bug):
            # Return the type code ("r", "d" or "e") for a single bug.
            bug_id = bug["id"]

            # If we know the label already, we don't need to apply the model.
            if (bug_id in known_regression_labels
                    and known_regression_labels[bug_id] == 1):
                return "r"

            if bug_id in known_defect_labels:
                return "d" if known_defect_labels[bug_id] == "defect" else "e"

            if defect_model.classify(bug)[0] != "defect":
                return "e"

            return "r" if regression_model.classify(bug)[0] == 1 else "d"

        bug_fixing_commits = []
        # get_relevant_bugs() already restricts the iteration to bugs present
        # in commit_map, so no further filtering is needed in the loop.
        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            type_ = get_bug_type(bug)
            bug_fixing_commits.extend(
                {"rev": node, "type": type_} for node in commit_map[bug["id"]]
            )

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)
        db.upload(BUG_FIXING_COMMITS_DB)
Exemplo n.º 5
0
    def __init__(
        self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir
    ):
        """Load the requested model and fetch the auxiliary data it needs.

        Args:
            model_name: Name passed to ``download_and_load_model``. The names
                ``"regressor"`` and ``"testselect"`` trigger the extra
                downloads below.
            cache_root: Existing directory; ``self.repo_dir`` is set to its
                ``mozilla-central`` sub-path.
            git_repo_dir: If truthy, passed to ``self.clone_git_repo`` together
                with the gecko-dev URL.
            method_defect_predictor_dir: If truthy, passed to
                ``self.clone_git_repo`` together with the MethodDefectPredictor
                URL and a pinned revision hash.

        Raises:
            AssertionError: If ``cache_root`` is not a directory,
                ``download_and_load_model`` returns ``None``, a dataset file is
                missing after download, or the support file download reports
                failure.
        """
        self.model_name = model_name
        self.cache_root = cache_root

        assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")

        self.model = download_and_load_model(model_name)
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
            )

        # NOTE(review): ``use_test_history`` is only assigned for the two model
        # names handled below; confirm no other model name reaches code that
        # reads the attribute.

        if model_name == "regressor":
            self.use_test_history = False

            # The X and y datasets are handled identically: fetch the ".zst"
            # file, decompress only when download_check_etag reports an update,
            # then make sure the decompressed file is present.
            dataset_paths = {}
            for dataset in ("X", "y"):
                dataset_path = f"{model_name}model_data_{dataset}"
                updated = download_check_etag(
                    URL.format(model_name=model_name, file_name=f"{dataset_path}.zst")
                )
                if updated:
                    zstd_decompress(dataset_path)
                assert os.path.exists(
                    dataset_path
                ), f"Decompressed {dataset} dataset exists"
                dataset_paths[dataset] = dataset_path

            self.X = to_array(joblib.load(dataset_paths["X"]))
            self.y = to_array(joblib.load(dataset_paths["y"]))

            past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
            download_check_etag(
                PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
            )
            zstd_decompress(past_bugs_by_function_path)
            assert os.path.exists(past_bugs_by_function_path)
            # SECURITY: unpickling can execute arbitrary code. This is only
            # safe as long as PAST_BUGS_BY_FUNCTION_URL is a trusted source.
            with open(past_bugs_by_function_path, "rb") as f:
                self.past_bugs_by_function = pickle.load(f)

        if model_name == "testselect":
            self.use_test_history = True

            # Perform the download outside of an ``assert`` statement: asserts
            # are stripped under ``python -O``, which would silently skip the
            # download itself.
            downloaded = db.download_support_file(
                test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
            )
            if not downloaded:
                raise AssertionError(
                    f"Failed to download support file {test_scheduling.PAST_FAILURES_DB}"
                )
            self.past_failures_data = test_scheduling.get_past_failures()

            self.testfailure_model = download_and_load_model("testfailure")
            assert self.testfailure_model is not None