def download_models():
    for model_name in MODELS_NAMES:
        utils.download_model(model_name)

        # Try loading the model
        get_model(model_name)

    db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
        extract=False,
    )
    db.download_support_file(
        repository.COMMITS_DB, repository.COMMIT_EXPERIENCES_DB, extract=False
    )
    db.download(repository.COMMITS_DB, extract=False)
def classify_bugs(model_name: str, classifier: str, bug_id: int) -> None:
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")

        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"

        print(f"{pred_class} {probability}")
        input()
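# A minimal usage sketch for classify_bugs (hypothetical invocation; the
# project's real CLI wiring and the bug id used here are assumptions):
# classify a single bug with the "defect" model and default classifier type.
classify_bugs("defect", "default", 1234567)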
def classify_issues(
    owner: str, repo: str, retrieve_events: bool, model_name: str, issue_number: int
) -> None:
    model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")

        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if issue_number:
        issues = iter(
            [github.fetch_issue_by_number(owner, repo, issue_number, retrieve_events)]
        )
        assert issues, f"An issue with a number of {issue_number} was not found"
    else:
        assert db.download(github.GITHUB_ISSUES_DB)
        issues = github.get_issues()

    for issue in issues:
        print(f'{issue["url"]} - {issue["title"]} ')

        if model.calculate_importance:
            probas, importance = model.classify(
                issue, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(issue, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"

        print(f"{pred_class} {probability}")
        input()
def download_models():
    for model_name in MODELS_NAMES:
        utils.download_model(model_name)

        # Try loading the model
        try:
            m = MODEL_CACHE.get(model_name)
            m.download_eval_dbs(extract=False, ensure_exist=not ALLOW_MISSING_MODELS)
        except FileNotFoundError:
            if ALLOW_MISSING_MODELS:
                LOGGER.info(
                    "Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
                    % model_name
                )
                # Skip this model but keep downloading the others.
                continue
            else:
                raise
def go(self, model_name: str) -> None:
    # Load the model
    model = Model.load(download_model(model_name))

    # Then call the check method of the model
    success = model.check()

    if not success:
        msg = f"Check of model {model.__class__!r} failed, check the output for reasons why"
        logger.warning(msg)
        sys.exit(1)
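# Hypothetical usage sketch: `go` above is assumed to be a method of a small
# checker class (the ModelChecker name is an assumption, not confirmed by the
# source):
ModelChecker().go("defect")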
def download_models():
    for model_name in MODELS_NAMES:
        utils.download_model(model_name)

        # Try loading the model
        try:
            MODEL_CACHE.get(model_name)
        except FileNotFoundError:
            if ALLOW_MISSING_MODELS:
                LOGGER.info(
                    "Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
                    % model_name
                )
                # Skip this model but keep downloading the others and the
                # support files below.
                continue
            else:
                raise

    db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
        extract=False,
    )
    db.download_support_file(
        repository.COMMITS_DB, repository.COMMIT_EXPERIENCES_DB, extract=False
    )
    db.download(repository.COMMITS_DB, extract=False)
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit already in the DB, then download its children.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
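# `parse_risk_band` is used above but not shown; a minimal sketch, assuming
# each REGRESSOR_RISK_BANDS entry has the form "NAME-START-END" (e.g.
# "LOW-0.0-0.5") -- the real helper may differ:
def parse_risk_band(risk_band: str) -> tuple[str, float, float]:
    name, start, end = risk_band.split("-")
    return name, float(start), float(end)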
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit already in the DB, then download its children.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
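# `_download_past_bugs` is referenced above but not shown; a minimal sketch,
# assuming it fetches a zstd-compressed JSON artifact and returns the parsed
# dict (the actual helper may differ):
def _download_past_bugs(url: str) -> dict:
    path = os.path.join("data", os.path.basename(url)[: -len(".zst")])
    download_check_etag(url, path=f"{path}.zst")
    zstd_decompress(path)
    with open(path, "r") as f:
        return json.load(f)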
def find_bug_fixing_commits(self) -> None:
    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading previous classifications...")
    db.download(BUG_FIXING_COMMITS_DB)

    logger.info("Getting previously classified commits...")
    prev_bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"]
        for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
    )
    logger.info(
        f"Already classified {len(prev_bug_fixing_commits_nodes)} commits..."
    )

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    defect_model = cast(
        DefectEnhancementTaskModel,
        DefectEnhancementTaskModel.load(download_model("defectenhancementtask")),
    )

    logger.info("Downloading regression model...")
    regression_model = cast(
        RegressionModel, RegressionModel.load(download_model("regression"))
    )

    start_date = datetime.now() - RELATIVE_START_DATE
    end_date = datetime.now() - RELATIVE_END_DATE
    logger.info(
        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
    )
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if commit_date < start_date or commit_date > end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
    )
    assert len(commit_map) > 0

    def get_relevant_bugs() -> Iterator[dict]:
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    bug_count = sum(1 for bug in get_relevant_bugs())
    logger.info(
        f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
    )

    known_defect_labels, _ = defect_model.get_labels()
    known_regression_labels, _ = regression_model.get_labels()

    bug_fixing_commits = []

    def append_bug_fixing_commits(bug_id: int, type_: str) -> None:
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        # Ignore bugs which are not linked to the commits we care about.
        if bug["id"] not in commit_map:
            continue

        # If we know the label already, we don't need to apply the model.
        if (
            bug["id"] in known_regression_labels
            and known_regression_labels[bug["id"]] == 1
        ):
            append_bug_fixing_commits(bug["id"], "r")
            continue

        if bug["id"] in known_defect_labels:
            if known_defect_labels[bug["id"]] == "defect":
                append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")
            continue

        if defect_model.classify(bug)[0] == "defect":
            if regression_model.classify(bug)[0] == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")
        else:
            append_bug_fixing_commits(bug["id"], "e")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    zstd_compress(BUG_FIXING_COMMITS_DB)
    db.upload(BUG_FIXING_COMMITS_DB)
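# RELATIVE_START_DATE / RELATIVE_END_DATE above are assumed to be dateutil
# relativedelta offsets subtracted from "now" to bound the commit window;
# illustrative values only (assumptions, not the project's configuration):
from dateutil.relativedelta import relativedelta

RELATIVE_START_DATE = relativedelta(years=2, months=6)  # oldest commits considered
RELATIVE_END_DATE = relativedelta(days=3)  # ignore the most recent pushes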
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        with open(model_data_X_path, "rb") as fb:
            self.X = to_array(pickle.load(fb))

        with open(model_data_y_path, "rb") as fb:
            self.y = to_array(pickle.load(fb))

        past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "r") as f:
            self.past_bugs_by_function = json.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        self.testfailure_model = cast(
            TestFailureModel, TestFailureModel.load(download_model("testfailure"))
        )
        assert self.testfailure_model is not None
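# `to_array` is used above but not shown; a minimal sketch, assuming it
# densifies scipy sparse matrices and passes everything else through
# unchanged (the actual helper may differ):
import scipy.sparse


def to_array(val):
    if isinstance(val, scipy.sparse.csr_matrix):
        return val.toarray()
    return val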
def preload_models():
    for model_name in MODELS_NAMES:
        utils.download_model(model_name)
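# MODELS_NAMES, iterated over in the download/preload functions above, is
# assumed to be an iterable of model identifiers; illustrative values only:
MODELS_NAMES = ["defect", "regression", "testfailure"]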