def plot_graphs(granularity: str) -> None:
    push_data_db = (
        test_scheduling.PUSH_DATA_GROUP_DB
        if granularity == "group"
        else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
    )
    assert db.download(push_data_db)

    regressions_by_rev = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        push_data_db
    ):
        regressions_by_rev[revisions[0]] = get_regressions(
            granularity, likely_regressions, possible_regressions
        )

    scheduled_data = []
    caught_data = []

    for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB):
        if len(scheduler_stat["schedulers"]) == 0:
            continue

        if scheduler_stat["id"] not in regressions_by_rev:
            continue

        obj: dict[str, Any] = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
        }

        for scheduler in scheduler_stat["schedulers"]:
            obj[scheduler["name"]] = len(get_scheduled(granularity, scheduler))

        scheduled_data.append(obj)

        regressions = regressions_by_rev[scheduler_stat["id"]]

        obj = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
            "regressions": len(regressions),
        }

        for scheduler in scheduler_stat["schedulers"]:
            scheduled = get_scheduled(granularity, scheduler)
            obj[scheduler["name"]] = len(regressions & scheduled)

        caught_data.append(obj)

    scheduled_df = DataFrame(scheduled_data)
    scheduled_df.index = scheduled_df["date"]
    del scheduled_df["date"]

    caught_df = DataFrame(caught_data)
    caught_df.index = caught_df["date"]
    del caught_df["date"]

    df = scheduled_df.resample("W").mean()
    plot_graph(
        df,
        f"Average number of scheduled {granularity}s",
        f"average_{granularity}_scheduled.svg",
    )

    df = (
        caught_df[caught_df.regressions > 0]
        .drop(columns=["regressions"])
        .clip(0, 1)
        .resample("W")
        .mean()
    )
    plot_graph(
        df,
        "Percentage of regressing pushes where we caught at least one regression",
        f"percentage_{granularity}_caught_at_least_one.svg",
    )

    plot_graph(
        caught_df.drop(columns=["regressions"])
        .div(caught_df.regressions, axis=0)
        .resample("W")
        .mean(),
        "Percentage of regressions we caught",
        f"percentage_{granularity}_caught.svg",
    )
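# Hedged sketch (not part of the original script): why the clip(0, 1) +
# resample("W").mean() chain above yields "share of regressing pushes where at
# least one regression was caught". The data below is synthetic and plot_graph is
# not called; we only print the resulting frame.
def _example_weekly_hit_rate() -> None:
    from datetime import datetime

    from pandas import DataFrame

    example_df = DataFrame(
        {
            "date": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 8)],
            "regressions": [3, 2, 1],
            "scheduler_a": [2, 0, 1],  # regressions caught by a hypothetical scheduler
        }
    )
    example_df.index = example_df["date"]
    del example_df["date"]

    # clip(0, 1) turns "how many regressions were caught" into "was anything caught
    # at all" (0/1), so the weekly mean is the fraction of pushes with >= 1 catch.
    weekly_hit_rate = (
        example_df[example_df.regressions > 0]
        .drop(columns=["regressions"])
        .clip(0, 1)
        .resample("W")
        .mean()
    )
    print(weekly_hit_rate)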
def go(months: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    known_scheduler_stats = {
        scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    }
    logger.info(
        f"Already gathered statistics for {len(known_scheduler_stats)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(months=months)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [push for push in pushes if push.rev not in known_scheduler_stats]

    logger.info(f"{len(pushes)} left to analyze")

    def compress_and_upload() -> None:
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = tuple(
            executor.submit(
                analyze_shadow_schedulers,
                push,
            )
            for push in pushes
        )
        del pushes

        def results():
            start_time = time.monotonic()

            try:
                for future in tqdm(
                    concurrent.futures.as_completed(futures),
                    total=len(futures),
                ):
                    try:
                        yield future.result()
                    except Exception:
                        traceback.print_exc()

                    # Upload every 10 minutes.
                    if time.monotonic() - start_time >= 600:
                        compress_and_upload()
                        start_time = time.monotonic()
            except Exception:
                for f in futures:
                    f.cancel()

                raise

        db.append(SHADOW_SCHEDULER_STATS_DB, results())

    compress_and_upload()
def retrieve_push_data(self):
    # Download previous cache.
    db.download(ADR_CACHE_DB)
    self.generate_push_data("label")
    self.generate_push_data("group")
    self.upload_adr_cache()
def main(args): model_file_name = "{}{}model".format( args.goal, "" if args.classifier == "default" else args.classifier) if args.goal == "component": if args.classifier == "default": model_class_name = "component" else: model_class_name = "component_nn" else: model_class_name = args.goal model_class = get_model_class(model_class_name) if args.train: db.download(bugzilla.BUGS_DB) db.download(repository.COMMITS_DB) historical_supported_tasks = [ "defect", "bugtype", "defectenhancementtask", "regression", ] if args.goal in historical_supported_tasks: model = model_class(args.lemmatization, args.historical) elif args.goal == "duplicate": model = model_class(args.training_set_size, args.lemmatization, args.cleanup_urls) else: model = model_class(args.lemmatization) model.train() else: model = model_class.load(model_file_name) if args.classify: for bug in bugzilla.get_bugs(): print( f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} ' ) if model.calculate_importance: probas, importance = model.classify(bug, probabilities=True, importances=True) feature_names = model.get_human_readable_feature_names() model.print_feature_importances(importance["importances"], feature_names, class_probabilities=probas) else: probas = model.classify(bug, probabilities=True, importances=False) if np.argmax(probas) == 1: print(f"Positive! {probas}") else: print(f"Negative! {probas}") input() if args.generate_sheet: assert (args.token is not None ), "A Bugzilla token should be set in order to download bugs" today = datetime.utcnow() a_week_ago = today - timedelta(7) bugzilla.set_token(args.token) bug_ids = bugzilla.get_ids_between(a_week_ago, today) bugs = bugzilla.get(bug_ids) print(f"Classifying {len(bugs)} bugs...") rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]] for bug in bugs.values(): p = model.classify(bug, probabilities=True) rows.append([ f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}', "y" if p[0][1] >= 0.7 else "n", "", bug["summary"], ]) os.makedirs("sheets", exist_ok=True) with open( os.path.join( "sheets", f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv', ), "w", ) as f: writer = csv.writer(f) writer.writerows(rows)
    model_class = TrackingModel
elif args.goal == 'qaneeded':
    from bugbug.models.qaneeded import QANeededModel
    model_class = QANeededModel
elif args.goal == 'uplift':
    from bugbug.models.uplift import UpliftModel
    model_class = UpliftModel
elif args.goal == 'component':
    from bugbug.models.component import ComponentModel
    model_class = ComponentModel
elif args.goal == 'devdocneeded':
    from bugbug.models.devdocneeded import DevDocNeededModel
    model_class = DevDocNeededModel

if args.train:
    db.download()

    model = model_class(args.lemmatization)
    model.train()
else:
    model = model_class.load(model_file_name)

if args.classify:
    for bug in bugzilla.get_bugs():
        print(f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} ')

        probas, importances = model.classify(bug, probabilities=True, importances=True)

        feature_names = model.get_feature_names()
        for i, (index, is_positive, contrib) in enumerate(importances[:20]):
            print(f'{i + 1}. \'{feature_names[index]}\' ({"+" if is_positive else "-"}{contrib})')
def find_bug_fixing_commits(self): logger.info("Downloading commits database...") if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) logger.info("Downloading bugs database...") if db.is_old_version( bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB): db.download(bugzilla.BUGS_DB, force=True) logger.info("Download previous classifications...") if db.is_old_version( BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB): db.download(BUG_FIXING_COMMITS_DB, force=True) logger.info("Get previously classified commits...") prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB)) prev_bug_fixing_commits_nodes = set( bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits) logger.info( f"Already classified {len(prev_bug_fixing_commits)} commits...") # TODO: Switch to the pure Defect model, as it's better in this case. logger.info("Downloading defect/enhancement/task model...") download_model("defectenhancementtask") defect_model = DefectEnhancementTaskModel.load( "defectenhancementtaskmodel") logger.info("Downloading regression model...") download_model("regression") regression_model = RegressionModel.load("regressionmodel") start_date = datetime.now() - RELATIVE_START_DATE end_date = datetime.now() - RELATIVE_END_DATE logger.info( f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..." ) commit_map = defaultdict(list) for commit in repository.get_commits(): if commit["node"] in prev_bug_fixing_commits_nodes: continue commit_date = dateutil.parser.parse(commit["pushdate"]) if commit_date < start_date or commit_date > end_date: continue commit_map[commit["bug_id"]].append(commit["node"]) logger.info( f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits" ) assert len(commit_map) > 0 def get_relevant_bugs(): return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map) bug_count = sum(1 for bug in get_relevant_bugs()) logger.info( f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing" ) known_defect_labels = defect_model.get_labels() known_regression_labels = regression_model.get_labels() bug_fixing_commits = [] def append_bug_fixing_commits(bug_id, type_): for commit in commit_map[bug_id]: bug_fixing_commits.append({"rev": commit, "type": type_}) for bug in tqdm(get_relevant_bugs(), total=bug_count): # Ignore bugs which are not linked to the commits we care about. if bug["id"] not in commit_map: continue # If we know the label already, we don't need to apply the model. if (bug["id"] in known_regression_labels and known_regression_labels[bug["id"]] == 1): append_bug_fixing_commits(bug["id"], "r") continue if bug["id"] in known_defect_labels: if known_defect_labels[bug["id"]] == "defect": append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") continue if defect_model.classify(bug)[0] == "defect": if regression_model.classify(bug)[0] == 1: append_bug_fixing_commits(bug["id"], "r") else: append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits) zstd_compress(BUG_FIXING_COMMITS_DB) bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits return [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["type"] in ["r", "d"] ]
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized): if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB repo_dir = self.tokenized_git_repo_dir else: db_path = BUG_INTRODUCING_COMMITS_DB repo_dir = self.git_repo_dir def git_to_mercurial(rev): if tokenized: return self.tokenized_git_to_mercurial[rev] else: return vcs_map.git_to_mercurial(rev) def mercurial_to_git(rev): if tokenized: return self.mercurial_to_tokenized_git[rev] else: return vcs_map.mercurial_to_git(rev) logger.info("Download previously found bug-introducing commits...") db.download(db_path) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines("{}\n".format(mercurial_to_git(commit["rev"])) for commit in commits_to_ignore if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) def _init(git_repo_dir): thread_local.git = GitRepository(git_repo_dir) def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["rev"])) git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"]) commit = thread_local.git.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: logger.info("Skipping {} as it is too big".format( bug_fixing_commit["rev"])) return None bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines( commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")) logger.info("Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values( ): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": git_to_mercurial(bug_introducing_hash), }) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). if not str(e).startswith( "Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. 
if len(bug_introducing_commits) == 0: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", }) return bug_introducing_commits bug_fixing_commits_queue = bug_fixing_commits.copy() with concurrent.futures.ThreadPoolExecutor(initializer=_init, initargs=(repo_dir, ), max_workers=os.cpu_count() + 1) as executor: def results(): num_analyzed = 0 # Analyze up to 500 commits at a time, to avoid the task running out of time. while len( bug_fixing_commits_queue) != 0 and num_analyzed != 500: bug_introducing_commit_futures = [] for _ in range( min(500 - num_analyzed, len(bug_fixing_commits_queue))): bug_introducing_commit_futures.append( executor.submit(find_bic, bug_fixing_commits_queue.pop())) logger.info( f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits" ) for future in tqdm( concurrent.futures.as_completed( bug_introducing_commit_futures), total=len(bug_introducing_commit_futures), ): result = future.result() if result is not None: num_analyzed += 1 yield from result db.append(db_path, results()) zstd_compress(db_path) return len(bug_fixing_commits_queue) == 0
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    db.download(IGNORED_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    # When we already have some analyzed commits, re-analyze the last 3500 to make sure
    # we didn't miss back-outs that happened since the last analysis.
    if len(prev_commits_to_ignore) > 0:
        first_commit_to_reanalyze = (
            -3500 if len(prev_commits_to_ignore) >= 3500 else 0
        )
        rev_start = "children({})".format(
            prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
        )
    else:
        rev_start = 0

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    # Drop commits which are not yet present in the mercurial <-> git mapping.
    while len(revs) > 0:
        try:
            vcs_map.mercurial_to_git(revs[-1].decode("ascii"))
            break
        except Exception as e:
            if not str(e).startswith("Missing mercurial commit in the VCS map"):
                raise

            revs.pop()

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

    chosen_commits = set()
    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )
            chosen_commits.add(commit.node)

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    for prev_commit in prev_commits_to_ignore[::-1]:
        if prev_commit["rev"] not in chosen_commits:
            commits_to_ignore.append(prev_commit)
            chosen_commits.add(prev_commit["rev"])

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
def generate_test_scheduling_history(self, granularity): # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS[granularity] ) if granularity == "label": push_data_db = test_scheduling.PUSH_DATA_LABEL_DB test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_LABEL_DB ) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_LABEL_DB ) elif granularity == "group": push_data_db = test_scheduling.PUSH_DATA_GROUP_DB test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_GROUP_DB ) touched_together_db = os.path.join( "data", test_scheduling.TOUCHED_TOGETHER_DB ) assert db.download(push_data_db) db.download(test_scheduling_db, support_files_too=True) last_node = None for revs, _ in test_scheduling.get_test_scheduling_history(granularity): last_node = revs[0] def generate_failing_together_probabilities(push_data): # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and # `task2 failure -> task1 failure` separately, as they could be different. count_runs = collections.Counter() count_single_failures = collections.Counter() count_both_failures = collections.Counter() for revisions, tasks, likely_regressions, candidate_regressions in tqdm( push_data ): failures = set(likely_regressions + candidate_regressions) all_tasks = list(set(tasks) | failures) for task1, task2 in itertools.combinations(sorted(all_tasks), 2): count_runs[(task1, task2)] += 1 if task1 in failures: if task2 in failures: count_both_failures[(task1, task2)] += 1 else: count_single_failures[(task1, task2)] += 1 elif task2 in failures: count_single_failures[(task1, task2)] += 1 stats = {} skipped = 0 for couple, run_count in count_runs.most_common(): failure_count = count_both_failures[couple] support = failure_count / run_count if support < 1 / 700: skipped += 1 continue if failure_count != 0: confidence = failure_count / ( count_single_failures[couple] + failure_count ) else: confidence = 0.0 stats[couple] = (support, confidence) logger.info(f"{skipped} couples skipped because their support was too low") logger.info("Redundancies with the highest support and confidence:") for couple, (support, confidence) in sorted( stats.items(), key=lambda k: (-k[1][1], -k[1][0]) )[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." ) logger.info("Redundancies with the highest confidence and lowest support:") for couple, (support, confidence) in sorted( stats.items(), key=lambda k: (-k[1][1], k[1][0]) )[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." 
) failing_together = test_scheduling.get_failing_together_db() count_redundancies = collections.Counter() for couple, (support, confidence) in stats.items(): if confidence == 1.0: count_redundancies["==100%"] += 1 if confidence > 0.9: count_redundancies[">=90%"] += 1 if confidence > 0.8: count_redundancies[">=80%"] += 1 if confidence > 0.7: count_redundancies[">=70%"] += 1 if confidence < 0.7: continue failing_together[ f"{couple[0]}${couple[1]}".encode("utf-8") ] = struct.pack("ff", support, confidence) for percentage, count in count_redundancies.most_common(): logger.info(f"{count} with {percentage} confidence") test_scheduling.close_failing_together_db() def generate_all_data(): past_failures = test_scheduling.get_past_failures(granularity) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data push_data = list(db.read(push_data_db)) logger.info(f"push data nodes: {len(push_data)}") if granularity == "label": push_data = [ ( revisions, rename_tasks(push_tasks), rename_tasks(possible_regressions), rename_tasks(likely_regressions), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] # In the last 14 pushes, we definitely run all possible runnables. all_runnables_set = set( sum((push_runnables for _, push_runnables, _, _ in push_data[-14:]), []) ) # Filter runnables we don't need. all_runnables = filter_runnables( list(all_runnables_set), all_runnables_set, granularity ) all_runnables_set = set(all_runnables_set) logger.info(f"{len(all_runnables_set)} runnables run in the last 14 pushes") push_data = [ ( revisions, filter_runnables(push_tasks, all_runnables_set, granularity), filter_runnables( possible_regressions, all_runnables_set, granularity ), filter_runnables( likely_regressions, all_runnables_set, granularity ), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] if granularity == "label": generate_failing_together_probabilities(push_data) # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False if granularity == "group": update_touched_together_gen = test_scheduling.update_touched_together() next(update_touched_together_gen) for i in tqdm(range(len(push_data))): ( revisions, push_runnables, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map ) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. 
# In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions) ) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity == "group": update_touched_together_gen.send(commits[0]["node"]) result = { "revs": revisions, "data": [], } for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result["data"].append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield result if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling_db, generate_all_data()) zstd_compress(test_scheduling_db) create_tar_zst(past_failures_db) if granularity == "group": create_tar_zst(touched_together_db) if granularity == "label": create_tar_zst(failing_together_db)
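# Hedged sketch (synthetic data, not the real push data): the support/confidence
# computation inside generate_failing_together_probabilities above follows the
# usual association-rule definitions. support = P(both fail | the pair runs
# together); confidence = P(both fail | at least one of the two fails).
def _example_failing_together() -> None:
    import collections
    import itertools

    # Each toy push: (tasks that ran, tasks that failed).
    pushes = [
        (["t1", "t2", "t3"], {"t1", "t2"}),
        (["t1", "t2"], {"t1", "t2"}),
        (["t1", "t2", "t3"], {"t1"}),
        (["t1", "t2", "t3"], set()),
    ]

    count_runs: collections.Counter = collections.Counter()
    count_single_failures: collections.Counter = collections.Counter()
    count_both_failures: collections.Counter = collections.Counter()

    for tasks, failures in pushes:
        for task1, task2 in itertools.combinations(sorted(set(tasks) | failures), 2):
            count_runs[(task1, task2)] += 1
            if task1 in failures and task2 in failures:
                count_both_failures[(task1, task2)] += 1
            elif task1 in failures or task2 in failures:
                count_single_failures[(task1, task2)] += 1

    for couple, run_count in count_runs.items():
        both = count_both_failures[couple]
        support = both / run_count
        confidence = (
            both / (count_single_failures[couple] + both) if both != 0 else 0.0
        )
        # The real code keeps couples with confidence >= 0.7 and stores them in the
        # failing-together DB as struct.pack("ff", support, confidence).
        print(couple, support, confidence)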
def find_bug_introducing_commits(cache_dir, git_repo_dir): mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central") logger.info("Downloading Mercurial <-> git mapping file...") vcs_map.download_mapfile() logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...") repository.clone(mercurial_repo_dir) logger.info(f"Cloning git repository to {git_repo_dir}...") clone_gecko_dev(git_repo_dir) logger.info("Download previously found bug-introducing commits...") db.download_version(BUG_INTRODUCING_COMMITS_DB) if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists( BUG_INTRODUCING_COMMITS_DB ): db.download(BUG_INTRODUCING_COMMITS_DB, force=True) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_mercurial_rev"] for bug_introducing_commit in prev_bug_introducing_commits ) logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...") commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir) git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore) bug_fixing_commits = find_bug_fixing_commits() logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) def _init(git_repo_dir): global GIT_REPO GIT_REPO = GitRepository(git_repo_dir) def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"])) commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"]) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: return [None] bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines( commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore") ) logger.info(bug_introducing_modifications) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values(): for bug_introducing_hash in bug_introducing_hashes: bug_introducing_commits.append( { "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"], "bug_fixing_git_rev": bug_fixing_commit["git_rev"], "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial( bug_introducing_hash ), "bug_introducing_git_rev": bug_introducing_hash, } ) # Add an empty result, just so that we don't reanalyze this again. 
if len(bug_introducing_commits) == 0: bug_introducing_commits.append( { "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"], "bug_fixing_git_rev": bug_fixing_commit["git_rev"], "bug_introducing_mercurial_rev": "", "bug_introducing_git_rev": "", } ) return bug_introducing_commits with concurrent.futures.ThreadPoolExecutor( initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1 ) as executor: bug_introducing_commits = executor.map(find_bic, bug_fixing_commits) bug_introducing_commits = tqdm( bug_introducing_commits, total=len(bug_fixing_commits) ) bug_introducing_commits = list( itertools.chain.from_iterable(bug_introducing_commits) ) total_results_num = len(bug_introducing_commits) bug_introducing_commits = list(filter(None, bug_introducing_commits)) logger.info( f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big" ) db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits) compress_file(BUG_INTRODUCING_COMMITS_DB)
def generate_test_scheduling_history(self): updated = download_check_etag(PUSH_DATA_LABEL_URL) if updated: zstd_decompress("push_data_label.json") assert os.path.exists( "push_data_label.json"), "Decompressed push data file exists" # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) last_node = None for test_data in test_scheduling.get_test_scheduling_history(): last_node = test_data["revs"][0] def generate_all_data(): past_failures = test_scheduling.get_past_failures() push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open("push_data_label.json", "r") as f: push_data = json.load(f)[1:] logger.info(f"push data nodes: {len(push_data)}") push_data = [( revisions, rename_tasks(push_tasks), rename_tasks(possible_regressions), rename_tasks(likely_regressions), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data] # In the last 28 pushes, we definitely run all possible tasks. all_tasks_set = set( sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])) # Filter tasks we don't need. all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set) all_tasks_set = set(all_tasks) logger.info( f"{len(all_tasks_set)} tasks run in the last 28 pushes") # Store all tasks in the past_failures DB so it can be used in the evaluation phase. past_failures["all_tasks"] = all_tasks # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_tasks = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for i in tqdm(range(len(push_data))): ( revisions, push_tasks, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_tasks, we'd generate a huge amount of data. # So we consider only the tasks which run in this push, and the possible and likely regressions # from this push. tasks_to_consider = list( set(push_tasks + possible_regressions + likely_regressions)) tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set) if len(tasks_to_consider) == 0: skipped_no_tasks += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, tasks_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: saved_nodes.add(i) data["revs"] = revisions yield data logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar: tar.add("data/past_failures.lmdb")
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Download commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json" ), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version(repository.COMMITS_DB) or not db.exists( repository.COMMITS_DB ): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) logger.info(f"push data nodes: {len(push_data)}") HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["rev"] else: last_node = None past_failures = shelve.open( "data/past_failures.shelve", protocol=pickle.HIGHEST_PROTOCOL, writeback=True, ) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] key = f"{type_}${task}$" for item in items: full_key = key + item if full_key not in past_failures: cur = past_failures[full_key] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0 ) else: cur = past_failures[full_key] value = cur[push_num] values_total.append(value) values_prev_7.append(value - cur[push_num - 7]) values_prev_14.append(value - cur[push_num - 14]) values_prev_28.append(value - cur[push_num - 28]) values_prev_56.append(value - cur[push_num - 56]) if is_regression: cur[push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num commits_with_data = set() saved_nodes = set() # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if len(commits_with_data) % 1000 == 0: past_failures.sync() if node == last_node: can_start = True continue if not can_start: continue if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue is_regression = ( task in commit_push_data[1] or task in commit_push_data[2] ) total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures( "all", task, ["all"], push_num, is_regression ) total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures( "type", task, commit_data["types"], push_num, is_regression ) total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures( "file", task, commit_data["files"], push_num, is_regression ) total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures( "directory", task, commit_data["directories"], push_num, is_regression, ) total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures( "component", task, commit_data["components"], push_num, is_regression, ) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } # We no longer need the push data for this node, we can free the memory. 
            del push_data[node]
            push_num += 1

        logger.info(f"commits linked to push data: {len(commits_with_data)}")
        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    zstd_compress("data/past_failures.shelve")
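# Hedged sketch: get_and_update_past_failures above leans on ExpQueue, which acts as
# a per-(kind, task, item) cumulative failure counter indexed by push number. The
# class below is a simplified, hypothetical stand-in (NOT bugbug's real ExpQueue;
# maxlen is accepted only to mirror its signature and is ignored here), just to show
# why cur[push_num] - cur[push_num - 7] means "failures in the last 7 pushes".
class _ExampleRollingCounter:
    def __init__(self, push_num: int, maxlen: int, default: int) -> None:
        self.default = default
        self.values: dict = {push_num: default}

    def __getitem__(self, push_num: int) -> int:
        # Fall back to the most recent known value at or before this push.
        known = [p for p in self.values if p <= push_num]
        return self.values[max(known)] if known else self.default

    def __setitem__(self, push_num: int, value: int) -> None:
        self.values[push_num] = value


def _example_rolling_counter() -> None:
    counter = _ExampleRollingCounter(push_num=0, maxlen=57, default=0)
    for push in range(100):
        value = counter[push]
        if push % 10 == 0:  # synthetic failure pattern
            counter[push] = value + 1

    # Failures observed in the last 7 pushes as of push 99.
    print(counter[99] - counter[99 - 7])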
def generate_test_scheduling_history(self, granularity: str) -> None: # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS[granularity] ) if granularity == "label": test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_LABEL_DB ) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_LABEL_DB ) elif granularity == "group": test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_GROUP_DB ) touched_together_db = os.path.join( "data", test_scheduling.TOUCHED_TOGETHER_DB ) elif granularity == "config_group": test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB ) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB ) push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data( granularity ) def generate_all_data() -> Generator[Dict[str, Any], None, None]: past_failures = test_scheduling.get_past_failures(granularity) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 if granularity in ("label", "config_group"): test_scheduling.generate_failing_together_probabilities( granularity, push_data_iter(), push_data_count ) commit_map = {} for commit_data in tqdm(repository.get_commits()): commit_map[commit_data["node"]] = commit_data # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 if granularity in ("group", "config_group"): update_touched_together_gen = test_scheduling.update_touched_together() next(update_touched_together_gen) for ( i, (revisions, push_runnables, possible_regressions, likely_regressions), ) in enumerate(tqdm(push_data_iter(), total=push_data_count)): push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map ) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions) ) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity in ("group", "config_group"): update_touched_together_gen.send(commits[0]["node"]) result_data = [] for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result_data.append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield { "revs": revisions, "data": result_data, } if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling_db, generate_all_data()) zstd_compress(test_scheduling_db) create_tar_zst(past_failures_db) if granularity == "group": create_tar_zst(touched_together_db) if granularity == "label": create_tar_zst(failing_together_db)
def retrieve_test_scheduling_history(self): os.makedirs("data", exist_ok=True) # Download previous cache. cache_path = os.path.abspath("data/adr_cache") if not os.path.exists(cache_path): try: download_check_etag(URL, "adr_cache.tar.xz") with tarfile.open("adr_cache.tar.xz", "r:xz") as tar: tar.extractall() assert os.path.exists( "data/adr_cache"), "Decompressed adr cache exists" except requests.exceptions.HTTPError: logger.info("The adr cache is not available yet") # Setup adr cache configuration. os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True) with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f: f.write(f"""[adr.cache.stores] file = {{ driver = "file", path = "{cache_path}" }} """) # Get the commits DB. if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) # We'll use the past 3 months only for training the model, but we use 6 months to calculate # the failure statistics. subprocess.run( [ "run-adr", "ahal/ci-recipes", "recipe", "-o", os.path.abspath("push_data.json"), "-f", "json", "push_data", "--", "--from", "today-6month", "--to", "today-2day", "--branch", "autoland", ], check=True, stdout=subprocess. DEVNULL, # Redirect to /dev/null, as the logs are too big otherwise. ) HISTORY_DATE_START = datetime.now() - relativedelta(months=3) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) HISTORICAL_TIMESPAN = 56 past_failures = {} def get_past_failures(task, push_num): if task not in past_failures: past_failures[task] = repository.exp_queue( push_num, HISTORICAL_TIMESPAN + 1, 0) return past_failures[task][push_num] def generate_data(): commits_with_data = set() saved_nodes = set() push_num = 0 for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if any(task.startswith(j) for j in JOBS_TO_SKIP): continue total_failures = get_past_failures(task, push_num) past_7_pushes_failures = total_failures - get_past_failures( task, push_num - 7) past_14_pushes_failures = total_failures - get_past_failures( task, push_num - 14) past_28_pushes_failures = total_failures - get_past_failures( task, push_num - 28) past_56_pushes_failures = total_failures - get_past_failures( task, push_num - 56) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } if task in commit_push_data[1] or task in commit_push_data[ 2]: past_failures[task][push_num] = total_failures + 1 push_num += 1 logger.info(f"push data nodes: {len(push_data)}") logger.info( f"commits linked to push data: {len(commits_with_data)}") logger.info(f"saved push data nodes: {len(saved_nodes)}") db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar: tar.add("data/adr_cache")
def find_bug_introducing_commits(self, repo_dir, tokenized): from pydriller import GitRepository from pydriller.domain.commit import ModificationType logger.info("Download commits to ignore...") assert db.download(IGNORED_COMMITS_DB) commits_to_ignore = list(db.read(IGNORED_COMMITS_DB)) logger.info("Download bug-fixing classifications...") assert db.download(BUG_FIXING_COMMITS_DB) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB) if bug_fixing_commit["type"] in ["r", "d"] ] if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB else: db_path = BUG_INTRODUCING_COMMITS_DB def git_to_mercurial(revs): if tokenized: return (self.tokenized_git_to_mercurial[rev] for rev in revs) else: yield from vcs_map.git_to_mercurial(repo_dir, revs) def mercurial_to_git(revs): if tokenized: return (self.mercurial_to_tokenized_git[rev] for rev in revs) else: yield from vcs_map.mercurial_to_git(repo_dir, revs) logger.info("Download previously found bug-introducing commits...") db.download(db_path) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: git_hashes = mercurial_to_git( commit["rev"] for commit in tqdm(commits_to_ignore) if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git) f.writelines("{}\n".format(git_hash) for git_hash in git_hashes) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) git_init_lock = threading.Lock() def _init(git_repo_dir): with git_init_lock: thread_local.git = GitRepository(git_repo_dir) # Call get_head in order to make pydriller initialize the repository. thread_local.git.get_head() def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["rev"])) git_fix_revision = tuple( mercurial_to_git([bug_fixing_commit["rev"]]))[0] commit = thread_local.git.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. 
if len(commit.modifications) > MAX_MODIFICATION_NUMBER: logger.info("Skipping {} as it is too big".format( bug_fixing_commit["rev"])) return None def get_modification_path(mod): path = mod.new_path if (mod.change_type == ModificationType.RENAME or mod.change_type == ModificationType.DELETE): path = mod.old_path return path bug_introducing_modifications = {} for modification in commit.modifications: path = get_modification_path(modification) if path == "testing/web-platform/meta/MANIFEST.json": continue # Don't try to find the bug-introducing commit for modifications # in the bug-fixing commit to non-source code files. if repository.get_type( path) not in repository.SOURCE_CODE_TYPES_TO_EXT: continue bug_introducing_modifications.update( thread_local.git.get_commits_last_modified_lines( commit, modification=modification, hashes_to_ignore_path=os.path.realpath( "git_hashes_to_ignore"), )) logger.info("Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values( ): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": tuple(git_to_mercurial([bug_introducing_hash]))[0], }) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). if not str(e).startswith( "Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. if len(bug_introducing_commits) == 0: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", }) return bug_introducing_commits def compress_and_upload(): zstd_compress(db_path) db.upload(db_path) workers = os.cpu_count() + 1 logger.info( f"Analyzing {len(bug_fixing_commits)} commits using {workers} workers..." ) with concurrent.futures.ThreadPoolExecutor( initializer=_init, initargs=(repo_dir, ), max_workers=workers) as executor: def results(): start_time = time.monotonic() futures = { executor.submit(find_bic, bug_fixing_commit): bug_fixing_commit["rev"] for bug_fixing_commit in bug_fixing_commits } for future in tqdm( concurrent.futures.as_completed(futures), total=len(futures), ): exc = future.exception() if exc is not None: logger.info( f"Exception {exc} while analyzing {futures[future]}" ) for f in futures: f.cancel() result = future.result() if result is not None: yield from result if time.monotonic() - start_time >= 3600: compress_and_upload() start_time = time.monotonic() db.append(db_path, results()) compress_and_upload()
def get_push_data( granularity: str, ) -> Tuple[Callable[[], Iterator[PushResult]], int, Tuple[Runnable, ...]]: if granularity == "label": push_data_db = PUSH_DATA_LABEL_DB elif granularity == "group": push_data_db = PUSH_DATA_GROUP_DB elif granularity == "config_group": push_data_db = PUSH_DATA_CONFIG_GROUP_DB assert db.download(push_data_db) # In the last 28 pushes, we definitely run all possible runnables. push_data_count = 0 push_data_queue: Deque[PushResult] = collections.deque(maxlen=28) for elem in db.read(push_data_db): push_data_count += 1 push_data_queue.append(elem) logger.info(f"push data nodes: {push_data_count}") push_data = [( revisions, fix_revision, rename_runnables(granularity, push_tasks), rename_runnables(granularity, possible_regressions), rename_runnables(granularity, likely_regressions), ) for revisions, fix_revision, push_tasks, possible_regressions, likely_regressions in push_data_queue] if granularity == "config_group": all_groups_set = set( sum( ([Group(r[1]) for r in push_runnables] for _, _, push_runnables, _, _ in push_data), [], )) # Filter runnables we don't need. all_groups = filter_runnables(tuple(all_groups_set), cast(Set[Runnable], all_groups_set), "group") all_groups_set = set(all_groups) logger.info( f"{len(all_groups_set)} manifests run in the last 28 pushes") all_runnables_set = set( sum((list(push_runnables) for _, _, push_runnables, _, _ in push_data), [])) # Filter runnables we don't need. all_runnables = filter_runnables(tuple(all_runnables_set), all_runnables_set, granularity) all_runnables_set = set(all_runnables) logger.info( f"{len(all_runnables_set)} runnables run in the last 28 pushes") def push_data_iter() -> Iterator[PushResult]: return (( revisions, fix_revision, filter_runnables( rename_runnables(granularity, push_tasks), all_runnables_set, granularity, ), filter_runnables( rename_runnables(granularity, possible_regressions), all_runnables_set, granularity, ), filter_runnables( rename_runnables(granularity, likely_regressions), all_runnables_set, granularity, ), ) for revisions, fix_revision, push_tasks, possible_regressions, likely_regressions in db.read(push_data_db)) if granularity == "config_group": manifest_combinations = sum( sum(1 for _ in itertools.combinations(sorted(group_tasks), 2)) for manifest, group_tasks in itertools.groupby( sorted(all_runnables, key=lambda x: x[1]), key=lambda x: x[1])) print( f"{manifest_combinations} possible combinations of manifests on configurations" ) return push_data_iter, push_data_count, all_runnables
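get_push_data derives the set of runnables to keep from the last 28 pushes only, on the assumption that every possible runnable is scheduled at least once in that window. A small sketch of that windowing pattern, with made-up push tuples shaped like the PushResult entries read above:

# Sketch of the "last 28 pushes" window used above: a bounded deque keeps only
# the most recent pushes while the total number of pushes is still counted.
import collections


def tail_and_count(push_results, window=28):
    count = 0
    tail = collections.deque(maxlen=window)  # older pushes fall off automatically
    for result in push_results:
        count += 1
        tail.append(result)
    return tail, count


# Made-up pushes: (revisions, fix_revision, runnables, possible_regressions, likely_regressions).
pushes = [((f"rev{i}",), None, ["task-a", "task-b"], [], []) for i in range(100)]
tail, count = tail_and_count(pushes)
assert count == 100 and len(tail) == 28

# The runnable universe is whatever ran in that window.
all_runnables = {runnable for _, _, runnables, _, _ in tail for runnable in runnables}
assert all_runnables == {"task-a", "task-b"}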
def evaluate(bug_introducing_commits): logger.info("Downloading commits database...") assert db.download(repository.COMMITS_DB) logger.info("Downloading bugs database...") assert db.download(bugzilla.BUGS_DB) logger.info("Building bug -> commits map...") bug_to_commits_map = defaultdict(list) for commit in tqdm(repository.get_commits()): bug_to_commits_map[commit["bug_id"]].append(commit["node"]) logger.info("Loading known regressors using regressed-by information...") known_regressors = {} for bug in tqdm(bugzilla.get_bugs()): if bug["regressed_by"]: known_regressors[bug["id"]] = bug["regressed_by"] logger.info(f"Loaded {len(known_regressors)} known regressors") fix_to_regressors_map = defaultdict(list) for bug_introducing_commit in bug_introducing_commits: if not bug_introducing_commit["bug_introducing_rev"]: continue fix_to_regressors_map[bug_introducing_commit["bug_fixing_rev"]].append( bug_introducing_commit["bug_introducing_rev"]) logger.info(f"{len(fix_to_regressors_map)} fixes linked to regressors") logger.info( f"{sum(len(regressors) for regressors in fix_to_regressors_map.values())} regressors linked to fixes" ) logger.info( "Measuring how many known regressors SZZ was able to find correctly..." ) all_regressors = 0 perfect_regressors = 0 found_regressors = 0 misassigned_regressors = 0 for bug_id, regressor_bugs in tqdm(known_regressors.items()): # Get all commits which fixed the bug. fix_commits = bug_to_commits_map[ bug_id] if bug_id in bug_to_commits_map else [] if len(fix_commits) == 0: continue # Skip bug/regressor when we didn't analyze the commits to fix the bug (as # certainly we can't have found the regressor in this case). if not any(fix_commit in fix_to_regressors_map for fix_commit in fix_commits): continue # Get all commits linked to the regressor bug. regressor_commits = [] for regressor_bug in regressor_bugs: if regressor_bug not in bug_to_commits_map: continue regressor_commits += ( commit for commit in bug_to_commits_map[regressor_bug]) if len(regressor_commits) == 0: continue found_good = False found_bad = False for fix_commit in fix_commits: # Check if we found at least a correct regressor. if any(regressor_commit in regressor_commits for regressor_commit in fix_to_regressors_map[fix_commit]): found_good = True # Check if we found at least a wrong regressor. if any(regressor_commit not in regressor_commits for regressor_commit in fix_to_regressors_map[fix_commit]): found_bad = True all_regressors += 1 if found_good and not found_bad: perfect_regressors += 1 if found_good: found_regressors += 1 if found_bad: misassigned_regressors += 1 logger.info( f"Perfectly found {perfect_regressors} regressors out of {all_regressors}" ) logger.info(f"Found {found_regressors} regressors out of {all_regressors}") logger.info( f"Misassigned {misassigned_regressors} regressors out of {all_regressors}" )
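The counters in evaluate() reduce to three per-fix flags: a fix counts as perfect when every regressor SZZ assigned to it belongs to the known regressor bug, as found when at least one does, and as misassigned when at least one does not. A toy version of that bookkeeping, simplified to one fix commit per bug and with made-up revision names:

# Toy illustration of the accounting in evaluate() (revision names are invented).
fix_to_regressors_map = {"fix1": ["bad1", "unrelated"], "fix2": ["bad2"]}
true_regressor_commits = {"fix1": {"bad1"}, "fix2": {"bad2"}}  # ground truth per fix

perfect = found = misassigned = total = 0
for fix, assigned in fix_to_regressors_map.items():
    truth = true_regressor_commits[fix]
    found_good = any(rev in truth for rev in assigned)
    found_bad = any(rev not in truth for rev in assigned)
    total += 1
    perfect += found_good and not found_bad
    found += found_good
    misassigned += found_bad

# fix2 is perfect; fix1 is found but also misassigned because of "unrelated".
assert (perfect, found, misassigned, total) == (1, 2, 1, 2)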
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized): if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB repo_dir = self.tokenized_git_repo_dir else: db_path = BUG_INTRODUCING_COMMITS_DB repo_dir = self.git_repo_dir def git_to_mercurial(rev): if tokenized: return self.tokenized_git_to_mercurial[rev] else: return vcs_map.git_to_mercurial(rev) def mercurial_to_git(rev): if tokenized: return self.mercurial_to_tokenized_git[rev] else: return vcs_map.mercurial_to_git(rev) logger.info("Download previously found bug-introducing commits...") if db.is_old_version(db_path) or not db.exists(db_path): db.download(db_path, force=True) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines("{}\n".format(mercurial_to_git(commit["rev"])) for commit in commits_to_ignore if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) # Analyze up to 500 commits at a time, to avoid the task running out of time. done = True if len(bug_fixing_commits) > 500: bug_fixing_commits = bug_fixing_commits[-500:] done = False with open("done", "w") as f: f.write(str(1 if done else 0)) def _init(git_repo_dir): global GIT_REPO GIT_REPO = GitRepository(git_repo_dir) def find_bic(bug_fixing_commit): git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"]) logger.info(f"Analyzing {git_fix_revision}...") commit = GIT_REPO.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: return [None] bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines( commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")) logger.info(bug_introducing_modifications) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values( ): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": git_to_mercurial(bug_introducing_hash), }) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). 
if not str(e).startswith( "Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. if len(bug_introducing_commits) == 0: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", }) return bug_introducing_commits with concurrent.futures.ThreadPoolExecutor(initializer=_init, initargs=(repo_dir, ), max_workers=os.cpu_count() + 1) as executor: bug_introducing_commits = executor.map(find_bic, bug_fixing_commits) bug_introducing_commits = tqdm(bug_introducing_commits, total=len(bug_fixing_commits)) bug_introducing_commits = list( itertools.chain.from_iterable(bug_introducing_commits)) total_results_num = len(bug_introducing_commits) bug_introducing_commits = list(filter(None, bug_introducing_commits)) logger.info( f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big" ) db.append(db_path, bug_introducing_commits) compress_file(db_path)
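Unlike the thread-local variant earlier, this version caps each run at 500 commits and writes a "done" marker so the caller knows whether another run is needed. A minimal sketch of that batching pattern (the helper name and batch size are illustrative):

# Sketch of the "analyze at most N per run" checkpoint used above: process the
# most recent batch and record in a marker file whether anything is left.
def take_batch(pending, batch_size=500, marker="done"):
    done = len(pending) <= batch_size
    batch = pending if done else pending[-batch_size:]
    with open(marker, "w") as f:
        # "1" means everything has been analyzed; "0" means another run is needed.
        f.write("1" if done else "0")
    return batch


batch = take_batch(list(range(1200)))
assert len(batch) == 500  # only the most recent 500 are analyzed this run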
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json"), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["revs"][0] else: last_node = None past_failures = shelve.Shelf( LMDBDict("data/past_failures.lmdb"), protocol=pickle.HIGHEST_PROTOCOL, writeback=True, ) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] key = f"{type_}${task}$" for item in items: full_key = key + item if full_key not in past_failures: cur = past_failures[full_key] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0) else: cur = past_failures[full_key] value = cur[push_num] values_total.append(value) values_prev_7.append(value - cur[push_num - 7]) values_prev_14.append(value - cur[push_num - 14]) values_prev_28.append(value - cur[push_num - 28]) values_prev_56.append(value - cur[push_num - 56]) if is_regression: cur[push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_tasks = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open("push_data.json", "r") as f: push_data = json.load(f)[1:] logger.info(f"push data nodes: {len(push_data)}") # In the last 28 pushes, we definitely run all possible tasks. all_tasks_set = set( sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])) # Filter tasks we don't need. all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set) all_tasks_set = set(all_tasks) logger.info( f"{len(all_tasks_set)} tasks run in the last 28 pushes") # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for i in tqdm(range(len(push_data))): ( revisions, push_tasks, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. 
# In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 20: skipped_too_big_commits += 1 continue # If we considered all_tasks, we'd generate a huge amount of data. # So we consider only the tasks which run in this push, and the possible and likely regressions # from this push. tasks_to_consider = list( set(push_tasks + possible_regressions + likely_regressions)) tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set) if len(tasks_to_consider) == 0: skipped_no_tasks += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) for task in tasks_to_consider: is_regression = (task in possible_regressions or task in likely_regressions) ( total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures, ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression) ( total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures, ) = get_and_update_past_failures("type", task, merged_commits["types"], push_num, is_regression) ( total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures, ) = get_and_update_past_failures("file", task, merged_commits["files"], push_num, is_regression) ( total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures, ) = get_and_update_past_failures( "directory", task, merged_commits["directories"], push_num, is_regression, ) ( total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures, ) = get_and_update_past_failures( "component", task, merged_commits["components"], push_num, is_regression, ) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield { "revs": revisions, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": 
past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in possible_regressions, "is_likely_regression": task in likely_regressions, } logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)") db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) past_failures["push_num"] = push_num past_failures.close() with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar: tar.add("data/past_failures.lmdb")
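Both test-scheduling generators rely on bugbug's ExpQueue to turn a cumulative per-(task, item) failure count, indexed by push number, into windowed features: failures_past_7_pushes is the current cumulative value minus the value 7 pushes earlier, and likewise for 14, 28 and 56. The stand-in below is not the real ExpQueue (which also bounds memory by forgetting values older than the historical timespan), but it shows the arithmetic:

# Simplified stand-in for the cumulative-failure bookkeeping above: counts[n]
# is the cumulative number of failures recorded up to push n.
class CumulativeCounts:
    def __init__(self):
        self.counts = {}

    def __getitem__(self, push_num):
        # Value at or before push_num (0 if nothing recorded yet).
        keys = [k for k in self.counts if k <= push_num]
        return self.counts[max(keys)] if keys else 0

    def __setitem__(self, push_num, value):
        self.counts[push_num] = value


cur = CumulativeCounts()
for push_num, failed in enumerate([0, 1, 0, 0, 1, 1, 0, 0, 1, 0]):
    if failed:
        cur[push_num] = cur[push_num] + 1

total = cur[9]              # all failures seen so far
past_7 = cur[9] - cur[9 - 7]  # failures in the last 7 pushes only
assert (total, past_7) == (4, 3)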
    model_class = DevDocNeededModel
elif args.goal == "assignee":
    from bugbug.models.assignee import AssigneeModel

    model_class = AssigneeModel
elif args.goal == "backout":
    from bugbug.models.backout import BackoutModel

    model_class = BackoutModel
elif args.goal == "bugtype":
    from bugbug.models.bugtype import BugTypeModel

    model_class = BugTypeModel

if args.train:
    db.download()

    if args.historical:
        model = model_class(args.lemmatization, args.historical)
    else:
        model = model_class(args.lemmatization)

    model.train()
else:
    model = model_class.load(model_file_name)

if args.classify:
    for bug in bugzilla.get_bugs():
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
        )
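The fragment above selects a model class through a chain of elif branches keyed on --goal, importing each model module lazily. For illustration only (this is not how the script is actually organized), the same dispatch can be expressed as a small registry of import paths while keeping the lazy-import behaviour; the module paths mirror the ones imported above:

# Hypothetical registry-based dispatch equivalent to the elif chain above.
import importlib

MODEL_REGISTRY = {
    "assignee": ("bugbug.models.assignee", "AssigneeModel"),
    "backout": ("bugbug.models.backout", "BackoutModel"),
    "bugtype": ("bugbug.models.bugtype", "BugTypeModel"),
}


def get_model_class(goal):
    module_name, class_name = MODEL_REGISTRY[goal]
    module = importlib.import_module(module_name)  # imported only when needed
    return getattr(module, class_name)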
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json" ), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version(repository.COMMITS_DB) or not db.exists( repository.COMMITS_DB ): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["rev"] else: last_node = None try: with open("data/past_failures.pickle", "rb") as f: past_failures, push_num = pickle.load(f) except FileNotFoundError: past_failures = {} push_num = 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): if type_ not in past_failures: past_failures[type_] = {} if task not in past_failures[type_]: past_failures[type_][task] = {} values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] for item in items: if item not in past_failures[type_][task]: past_failures[type_][task][item] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0 ) value = past_failures[type_][task][item][push_num] values_total.append(value) values_prev_7.append( value - past_failures[type_][task][item][push_num - 7] ) values_prev_14.append( value - past_failures[type_][task][item][push_num - 14] ) values_prev_28.append( value - past_failures[type_][task][item][push_num - 28] ) values_prev_56.append( value - past_failures[type_][task][item][push_num - 56] ) if is_regression: past_failures[type_][task][item][push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num commits_with_data = set() saved_nodes = set() # We can start once we get to the last revision we added in the previous run. 
can_start = True if last_node is None else False for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] if node == last_node: can_start = True continue if not can_start: continue if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue is_regression = ( task in commit_push_data[1] or task in commit_push_data[2] ) total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures( "all", task, ["all"], push_num, is_regression ) total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures( "type", task, commit_data["types"], push_num, is_regression ) total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures( "file", task, commit_data["files"], push_num, is_regression ) total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures( "directory", task, commit_data["directories"], push_num, is_regression, ) total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures( "component", task, commit_data["components"], push_num, is_regression, ) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } push_num += 
1 logger.info(f"push data nodes: {len(push_data)}") logger.info(f"commits linked to push data: {len(commits_with_data)}") logger.info(f"saved push data nodes: {len(saved_nodes)}") db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) with open("data/past_failures.pickle", "wb") as f: pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL) zstd_compress("data/past_failures.pickle")
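This older pipeline persists its state by pickling (past_failures, push_num) together and compressing the result, so a later run can resume where this one stopped. A minimal sketch of that save/restore round trip (the zstd compression step is left out and the path is illustrative):

# Sketch of the checkpoint used above: the failure history and the push counter
# are pickled together so the next run can resume from them.
import pickle


def save_checkpoint(path, past_failures, push_num):
    with open(path, "wb") as f:
        pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)


def load_checkpoint(path):
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}, 0  # first run: empty history, counter at zero


save_checkpoint("past_failures.pickle", {"all": {"task": {}}}, 42)
past_failures, push_num = load_checkpoint("past_failures.pickle")
assert push_num == 42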