def test_download_version(tmp_path):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1, support_files=[])

    responses.add(responses.HEAD, url_version, status=200, headers={"ETag": "123"})
    responses.add(responses.GET, url_version, status=200, body="42")

    db.download_version(db_path)

    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version.etag"))
    assert not db.is_old_version(db_path)

    db.register(db_path, url_zst, 43, support_files=[])

    assert db.is_old_version(db_path)
def test_is_old_version(tmp_path):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1, support_files=[])
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version"))

    responses.add(responses.GET, url_version, status=404)
    responses.add(responses.GET, url_version, status=424)
    responses.add(responses.GET, url_version, status=200, body="1")
    responses.add(responses.GET, url_version, status=200, body="42")

    # When the remote version file doesn't exist (404), we consider the db as being old.
    assert db.is_old_version(db_path)

    # When the request for the remote version file fails (424), we also consider the db as being old.
    assert db.is_old_version(db_path)

    # When the remote version file exists and returns the same version as the current db, we consider the remote db as not being old.
    assert not db.is_old_version(db_path)

    # When the remote version file exists and returns a newer version than the current db, we consider the remote db as not being old.
    assert not db.is_old_version(db_path)

    db.register(db_path, url_zst, 43, support_files=[])

    # When the remote version file exists and returns an older version than the current db, we consider the remote db as being old.
    assert db.is_old_version(db_path)
def test_register_db(tmp_path):
    db_path = tmp_path / "prova.json"

    db.register(db_path, "https://alink", 1)
    assert not db.is_old_version(db_path)

    db.register(db_path, "https://alink", 2)
    assert db.is_old_version(db_path)
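# The tests above exercise bugbug's db versioning helpers. As a quick orientation for the
# production snippets that follow, here is a minimal, hypothetical sketch of the consumer
# pattern they all repeat (assuming the bugbug `db` module is importable; EXAMPLE_DB and
# EXAMPLE_URL are placeholders introduced for illustration, not real artifacts):
from bugbug import db

EXAMPLE_DB = "data/example.json"  # hypothetical local path
EXAMPLE_URL = "https://example.invalid/example.json.zst"  # hypothetical remote URL

# Register the local path, the remote URL and the version this code expects.
db.register(EXAMPLE_DB, EXAMPLE_URL, 1, support_files=[])

# Force a fresh download when the version check reports a mismatch or when no local copy
# exists yet; this is the check repeated throughout the functions below.
if db.is_old_version(EXAMPLE_DB) or not db.exists(EXAMPLE_DB):
    db.download(EXAMPLE_DB, force=True)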
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    if db.is_old_version(IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB):
        db.download(IGNORED_COMMITS_DB, force=True)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    if len(prev_commits_to_ignore) > 0:
        rev_start = "children({})".format(prev_commits_to_ignore[-1]["rev"])
    else:
        rev_start = 0

    # 2 days more than the end date, so we can know if a commit was backed-out.
    # We have to do this as recent commits might be missing in the mercurial <-> git map,
    # otherwise we could just use "tip".
    end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(days=2)
    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(
            hg, rev_start, "pushdate('{}')".format(end_date.strftime("%Y-%m-%d"))
        )

    # Given that we use the pushdate, there might be cases where the starting commit is
    # returned too (e.g. if we rerun the task on the same day).
    if len(prev_commits_to_ignore) > 0:
        found_prev = -1
        for i, rev in enumerate(revs):
            if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]:
                found_prev = i
                break

        revs = revs[found_prev + 1 :]

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.append(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)

    return prev_commits_to_ignore + commits_to_ignore
def update_commit_db(self):
    repository.clone(self.repo_dir)

    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    for commit in repository.get_commits():
        pass

    rev_start = "children({})".format(commit["node"])

    repository.download_commits(self.repo_dir, rev_start)
def retrieve_commits(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    try:
        os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
    except FileNotFoundError:
        logger.info("pushlog database doesn't exist")

    # Pull and update, to make sure the pushlog is generated.
    hg = hglib.open(self.repo_dir)
    hg.pull(update=True)
    hg.close()

    db.download_version(repository.COMMITS_DB)
    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def retrieve_push_data(self):
    # Download previous cache.
    cache_path = os.path.splitext(ADR_CACHE_DB)[0]
    if not db.is_old_version(ADR_CACHE_DB):
        db.download(ADR_CACHE_DB)
        if os.path.exists(ADR_CACHE_DB):
            with tarfile.open(ADR_CACHE_DB, "r") as tar:
                tar.extractall()
            assert os.path.exists(cache_path), "Decompressed adr cache exists"

    # Setup adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
        f.write(
            f"""[adr.cache.stores]
file = {{ driver = "file", path = "{os.path.abspath(cache_path)}" }}
"""
        )

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use 3 months more than that to calculate the failure statistics.
    subprocess.run(
        [
            "run-adr",
            "ahal/ci-recipes",
            "recipe",
            "-o",
            os.path.abspath("push_data.json"),
            "-f",
            "json",
            "push_data",
            "--",
            "--from",
            f"today-{TRAINING_MONTHS + 3}month",
            "--to",
            "today-2day",
            "--branch",
            "autoland",
        ],
        check=True,
        stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
    )

    with tarfile.open(ADR_CACHE_DB, "w") as tar:
        tar.add(cache_path)
    zstd_compress(ADR_CACHE_DB)

    zstd_compress("push_data.json")
def __init__(self):
    self.model_class = RegressorModel

    self.repo_dir = get_login_info()["repo_dir"]
    if not os.path.exists(self.repo_dir):
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=self.repo_dir + "-shared",
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        # Remove pushlog DB to make sure it's regenerated.
        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

    logger.info("Pulling and updating mozilla-central")
    with hglib.open(self.repo_dir) as hg:
        hg.pull(update=True)
    logger.info("mozilla-central pulled and updated")

    db.download_version(repository.COMMITS_DB)
    if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    super().__init__()
    self.model = self.model_class.load(self.retrieve_model())
def main():
    args = parse_args(sys.argv[1:])

    logger.info("Downloading bugs database...")
    if db.is_old_version(bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB, force=True)

    if args.algorithm == "neighbors_tfidf_bigrams":
        model = similarity.model_name_to_class[args.algorithm](
            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
            cleanup_urls=args.cleanup_urls,
            nltk_tokenizer=args.nltk_tokenizer,
        )
    else:
        model = similarity.model_name_to_class[args.algorithm](
            cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
        )

    model.save()
def retrieve_commits(self):
    repository.clone(self.repo_dir)

    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def retrieve_commits(self, limit):
    repository.clone(self.repo_dir)

    if not db.is_old_version(repository.COMMITS_DB) and not limit:
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")
    zstd_compress("data/commit_experiences.pickle")
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago, six_months_ago)
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    if db.is_old_version(db_path) or not db.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    def _init(git_repo_dir):
        thread_local.git = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

        commit = thread_local.git.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            logger.info("Skipping {} as it is too big".format(bug_fixing_commit["rev"]))
            return None

        bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )

        logger.info(
            "Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])
        )

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                try:
                    bug_introducing_commits.append(
                        {
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        }
                    )
                except Exception as e:
                    # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                    if not str(e).startswith("Missing git commit in the VCS map"):
                        raise

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:

        def results():
            num_analyzed = 0

            bug_fixing_commits_queue = bug_fixing_commits.copy()

            # Analyze up to 500 commits at a time, to avoid the task running out of time.
            while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                bug_introducing_commit_futures = []
                for _ in range(min(500 - num_analyzed, len(bug_fixing_commits_queue))):
                    bug_introducing_commit_futures.append(
                        executor.submit(find_bic, bug_fixing_commits_queue.pop())
                    )

                logger.info(
                    f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                )

                for future in tqdm(
                    concurrent.futures.as_completed(bug_introducing_commit_futures),
                    total=len(bug_introducing_commit_futures),
                ):
                    result = future.result()
                    if result is not None:
                        num_analyzed += 1
                        yield from result

            with open("done", "w") as f:
                f.write(str(1 if len(bug_fixing_commits_queue) == 0 else 0))

        db.append(db_path, results())

    zstd_compress(db_path)
def generate_test_scheduling_history(self):
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    HISTORICAL_TIMESPAN = 56

    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            pass

        last_node = test_data["revs"][0]
    else:
        last_node = None

    past_failures = shelve.Shelf(
        LMDBDict("data/past_failures.lmdb"),
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        values_total = []
        values_prev_7 = []
        values_prev_14 = []
        values_prev_28 = []
        values_prev_56 = []

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(push_num, HISTORICAL_TIMESPAN + 1, 0)
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            values_total.append(value)
            values_prev_7.append(value - cur[push_num - 7])
            values_prev_14.append(value - cur[push_num - 14])
            values_prev_28.append(value - cur[push_num - 28])
            values_prev_56.append(value - cur[push_num - 56])

            if is_regression:
                cur[push_num] = value + 1

        return (
            sum(values_total),
            sum(values_prev_7),
            sum(values_prev_14),
            sum(values_prev_28),
            sum(values_prev_56),
        )

    def generate_data():
        nonlocal push_num
        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = set(sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), []))
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision) for revision in revisions if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            # - Improve shelve perf and go back to consider all files;
            # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            # - Keep a limit of number of files.
            if len(merged_commits["files"]) > 20:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if len(tasks_to_consider) == 0:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            for task in tasks_to_consider:
                is_regression = task in possible_regressions or task in likely_regressions

                (
                    total_failures,
                    past_7_pushes_failures,
                    past_14_pushes_failures,
                    past_28_pushes_failures,
                    past_56_pushes_failures,
                ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression)

                (
                    total_types_failures,
                    past_7_pushes_types_failures,
                    past_14_pushes_types_failures,
                    past_28_pushes_types_failures,
                    past_56_pushes_types_failures,
                ) = get_and_update_past_failures(
                    "type", task, merged_commits["types"], push_num, is_regression
                )

                (
                    total_files_failures,
                    past_7_pushes_files_failures,
                    past_14_pushes_files_failures,
                    past_28_pushes_files_failures,
                    past_56_pushes_files_failures,
                ) = get_and_update_past_failures(
                    "file", task, merged_commits["files"], push_num, is_regression
                )

                (
                    total_directories_failures,
                    past_7_pushes_directories_failures,
                    past_14_pushes_directories_failures,
                    past_28_pushes_directories_failures,
                    past_56_pushes_directories_failures,
                ) = get_and_update_past_failures(
                    "directory", task, merged_commits["directories"], push_num, is_regression
                )

                (
                    total_components_failures,
                    past_7_pushes_components_failures,
                    past_14_pushes_components_failures,
                    past_28_pushes_components_failures,
                    past_56_pushes_components_failures,
                ) = get_and_update_past_failures(
                    "component", task, merged_commits["components"], push_num, is_regression
                )

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)

                    yield {
                        "revs": revisions,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "failures_in_types": total_types_failures,
                        "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                        "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                        "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                        "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                        "failures_in_files": total_files_failures,
                        "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                        "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                        "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                        "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                        "failures_in_directories": total_directories_failures,
                        "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                        "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                        "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                        "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                        "failures_in_components": total_components_failures,
                        "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                        "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                        "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                        "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                        "is_possible_regression": task in possible_regressions,
                        "is_likely_regression": task in likely_regressions,
                    }

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")
def retrieve_test_scheduling_history(self):
    os.makedirs("data", exist_ok=True)

    # Download previous cache.
    cache_path = os.path.abspath("data/adr_cache")
    if not os.path.exists(cache_path):
        try:
            download_check_etag(URL, "adr_cache.tar.xz")
            with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                tar.extractall()
            assert os.path.exists("data/adr_cache"), "Decompressed adr cache exists"
        except requests.exceptions.HTTPError:
            logger.info("The adr cache is not available yet")

    # Setup adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
        f.write(
            f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
"""
        )

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use 3 months more than that to calculate the failure statistics.
    subprocess.run(
        [
            "run-adr",
            "ahal/ci-recipes",
            "recipe",
            "-o",
            os.path.abspath("push_data.json"),
            "-f",
            "json",
            "push_data",
            "--",
            "--from",
            f"today-{TRAINING_MONTHS + 3}month",
            "--to",
            "today-2day",
            "--branch",
            "autoland",
        ],
        check=True,
        stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
    )

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    push_data = {}
    for row in data[1:]:
        # Revision -> (all tasks, possible regressions, likely regressions)
        push_data[row[0]] = (row[1], row[2], row[3])

    HISTORICAL_TIMESPAN = 56

    past_failures = {}

    def get_past_failures(task, push_num):
        if task not in past_failures:
            past_failures[task] = repository.exp_queue(push_num, HISTORICAL_TIMESPAN + 1, 0)

        return past_failures[task][push_num]

    def generate_data():
        commits_with_data = set()
        saved_nodes = set()

        push_num = 0
        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node not in push_data:
                continue

            commits_with_data.add(node)

            commit_push_data = push_data[node]

            for task in commit_push_data[0]:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                total_failures = get_past_failures(task, push_num)
                past_7_pushes_failures = total_failures - get_past_failures(task, push_num - 7)
                past_14_pushes_failures = total_failures - get_past_failures(task, push_num - 14)
                past_28_pushes_failures = total_failures - get_past_failures(task, push_num - 28)
                past_56_pushes_failures = total_failures - get_past_failures(task, push_num - 56)

                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(node)

                    yield {
                        "rev": node,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "is_possible_regression": task in commit_push_data[1],
                        "is_likely_regression": task in commit_push_data[2],
                    }

                if task in commit_push_data[1] or task in commit_push_data[2]:
                    past_failures[task][push_num] = total_failures + 1

            push_num += 1

        logger.info(f"push data nodes: {len(push_data)}")
        logger.info(f"commits linked to push data: {len(commits_with_data)}")
        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
        tar.add("data/adr_cache")
def find_bug_fixing_commits(self):
    logger.info("Downloading commits database...")
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    logger.info("Downloading bugs database...")
    if db.is_old_version(bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB, force=True)

    logger.info("Download previous classifications...")
    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB):
        db.download(BUG_FIXING_COMMITS_DB, force=True)

    logger.info("Get previously classified commits...")
    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
    prev_bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits
    )
    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    download_model("defectenhancementtask")
    defect_model = DefectEnhancementTaskModel.load("defectenhancementtaskmodel")

    logger.info("Downloading regression model...")
    download_model("regression")
    regression_model = RegressionModel.load("regressionmodel")

    start_date = datetime.now() - RELATIVE_START_DATE
    end_date = datetime.now() - RELATIVE_END_DATE
    logger.info(
        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
    )
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if commit_date < start_date or commit_date > end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
    )
    assert len(commit_map) > 0

    def get_relevant_bugs():
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    bug_count = sum(1 for bug in get_relevant_bugs())
    logger.info(
        f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
    )

    known_defect_labels = defect_model.get_labels()
    known_regression_labels = regression_model.get_labels()

    bug_fixing_commits = []

    def append_bug_fixing_commits(bug_id, type_):
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        # Ignore bugs which are not linked to the commits we care about.
        if bug["id"] not in commit_map:
            continue

        # If we know the label already, we don't need to apply the model.
        if (
            bug["id"] in known_regression_labels
            and known_regression_labels[bug["id"]] == 1
        ):
            append_bug_fixing_commits(bug["id"], "r")
            continue

        if bug["id"] in known_defect_labels:
            if known_defect_labels[bug["id"]] == "defect":
                append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")
            continue

        if defect_model.classify(bug)[0] == "defect":
            if regression_model.classify(bug)[0] == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")
        else:
            append_bug_fixing_commits(bug["id"], "e")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    zstd_compress(BUG_FIXING_COMMITS_DB)

    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits

    return [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["type"] in ["r", "d"]
    ]
def retrieve_bugs(self, limit=None):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago, six_months_ago)
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=2, months=6)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
    regressed_by_bug_ids = sum(
        [
            bug["regressed_by"]
            for bug in bugzilla.get_bugs()
            if bug["id"] in commit_bug_ids
        ],
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        [
            bug["regressed_by"]
            for bug in bugzilla.get_bugs()
            if bug["id"] in commit_bug_ids
        ],
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    if db.is_old_version(db_path) or not db.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    # Analyze up to 500 commits at a time, to avoid the task running out of time.
    done = True
    if len(bug_fixing_commits) > 500:
        bug_fixing_commits = bug_fixing_commits[-500:]
        done = False

    with open("done", "w") as f:
        f.write(str(1 if done else 0))

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

        commit = GIT_REPO.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(
            "Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])
        )

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                try:
                    bug_introducing_commits.append(
                        {
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        }
                    )
                except Exception as e:
                    # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                    if not str(e).startswith("Missing git commit in the VCS map"):
                        raise

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(db_path, bug_introducing_commits)
    compress_file(db_path)
def generate_test_scheduling_history(self):
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    push_data = {}
    for row in data[1:]:
        # Revision -> (all tasks, possible regressions, likely regressions)
        push_data[row[0]] = (row[1], row[2], row[3])

    logger.info(f"push data nodes: {len(push_data)}")

    HISTORICAL_TIMESPAN = 56

    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            pass

        last_node = test_data["rev"]
    else:
        last_node = None

    past_failures = shelve.open(
        "data/past_failures.shelve",
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        values_total = []
        values_prev_7 = []
        values_prev_14 = []
        values_prev_28 = []
        values_prev_56 = []

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(push_num, HISTORICAL_TIMESPAN + 1, 0)
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            values_total.append(value)
            values_prev_7.append(value - cur[push_num - 7])
            values_prev_14.append(value - cur[push_num - 14])
            values_prev_28.append(value - cur[push_num - 28])
            values_prev_56.append(value - cur[push_num - 56])

            if is_regression:
                cur[push_num] = value + 1

        return (
            sum(values_total),
            sum(values_prev_7),
            sum(values_prev_14),
            sum(values_prev_28),
            sum(values_prev_56),
        )

    def generate_data():
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if len(commits_with_data) % 1000 == 0:
                past_failures.sync()

            if node == last_node:
                can_start = True
                continue

            if not can_start:
                continue

            if node not in push_data:
                continue

            commits_with_data.add(node)

            commit_push_data = push_data[node]

            for task in commit_push_data[0]:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_regression = task in commit_push_data[1] or task in commit_push_data[2]

                (
                    total_failures,
                    past_7_pushes_failures,
                    past_14_pushes_failures,
                    past_28_pushes_failures,
                    past_56_pushes_failures,
                ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression)

                (
                    total_types_failures,
                    past_7_pushes_types_failures,
                    past_14_pushes_types_failures,
                    past_28_pushes_types_failures,
                    past_56_pushes_types_failures,
                ) = get_and_update_past_failures(
                    "type", task, commit_data["types"], push_num, is_regression
                )

                (
                    total_files_failures,
                    past_7_pushes_files_failures,
                    past_14_pushes_files_failures,
                    past_28_pushes_files_failures,
                    past_56_pushes_files_failures,
                ) = get_and_update_past_failures(
                    "file", task, commit_data["files"], push_num, is_regression
                )

                (
                    total_directories_failures,
                    past_7_pushes_directories_failures,
                    past_14_pushes_directories_failures,
                    past_28_pushes_directories_failures,
                    past_56_pushes_directories_failures,
                ) = get_and_update_past_failures(
                    "directory", task, commit_data["directories"], push_num, is_regression
                )

                (
                    total_components_failures,
                    past_7_pushes_components_failures,
                    past_14_pushes_components_failures,
                    past_28_pushes_components_failures,
                    past_56_pushes_components_failures,
                ) = get_and_update_past_failures(
                    "component", task, commit_data["components"], push_num, is_regression
                )

                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(node)

                    yield {
                        "rev": node,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "failures_in_types": total_types_failures,
                        "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                        "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                        "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                        "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                        "failures_in_files": total_files_failures,
                        "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                        "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                        "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                        "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                        "failures_in_directories": total_directories_failures,
                        "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                        "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                        "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                        "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                        "failures_in_components": total_components_failures,
                        "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                        "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                        "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                        "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                        "is_possible_regression": task in commit_push_data[1],
                        "is_likely_regression": task in commit_push_data[2],
                    }

            # We no longer need the push data for this node, we can free the memory.
            del push_data[node]

            push_num += 1

        logger.info(f"commits linked to push data: {len(commits_with_data)}")
        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    zstd_compress("data/past_failures.shelve")
def generate_test_scheduling_history(self):
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    push_data = {}
    for row in data[1:]:
        # Revision -> (all tasks, possible regressions, likely regressions)
        push_data[row[0]] = (row[1], row[2], row[3])

    HISTORICAL_TIMESPAN = 56

    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            pass

        last_node = test_data["rev"]
    else:
        last_node = None

    try:
        with open("data/past_failures.pickle", "rb") as f:
            past_failures, push_num = pickle.load(f)
    except FileNotFoundError:
        past_failures = {}
        push_num = 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        if type_ not in past_failures:
            past_failures[type_] = {}

        if task not in past_failures[type_]:
            past_failures[type_][task] = {}

        values_total = []
        values_prev_7 = []
        values_prev_14 = []
        values_prev_28 = []
        values_prev_56 = []

        for item in items:
            if item not in past_failures[type_][task]:
                past_failures[type_][task][item] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )

            value = past_failures[type_][task][item][push_num]

            values_total.append(value)
            values_prev_7.append(value - past_failures[type_][task][item][push_num - 7])
            values_prev_14.append(value - past_failures[type_][task][item][push_num - 14])
            values_prev_28.append(value - past_failures[type_][task][item][push_num - 28])
            values_prev_56.append(value - past_failures[type_][task][item][push_num - 56])

            if is_regression:
                past_failures[type_][task][item][push_num] = value + 1

        return (
            sum(values_total),
            sum(values_prev_7),
            sum(values_prev_14),
            sum(values_prev_28),
            sum(values_prev_56),
        )

    def generate_data():
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node == last_node:
                can_start = True
                continue

            if not can_start:
                continue

            if node not in push_data:
                continue

            commits_with_data.add(node)

            commit_push_data = push_data[node]

            for task in commit_push_data[0]:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_regression = task in commit_push_data[1] or task in commit_push_data[2]

                (
                    total_failures,
                    past_7_pushes_failures,
                    past_14_pushes_failures,
                    past_28_pushes_failures,
                    past_56_pushes_failures,
                ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression)

                (
                    total_types_failures,
                    past_7_pushes_types_failures,
                    past_14_pushes_types_failures,
                    past_28_pushes_types_failures,
                    past_56_pushes_types_failures,
                ) = get_and_update_past_failures(
                    "type", task, commit_data["types"], push_num, is_regression
                )

                (
                    total_files_failures,
                    past_7_pushes_files_failures,
                    past_14_pushes_files_failures,
                    past_28_pushes_files_failures,
                    past_56_pushes_files_failures,
                ) = get_and_update_past_failures(
                    "file", task, commit_data["files"], push_num, is_regression
                )

                (
                    total_directories_failures,
                    past_7_pushes_directories_failures,
                    past_14_pushes_directories_failures,
                    past_28_pushes_directories_failures,
                    past_56_pushes_directories_failures,
                ) = get_and_update_past_failures(
                    "directory", task, commit_data["directories"], push_num, is_regression
                )

                (
                    total_components_failures,
                    past_7_pushes_components_failures,
                    past_14_pushes_components_failures,
                    past_28_pushes_components_failures,
                    past_56_pushes_components_failures,
                ) = get_and_update_past_failures(
                    "component", task, commit_data["components"], push_num, is_regression
                )

                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(node)

                    yield {
                        "rev": node,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "failures_in_types": total_types_failures,
                        "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                        "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                        "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                        "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                        "failures_in_files": total_files_failures,
                        "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                        "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                        "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                        "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                        "failures_in_directories": total_directories_failures,
                        "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                        "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                        "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                        "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                        "failures_in_components": total_components_failures,
                        "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                        "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                        "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                        "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                        "is_possible_regression": task in commit_push_data[1],
                        "is_likely_regression": task in commit_push_data[2],
                    }

            push_num += 1

        logger.info(f"push data nodes: {len(push_data)}")
        logger.info(f"commits linked to push data: {len(commits_with_data)}")
        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open("data/past_failures.pickle", "wb") as f:
        pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

    zstd_compress("data/past_failures.pickle")
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)