def retrieve_commits(self, limit):
    """Extract commit data from the repository into the commits DB.

    When `limit` is truthy only the last `limit` commits are retrieved;
    otherwise the existing DB is downloaded and extended incrementally.
    """
    repository.clone(self.repo_dir)

    if limit:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)

        # Start from revision 0 unless the DB already holds commits; the loop
        # leaves rev_start pointing at the children of the last stored commit.
        # NOTE(review): if the downloaded DB is empty, rev_start stays 0.
        rev_start = 0
        for commit in repository.get_commits():
            rev_start = f"children({commit['node']})"

    # Resolve the revset into a concrete list of revisions.
    with hglib.open(self.repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    # Process revisions in fixed-size chunks to bound the work done by a
    # single download_commits call.
    chunk_size = 70000

    for i in range(0, len(revs), chunk_size):
        repository.download_commits(self.repo_dir, revs=revs[i:(i + chunk_size)])

    logger.info("commit data extracted from repository")

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    zstd_compress(repository.COMMITS_DB)
    create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
def __init__(
    self,
    cache_root,
    git_repo_url,
    git_repo_dir,
    tokenized_git_repo_url,
    tokenized_git_repo_dir,
):
    """Clone the mercurial repo and both git mirrors, then build the hg<->git mapping.

    Args:
        cache_root: directory under which the mozilla-central clone is kept.
        git_repo_url: source URL of the git mirror.
        git_repo_dir: local path where the git mirror is cloned.
        tokenized_git_repo_url: source URL of the tokenized git mirror.
        tokenized_git_repo_dir: local path of the tokenized mirror clone.
    """
    self.mercurial_repo_dir = os.path.join(cache_root, "mozilla-central")
    self.git_repo_url = git_repo_url
    self.git_repo_dir = git_repo_dir
    self.tokenized_git_repo_url = tokenized_git_repo_url
    self.tokenized_git_repo_dir = tokenized_git_repo_dir

    logger.info(f"Cloning mercurial repository to {self.mercurial_repo_dir}...")
    repository.clone(self.mercurial_repo_dir)

    logger.info(f"Cloning {self.git_repo_url} to {self.git_repo_dir}...")
    self.clone_git_repo(self.git_repo_url, self.git_repo_dir)

    logger.info(
        f"Cloning {self.tokenized_git_repo_url} to {self.tokenized_git_repo_dir}..."
    )
    self.clone_git_repo(self.tokenized_git_repo_url, self.tokenized_git_repo_dir)

    # Plain string literal: the original used an f-string with no placeholders.
    logger.info("Initializing mapping between git and mercurial commits...")
    self.init_mapping()
def __init__(self, repo_dir: str) -> None:
    """Prepare the local clone and the supporting databases."""
    if os.path.exists(repo_dir):
        # The clone is already on disk: just bring it up to date.
        repository.pull(repo_dir, "mozilla-central", "tip")
    else:
        repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Exhaust the generator so `last_commit` is the final commit stored in the DB.
    for last_commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start=f"children({last_commit['node']})",
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def boot_worker():
    """One-time worker initialization: preload models, clone the repo, refresh DBs."""
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get(
        "BUGBUG_REPO_DIR", os.path.join(tempfile.gettempdir(), "bugbug-hg")
    )

    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    # When ALLOW_MISSING_MODELS is set, a failed download is tolerated.
    assert (
        db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # The loop only serves to bind `commit` to the last entry in the DB.
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
def __init__(self, repo_dir: str) -> None:
    """Clone the repo, refresh all supporting DBs and load the regressor model."""
    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Walk the commits DB just to keep a handle on its last entry.
    for newest in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start=f"children({newest['node']})",
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Download commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = download_and_load_model("regressor")

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def boot_worker():
    """Worker boot: clone autoland, refresh scheduling/commits DBs, preload models."""
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")
    # When ALLOW_MISSING_MODELS is set, failed downloads are tolerated.
    assert (
        db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_GROUP_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # The loop only serves to bind `commit` to the last entry in the DB.
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        commits = repository.download_commits(
            REPO_DIR, rev_start, use_single_process=True
        )

        if len(commits) > 0:
            # Update the touched together DB.
            # Generator protocol: prime with next(), feed the newest node,
            # then send(None) to signal completion (StopIteration expected).
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)

            update_touched_together_gen.send(commits[-1]["node"])

            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
def update_commit_db(self):
    """Bring the local clone and the commits DB up to date."""
    repository.clone(self.repo_dir, update=True)

    assert db.download(repository.COMMITS_DB, support_files_too=True)

    # Drain the generator so `newest` ends up as the DB's last commit.
    for newest in repository.get_commits():
        pass

    repository.download_commits(
        self.repo_dir, rev_start=f"children({newest['node']})"
    )
def generate(self):
    """Generate the processed git repository and push it upstream in steps.

    Clones mozilla-central and the target git repo, then runs the generator in
    STEPS batches of TOTAL_COMMITS // STEPS commits, pushing `master` after
    each batch until the generator reports completion.
    """
    repository.clone(self.repo_dir)
    logger.info("mozilla-central cloned")

    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")

    # Embed the credentials only in the push URL (the plain URL is used for
    # clone/pull and in logs).
    repo_push_url = self.repo_url.replace(
        "https://", f"https://{git_user}:{git_password}@"
    )
    git_repo_path = os.path.basename(self.repo_url)

    retry(
        lambda: subprocess.run(
            ["git", "clone", self.repo_url, git_repo_path], check=True
        )
    )

    try:
        retry(
            lambda: subprocess.run(
                ["git", "pull", self.repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            )
        )
    except subprocess.CalledProcessError as e:
        # A brand-new repository has no "master" ref yet; that failure is
        # expected on first run. BUGFIX: any other pull failure used to be
        # silently swallowed here — re-raise it instead.
        # NOTE(review): the message is looked for in stdout because the
        # original did so — confirm git emits it there with capture_output.
        if b"Couldn't find remote ref master" not in e.stdout:
            raise

    # Large pushes need a bigger HTTP post buffer.
    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    for i in range(STEPS):
        logger.info(f"Step {i} out of {STEPS}")

        done = generator.generate(
            self.repo_dir,
            git_repo_path,
            limit=TOTAL_COMMITS // STEPS,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        # Record completion state for the orchestrator.
        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(
            lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            )
        )

        if done:
            break
def update_commit_db(self):
    """Refresh the commits DB, re-downloading it when stale or missing."""
    repository.clone(self.repo_dir)

    stale = db.is_old_version(repository.COMMITS_DB)
    if stale or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    # Keep only the final commit yielded by the generator.
    for last in repository.get_commits():
        pass

    repository.download_commits(self.repo_dir, f"children({last['node']})")
def retrieve_commits(self):
    """Extract commit data from the repository and compress the result files."""
    repository.clone(self.repo_dir)

    if db.is_old_version(repository.COMMITS_DB):
        # Schema changed: rebuild the DB from scratch, starting at revision 0.
        rev_start = 0
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)
        # The last yielded commit tells us where the previous run stopped.
        for stored_commit in repository.get_commits():
            pass
        rev_start = f"children({stored_commit['node']})"

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def retrieve_commits(self, limit):
    """Download commit data (optionally only the last `limit` commits) and compress it."""
    repository.clone(self.repo_dir)

    if not limit:
        db.download(repository.COMMITS_DB, support_files_too=True)

        # Default to a full import; when the DB already holds commits, resume
        # from the children of the last one it contains.
        rev_start = 0
        for stored in repository.get_commits():
            rev_start = f"children({stored['node']})"
    else:
        # A negative revision is interpreted by Mercurial as counting from tip.
        rev_start = -limit

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")
    zstd_compress("data/commit_experiences.pickle")
def retrieve_commits(self, limit):
    """Download commit data (optionally capped to the last `limit` commits)."""
    repository.clone(self.repo_dir)

    if not limit:
        db.download(repository.COMMITS_DB, support_files_too=True)

        # Full import by default; resume from the last stored commit when the
        # DB already has content.
        rev_start = 0
        for stored in repository.get_commits():
            rev_start = f"children({stored['node']})"
    else:
        # Negative revisions count backwards from tip in Mercurial revsets.
        rev_start = -limit

    repository.download_commits(self.repo_dir, rev_start=rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress(repository.COMMITS_DB)
    create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
def __init__(self, repo_dir: str) -> None:
    """Set up risk bands, local clone, supporting DBs and the regressor model."""
    # Risk bands come from a ';'-separated secret, sorted by the second field
    # of each parsed band tuple.
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # The loop only serves to bind `commit` to the last entry in the DB.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Download commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    # cast() only informs the type checker; no runtime conversion happens.
    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def retrieve_commits(self, limit):
    """Download commit data, incrementally when possible, and compress the results."""
    repository.clone(self.repo_dir)

    # Incremental updates are possible only with a current-format DB and no limit.
    can_resume = not db.is_old_version(repository.COMMITS_DB) and not limit

    if can_resume:
        db.download(repository.COMMITS_DB, support_files_too=True)
        # Scan the DB to find the most recent commit already stored.
        for stored in repository.get_commits():
            pass
        rev_start = f"children({stored['node']})"
    elif limit:
        # Mercurial revset support negative integers starting from tip
        rev_start = -limit
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")
    zstd_compress("data/commit_experiences.pickle")
def clone_autoland():
    """Fetch a local clone of the Mozilla autoland repository into REPO_DIR."""
    logger.info(f"Cloning autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")
def __init__(self, repo_dir: str) -> None:
    """Set up risk bands, repo clone, DBs, the regressor model and past-bug lookups."""
    # Risk bands come from a ';'-separated secret, sorted by the second field
    # of each parsed band tuple.
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # The loop only serves to bind `commit` to the last entry in the DB.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Download commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    # cast() only informs the type checker; no runtime conversion happens.
    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    # Per-dimension lookup tables of historical bug data, fetched from
    # pre-generated artifacts.
    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    """Find bug-introducing commits for known bug-fixing commits and store them.

    For each bug-fixing commit (minus ones already analyzed or in the ignore
    list), the git history of the lines it modified is walked to find candidate
    bug-introducing commits; the results are appended to
    BUG_INTRODUCING_COMMITS_DB and the DB file is compressed.
    """
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    # Re-download from scratch when the local DB is stale or absent.
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    # The ignore list is written to a file so the git history walker can
    # consume it via hashes_to_ignore_path below.
    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    # Also skip commits present in the ignore list.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        # Executor initializer: give each worker thread access to a shared
        # GitRepository handle via a module-level global.
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        # Analyze one bug-fixing commit and return a list of result dicts
        # (or [None] when the commit is skipped for being too large).
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    # Threads suffice here: the heavy lifting happens in git subprocess/IO work.
    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        # Each task returns a list; flatten them into one list of results.
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    # Drop the None placeholders produced for skipped (too big) commits.
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)