def get_commit_data(commit_list: List[repository.CommitDict]) -> List[dict]:
    """Build per-commit report entries for the commits attached to one bug.

    Each entry carries the commit id, the testing-policy tag of its linked
    Phabricator revision, the regression risk estimated by the regressor
    model, back-out status, author/reviewers, and coverage counters.

    Relies on ``self.regressor_model`` and ``revision_map`` from the
    enclosing scope.
    """
    # Idiomatic empty check (was `len(commit_list) == 0`).
    if not commit_list:
        return []

    # Evaluate risk of commits associated to this bug.
    probs = self.regressor_model.classify(commit_list, probabilities=True)

    commits_data = []
    for i, commit in enumerate(commit_list):
        revision_id = repository.get_revision_id(commit)
        if revision_id in revision_map:
            testing = phabricator.get_testing_project(revision_map[revision_id])

            if testing is None:
                # NOTE(review): a sibling block uses "none" for this same
                # case — confirm which sentinel consumers expect.
                testing = "missing"
        else:
            # No Phabricator revision linked to this commit at all.
            testing = None

        commits_data.append(
            {
                "id": commit["node"],
                "testing": testing,
                # probs[i][1] is presumably the positive (risky) class
                # probability — per the model's column convention.
                "risk": float(probs[i][1]),
                "backedout": bool(commit["backedoutby"]),
                "author": commit["author_email"],
                "reviewers": commit["reviewers"],
                "coverage": [
                    commit["cov_added"],
                    commit["cov_covered"],
                    commit["cov_unknown"],
                ],
            }
        )

    return commits_data
def list_testing_projects(
    commits: Iterable[repository.CommitDict],
) -> Collection[str]:
    """Return the testing-policy tag of each commit's linked revision,
    skipping falsy values. Uses ``revision_map`` from the enclosing scope."""
    projects = []
    for commit in commits:
        project = phabricator.get_testing_project(
            revision_map[repository.get_revision_id(commit)]
        )
        # Mirrors filter(None, ...): drop None and empty tags.
        if project:
            projects.append(project)
    return projects
def go(self, bugs: List[int], meta_bugs: Optional[List[int]] = None) -> None:
    """Generate the landings risk report for the given bugs.

    Downloads the bugs and their commits, links commits to Phabricator
    revisions, gathers historical context for each bug (past regressions and
    fixes touching the same files), scores each commit with the regressor
    model, and writes the result grouped by push date to
    ``landings_by_date.json``.
    """
    if meta_bugs is not None:
        # Analyze the meta bugs too, together with every bug blocking them.
        # Rebind instead of `+=` so the caller's list is not mutated.
        bugs = bugs + meta_bugs + self.get_blocking_of(meta_bugs)

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs)

    bugs_set = set(bugs)

    commits = [
        commit
        for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    # Map commit hash -> position in the commit stream, used later to restore
    # push order after grouping by bug.
    hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}
    logger.info(f"{len(commits)} commits to analyze.")

    bug_ids = {commit["bug_id"] for commit in commits}

    logger.info(f"{len(bug_ids)} bugs to analyze.")

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        if bug["id"] in bugs_set:
            bug_map[bug["id"]] = bug

        # Any bug with regressions is a known regressor (collected even for
        # bugs outside the set of interest).
        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    logger.info("Retrieve Phabricator revisions linked to commits...")
    revision_ids = set(
        filter(None, (repository.get_revision_id(commit) for commit in commits))
    )

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    # FIX: bind unconditionally. It was previously only assigned inside the
    # `meta_bugs is not None` branch, but it is read unconditionally below
    # ("meta_ids"), so calling go(bugs) without meta_bugs raised NameError.
    blocker_to_meta = collections.defaultdict(set)
    if meta_bugs is not None:
        # Invert the blocking relation: blocker bug id -> meta bugs it blocks.
        for meta_bug in meta_bugs:
            if meta_bug not in bug_map:
                continue

            for blocker_bug_id in bugzilla.find_blocking(
                bug_map, bug_map[meta_bug]
            ):
                blocker_to_meta[blocker_bug_id].add(meta_bug)

    # TODO: Use past regressions by function information too (maybe first by function and if no results by file? or prioritize function and recentness?)

    def _download_past_bugs(url: str) -> dict:
        # Download (with ETag caching) and decompress a zstd-compressed JSON
        # file of past-bug data; `[:-4]` strips the ".zst" suffix.
        path = os.path.join("data", os.path.basename(url)[:-4])
        download_check_etag(url, path=f"{path}.zst")
        zstd_decompress(path)
        assert os.path.exists(path)
        with open(path, "r") as f:
            return json.load(f)

    past_regressions_by_file = _download_past_bugs(PAST_REGRESSIONS_BY_FILE_URL)
    past_fixed_bugs_by_file = _download_past_bugs(PAST_FIXED_BUGS_BY_FILE_URL)
    past_regression_blocked_bugs_by_file = _download_past_bugs(
        PAST_REGRESSION_BLOCKED_BUGS_BY_FILE_URL
    )
    past_fixed_bug_blocked_bugs_by_file = _download_past_bugs(
        PAST_FIXED_BUG_BLOCKED_BUGS_BY_FILE_URL
    )

    def component_histogram(bugs: List[dict]) -> Dict[str, float]:
        # Relative frequency of each bug component, most common first.
        counter = collections.Counter(bug["component"] for bug in bugs)
        return {
            component: count / len(bugs)
            for component, count in counter.most_common()
        }

    # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
    commits.sort(key=lambda x: x["bug_id"])

    commit_groups = []
    for bug_id, commit_iter in itertools.groupby(commits, lambda x: x["bug_id"]):
        # TODO: Figure out what to do with bugs we couldn't download (security bugs).
        if bug_id not in bug_map:
            continue

        # Restore push order inside the group.
        commit_list = list(commit_iter)
        commit_list.sort(key=lambda x: hash_to_rev[x["node"]])

        # Find previous regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were fixed by touching the same files as these commits.
        # And find previous bugs that were blocked by regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were blocked by bugs that were fixed by touching the same files as those touched by these commits.
        prev_regressions: List[Dict[str, Any]] = []
        prev_fixed_bugs: List[Dict[str, Any]] = []
        prev_regression_blocked_bugs: List[Dict[str, Any]] = []
        prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = []
        for commit in commit_list:
            for path in commit["files"]:
                if path in past_regressions_by_file:
                    prev_regressions.extend(
                        bug_summary
                        for bug_summary in past_regressions_by_file[path]
                    )

                if path in past_fixed_bugs_by_file:
                    prev_fixed_bugs.extend(
                        bug_summary
                        for bug_summary in past_fixed_bugs_by_file[path]
                    )

                if path in past_regression_blocked_bugs_by_file:
                    prev_regression_blocked_bugs.extend(
                        bug_summary
                        for bug_summary in past_regression_blocked_bugs_by_file[path]
                    )

                if path in past_fixed_bug_blocked_bugs_by_file:
                    prev_fixed_bug_blocked_bugs.extend(
                        bug_summary
                        for bug_summary in past_fixed_bug_blocked_bugs_by_file[path]
                    )

        prev_regressions = _deduplicate(prev_regressions)
        prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
        prev_regression_blocked_bugs = _deduplicate(prev_regression_blocked_bugs)
        prev_fixed_bug_blocked_bugs = _deduplicate(prev_fixed_bug_blocked_bugs)

        regression_components = component_histogram(prev_regressions)
        fixed_bugs_components = component_histogram(prev_fixed_bugs)
        regression_blocked_bug_components = component_histogram(
            prev_regression_blocked_bugs
        )
        fixed_bug_blocked_bug_components = component_histogram(
            prev_fixed_bug_blocked_bugs
        )

        # Evaluate risk of commits associated to this bug.
        probs = self.regressor_model.classify(commit_list, probabilities=True)

        commits_data = []
        for i, commit in enumerate(commit_list):
            revision_id = repository.get_revision_id(commit)
            if revision_id in revision_map:
                testing = phabricator.get_testing_project(revision_map[revision_id])

                if testing is None:
                    # NOTE(review): a sibling helper uses "missing" for this
                    # same case — confirm which sentinel consumers expect.
                    testing = "none"
            else:
                testing = None

            commits_data.append(
                {
                    "id": commit["node"],
                    "testing": testing,
                    # Presumably the positive (risky) class probability.
                    "risk": float(probs[i][1]),
                    "backedout": bool(commit["backedoutby"]),
                    "regressor": commit["bug_id"] in regressor_bug_ids,
                }
            )

        bug = bug_map[bug_id]

        commit_groups.append(
            {
                "id": bug_id,
                "versions": bugzilla.get_fixed_versions(bug),
                "component": "{}::{}".format(bug["product"], bug["component"]),
                "summary": bug["summary"],
                # Landing date of the group: latest push date among its commits.
                "date": max(
                    dateutil.parser.parse(commit["pushdate"])
                    for commit in commit_list
                ).strftime("%Y-%m-%d"),
                "commits": commits_data,
                "meta_ids": list(blocker_to_meta[bug_id]),
                # Keep only the last three historical bugs of each kind.
                "prev_regressions": prev_regressions[-3:],
                "prev_fixed_bugs": prev_fixed_bugs[-3:],
                "prev_regression_blocked_bugs": prev_regression_blocked_bugs[-3:],
                "prev_fixed_bug_blocked_bugs": prev_fixed_bug_blocked_bugs[-3:],
                "most_common_regression_components": regression_components,
                "most_common_fixed_bugs_components": fixed_bugs_components,
                "most_common_regression_blocked_bug_components": regression_blocked_bug_components,
                "most_common_fixed_bug_blocked_bug_components": fixed_bug_blocked_bug_components,
            }
        )

    landings_by_date = collections.defaultdict(list)
    for commit_group in commit_groups:
        landings_by_date[commit_group["date"]].append(commit_group)

    with open("landings_by_date.json", "w") as f:
        output: dict = {
            "landings": landings_by_date,
        }
        if meta_bugs is not None:
            output["featureMetaBugs"] = [
                {"id": meta_bug, "summary": bug_map[meta_bug]["summary"]}
                for meta_bug in meta_bugs
            ]

        json.dump(output, f)
def go(self, days_start: int, days_end: int) -> None:
    """Print testing-policy tag statistics for commits landed in the given
    window: overall tag frequencies, frequencies for backed-out commits, and
    frequencies for commits whose bugs caused regressions.
    """
    commits = self.get_landed_since(days_start, days_end)

    logger.info("Retrieve Phabricator revisions linked to commits...")
    # Commits with no linked revision yield a falsy id and are dropped.
    revision_ids = set(
        filter(None, (repository.get_revision_id(commit) for commit in commits))
    )

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(
        commit["bug_id"] for commit in commits if commit["bug_id"]
    )

    # Filter-out commits with no Phabricator revision linked to them, or with no testing tags.
    commits = [
        commit
        for commit in commits
        if repository.get_revision_id(commit) in revision_map
    ]
    logger.info(f"{len(commits)} revisions")

    # Filter-out commits with no testing tags.
    commits = [
        commit
        for commit in commits
        if phabricator.get_testing_project(
            revision_map[repository.get_revision_id(commit)]
        )
        is not None
    ]
    logger.info(f"{len(commits)} revisions with testing tags")

    def list_testing_projects(
        commits: Iterable[repository.CommitDict],
    ) -> Collection[str]:
        # Testing tag of each commit's linked revision; falsy tags dropped.
        return list(
            filter(
                None,
                (
                    phabricator.get_testing_project(
                        revision_map[repository.get_revision_id(commit)]
                    )
                    for commit in commits
                ),
            )
        )

    testing_projects = list_testing_projects(commits)

    print(f"Most common testing tags (in {len(commits)} revisions):")
    for testing_project, count in collections.Counter(
        testing_projects
    ).most_common():
        print(
            f"{testing_project} - {round(100 * count / len(testing_projects), 1)}%"
        )

    backedout_commits = [commit for commit in commits if commit["backedoutby"]]
    backedout_testing_projects = list_testing_projects(backedout_commits)

    print(
        f"\nMost common testing tags for backed-out revisions (in {len(backedout_commits)} revisions):"
    )
    for testing_project, count in collections.Counter(
        backedout_testing_projects
    ).most_common():
        print(
            f"{testing_project} - {round(100 * count / len(backedout_testing_projects), 1)}%"
        )

    # Bugs that caused at least one regression.
    regressor_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if len(bug["regressions"]) > 0
    }

    regressor_commits = [
        commit for commit in commits if commit["bug_id"] in regressor_bug_ids
    ]
    regressor_testing_projects = list_testing_projects(regressor_commits)

    print(
        f"\nMost common testing tags for revisions which caused regressions (in {len(regressor_commits)} revisions):"
    )
    for testing_project, count in collections.Counter(
        regressor_testing_projects
    ).most_common():
        print(
            f"{testing_project} - {round(100 * count / len(regressor_testing_projects), 1)}%"
        )