def download_issues(self) -> None: # Fetches all issues sorted by date of creation in ascending order url = "https://api.github.com/repos/{}/{}/issues".format( self.owner, self.repo) start_page = self.get_start_page() params = { "state": self.state, "sort": "created", "direction": "asc", "per_page": PER_PAGE, "page": start_page, } data, response_links = self.fetch_issues( url=url, retrieve_events=self.retrieve_events, params=params) db.append(self.db_path, data) # Fetch next page while "next" in response_links.keys(): next_page_data, response_links = self.fetch_issues( response_links["next"]["url"], self.retrieve_events) db.append(self.db_path, next_page_data) logger.info("Done downloading")
def get_commits_to_ignore(self): logger.info("Download previous commits to ignore...") if db.is_old_version( IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB): db.download(IGNORED_COMMITS_DB, force=True) logger.info("Get previously classified commits...") prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB)) logger.info( f"Already found {len(prev_commits_to_ignore)} commits to ignore..." ) if len(prev_commits_to_ignore) > 0: rev_start = "children({})".format( prev_commits_to_ignore[-1]["rev"]) else: rev_start = 0 # 2 days more than the end date, so we can know if a commit was backed-out. # We have to do this as recent commits might be missing in the mercurial <-> git map, # otherwise we could just use "tip". end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(2) with hglib.open(self.mercurial_repo_dir) as hg: revs = repository.get_revs( hg, rev_start, "pushdate('{}')".format(end_date.strftime("%Y-%m-%d"))) # Given that we use the pushdate, there might be cases where the starting commit is returned too (e.g. if we rerun the task on the same day). if len(prev_commits_to_ignore) > 0: found_prev = -1 for i, rev in enumerate(revs): if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]: found_prev = i break revs = revs[found_prev + 1:] commits = repository.hg_log_multi(self.mercurial_repo_dir, revs) repository.set_commits_to_ignore(self.mercurial_repo_dir, commits) commits_to_ignore = [] for commit in commits: if commit.ignored or commit.backedoutby: commits_to_ignore.append({ "rev": commit.node, "type": "backedout" if commit.backedoutby else "", }) logger.info(f"{len(commits_to_ignore)} new commits to ignore...") logger.info("...of which {} are backed-out".format( sum(1 for commit in commits_to_ignore if commit["type"] == "backedout"))) db.append(IGNORED_COMMITS_DB, commits_to_ignore) zstd_compress(IGNORED_COMMITS_DB) return prev_commits_to_ignore + commits_to_ignore
def download_issues(owner: str, repo: str, state: str, retrieve_events: bool = False) -> None: # Fetches all issues sorted by date of creation in ascending order url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo) start_page = get_start_page() params = { "state": state, "sort": "created", "direction": "asc", "per_page": PER_PAGE, "page": start_page, } data, response_links = fetch_issues(url=url, retrieve_events=retrieve_events, params=params) db.append(GITHUB_ISSUES_DB, data) # Fetch next page while "next" in response_links.keys(): next_page_data, response_links = fetch_issues( response_links["next"]["url"], retrieve_events) db.append(GITHUB_ISSUES_DB, next_page_data) logger.info("Done downloading")
def download_bugs(bug_ids, products=None, security=False): old_bug_count = 0 old_bugs = [] new_bug_ids = set(int(bug_id) for bug_id in bug_ids) for bug in get_bugs(): old_bug_count += 1 if int(bug['id']) in new_bug_ids: old_bugs.append(bug) new_bug_ids.remove(bug['id']) print(f'Loaded {old_bug_count} bugs.') print(f'To download {len(new_bug_ids)} bugs.') new_bug_ids = sorted(list(new_bug_ids)) total_downloaded = 0 chunks = (new_bug_ids[i:(i + 500)] for i in range(0, len(new_bug_ids), 500)) for chunk in chunks: new_bugs = _download(chunk) total_downloaded += len(new_bugs) print(f'Downloaded {total_downloaded} bugs') if not security: new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0} if products is not None: new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if bug['product'] in products} db.append(BUGS_DB, new_bugs.values())
def go(days: int) -> None: logger.info("Download previous shadow scheduler statistics...") db.download(SHADOW_SCHEDULER_STATS_DB) logger.info("Get previously gathered statistics...") prev_scheduler_stat_revs = set( scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)) logger.info( f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..." ) to_date = datetime.utcnow() - relativedelta(days=3) from_date = to_date - relativedelta(days=days) pushes = mozci.push.make_push_objects( from_date=from_date.strftime("%Y-%m-%d"), to_date=to_date.strftime("%Y-%m-%d"), branch="autoland", ) pushes = [ push for push in pushes if push.rev not in prev_scheduler_stat_revs ] logger.info(f"{len(pushes)} left to analyze") db.append(SHADOW_SCHEDULER_STATS_DB, analyze_shadow_schedulers(pushes)) utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
def download_commits(repo_dir, rev_start=0, ret=False, save=True): hg = hglib.open(repo_dir) revs = get_revs(hg, rev_start) assert ( len(revs) > 0 ), "There should definitely be more than 0 commits, something is wrong" first_pushdate = hg_log(hg, [b"0"])[0].pushdate hg.close() processes = multiprocessing.cpu_count() print(f"Mining {len(revs)} commits using {processes} processes...") CHUNK_SIZE = 256 revs_groups = [ revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE) ] with concurrent.futures.ProcessPoolExecutor( initializer=_init, initargs=(repo_dir, )) as executor: commits = executor.map(_hg_log, revs_groups, chunksize=20) commits = tqdm(commits, total=len(revs_groups)) commits = list(itertools.chain.from_iterable(commits)) print("Downloading file->component mapping...") download_component_mapping() commits_to_ignore = get_commits_to_ignore(repo_dir, commits) print(f"{len(commits_to_ignore)} commits to ignore") calculate_experiences(commits, commits_to_ignore, first_pushdate, save) # Exclude commits to ignore. commits = [commit for commit in commits if commit not in commits_to_ignore] commits_num = len(commits) print(f"Mining {commits_num} commits using {processes} processes...") global rs_parsepatch import rs_parsepatch with concurrent.futures.ProcessPoolExecutor( initializer=_init, initargs=(repo_dir, )) as executor: commits = executor.map(_transform, commits, chunksize=64) commits = tqdm(commits, total=commits_num) if ret: commits = list(commits) if save: db.append(COMMITS_DB, commits) if ret: return commits
def download_bugs(bug_ids, products=None, security=False): old_bug_count = 0 old_bugs = [] new_bug_ids = set(int(bug_id) for bug_id in bug_ids) for bug in get_bugs(): old_bug_count += 1 if int(bug['id']) in new_bug_ids: old_bugs.append(bug) new_bug_ids.remove(bug['id']) print(f'Loaded {old_bug_count} bugs.') new_bug_ids = sorted(list(new_bug_ids)) CHUNK_SIZE = 100 chunks = (new_bug_ids[i:(i + CHUNK_SIZE)] for i in range(0, len(new_bug_ids), CHUNK_SIZE)) with tqdm(total=len(new_bug_ids)) as progress_bar: for chunk in chunks: new_bugs = _download(chunk) progress_bar.update(len(chunk)) if not security: new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0} if products is not None: new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if bug['product'] in products} db.append(BUGS_DB, new_bugs.values())
def retrieve_issues(self, owner: str, repo: str, state: str, retrieve_events: bool) -> None: last_modified = None db.download(github.GITHUB_ISSUES_DB) try: last_modified = db.last_modified(github.GITHUB_ISSUES_DB) except Exception: pass if last_modified: logger.info( f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}" ) data = github.fetch_issues_updated_since_timestamp( owner, repo, state, last_modified.isoformat(), retrieve_events) updated_ids = set(issue["id"] for issue in data) logger.info( "Deleting issues that were changed since the last run and saving updates" ) github.delete_issues(lambda issue: issue["id"] in updated_ids) db.append(github.GITHUB_ISSUES_DB, data) logger.info("Updating finished") else: logger.info( "Retrieving all issues since last_modified is not available") github.download_issues(owner, repo, state, retrieve_events) zstd_compress(github.GITHUB_ISSUES_DB)
def test_bad_format_compression(tmp_path, db_name): db_path = tmp_path / db_name db.register(db_path, "https://alink", 1) with pytest.raises(AssertionError): db.write(db_path, range(7)) with pytest.raises(AssertionError): db.append(db_path, range(7))
def test_bad_format_compression(tmp_path, db_name): db_path = tmp_path / db_name db.register(db_path, "https://alink") with pytest.raises(AssertionError): db.write(db_path, range(7)) with pytest.raises(AssertionError): db.append(db_path, range(7))
def test_append(mock_db, db_format, db_compression): db_path = mock_db(db_format, db_compression) db.write(db_path, range(1, 4)) assert list(db.read(db_path)) == [1, 2, 3] db.append(db_path, range(4, 8)) assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def download_commits(repo_dir, rev_start=0, save=True, use_single_process=False): with hglib.open(repo_dir) as hg: revs = get_revs(hg, rev_start) if len(revs) == 0: print("No commits to analyze") return [] first_pushdate = get_first_pushdate(repo_dir) print(f"Mining {len(revs)} commits...") if not use_single_process: print(f"Using {os.cpu_count()} processes...") commits = hg_log_multi(repo_dir, revs) else: with hglib.open(repo_dir) as hg: commits = hg_log(hg, revs) print("Downloading file->component mapping...") download_component_mapping() set_commits_to_ignore(repo_dir, commits) commits_num = len(commits) print(f"Mining {commits_num} commits...") global rs_parsepatch import rs_parsepatch global code_analysis_server code_analysis_server = rust_code_analysis_server.RustCodeAnalysisServer() if not use_single_process: with concurrent.futures.ProcessPoolExecutor( initializer=_init_process, initargs=(repo_dir,) ) as executor: commits = executor.map(_transform, commits, chunksize=64) commits = tqdm(commits, total=commits_num) commits = list(commits) else: with hglib.open(repo_dir) as hg: commits = [transform(hg, repo_dir, c) for c in commits] code_analysis_server.terminate() calculate_experiences(commits, first_pushdate, save) commits = [commit.to_dict() for commit in commits if not commit.ignored] if save: db.append(COMMITS_DB, commits) return commits
def test_unregistered_db(tmp_path): db_path = tmp_path / "prova.json" with pytest.raises(AssertionError): list(db.read(db_path)) with pytest.raises(AssertionError): db.write(db_path, range(7)) with pytest.raises(AssertionError): db.append(db_path, range(7))
def retrieve_issues(self) -> None: last_modified = None db.download(self.github.db_path) try: last_modified = db.last_modified(self.github.db_path) except db.LastModifiedNotAvailable: pass if last_modified: logger.info( f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}" ) data = self.github.fetch_issues_updated_since_timestamp( last_modified.isoformat()) if self.retrieve_private: logger.info( "Replacing contents of auto closed public issues with private issues content" ) self.replace_with_private(data) updated_ids = set(issue["id"] for issue in data) logger.info( "Deleting issues that were changed since the last run and saving updates" ) self.github.delete_issues(lambda issue: issue["id"] in updated_ids) db.append(self.github.db_path, data) logger.info("Updating finished") else: logger.info( "Retrieving all issues since last_modified is not available") self.github.download_issues() if self.retrieve_private: logger.info( "Replacing contents of auto closed public issues with private issues content" ) all_issues = list(self.github.get_issues()) updated_issues, updated_ids = self.replace_with_private( all_issues) logger.info( "Deleting public issues that were updated and saving updates" ) self.github.delete_issues( lambda issue: issue["id"] in updated_ids) db.append(self.github.db_path, updated_issues) zstd_compress(self.github.db_path)
def test_append_compressed(tmp_path): db_path = tmp_path / 'prova.json.gz' db.register(db_path, 'https://alink', 1) db.write(db_path, range(1, 4)) assert list(db.read(db_path)) == [1, 2, 3] db.append(db_path, range(4, 8)) assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def download_modified_revisions(): try: last_modified = db.last_modified(REVISIONS_DB) except LastModifiedNotAvailable: return modified_revisions = get(modified_start=last_modified) modified_revision_ids = set(rev["id"] for rev in modified_revisions) db.delete(REVISIONS_DB, lambda revision: revision["id"] in modified_revision_ids) db.append(REVISIONS_DB, modified_revisions)
def download_commits(repo_dir, rev_start=0, ret=False, save=True): hg = hglib.open(repo_dir) revs = get_revs(hg, rev_start) if len(revs) == 0: print("No commits to analyze") return [] first_pushdate = hg_log(hg, [b"0"])[0].pushdate hg.close() print(f"Mining {len(revs)} commits using {os.cpu_count()} processes...") commits = hg_log_multi(repo_dir, revs) print("Downloading file->component mapping...") download_component_mapping() commits_to_ignore = get_commits_to_ignore(repo_dir, commits) print(f"{len(commits_to_ignore)} commits to ignore") calculate_experiences(commits, commits_to_ignore, first_pushdate, save) # Exclude commits to ignore. commits = [commit for commit in commits if commit not in commits_to_ignore] commits_num = len(commits) print(f"Mining {commits_num} commits using {os.cpu_count()} processes...") global rs_parsepatch import rs_parsepatch with concurrent.futures.ProcessPoolExecutor( initializer=_init, initargs=(repo_dir, )) as executor: commits = executor.map(_transform, commits, chunksize=64) commits = tqdm(commits, total=commits_num) if ret: commits = list(commits) if save: db.append(COMMITS_DB, commits) if ret: return commits
def go(days: int) -> None: logger.info("Download previous shadow scheduler statistics...") db.download(SHADOW_SCHEDULER_STATS_DB) logger.info("Get previously gathered statistics...") prev_scheduler_stat_revs = set( scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)) logger.info( f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..." ) to_date = datetime.utcnow() - relativedelta(days=3) from_date = to_date - relativedelta(days=days) pushes = mozci.push.make_push_objects( from_date=from_date.strftime("%Y-%m-%d"), to_date=to_date.strftime("%Y-%m-%d"), branch="autoland", ) pushes = [ push for push in pushes if push.rev not in prev_scheduler_stat_revs ] logger.info(f"{len(pushes)} left to analyze") def compress_and_upload() -> None: utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB) db.upload(SHADOW_SCHEDULER_STATS_DB) def results() -> Iterator[dict]: for i, push in enumerate(tqdm(pushes)): try: yield analyze_shadow_schedulers(push) except Exception: traceback.print_exc() # Upload every 42 pushes. if (i + 1) % 42 == 0: compress_and_upload() db.append(SHADOW_SCHEDULER_STATS_DB, results()) compress_and_upload()
def download_bugs(bug_ids, products=None, security=False): old_bug_count = 0 new_bug_ids = set(int(bug_id) for bug_id in bug_ids) for bug in get_bugs(include_invalid=True): old_bug_count += 1 if int(bug["id"]) in new_bug_ids: new_bug_ids.remove(bug["id"]) print(f"Loaded {old_bug_count} bugs.") new_bug_ids = sorted(list(new_bug_ids)) CHUNK_SIZE = 100 chunks = (new_bug_ids[i:(i + CHUNK_SIZE)] for i in range(0, len(new_bug_ids), CHUNK_SIZE)) @tenacity.retry( stop=tenacity.stop_after_attempt(7), wait=tenacity.wait_exponential(multiplier=1, min=16, max=64), ) def get_chunk(chunk): new_bugs = get(chunk) if not security: new_bugs = [ bug for bug in new_bugs.values() if len(bug["groups"]) == 0 ] if products is not None: new_bugs = [ bug for bug in new_bugs.values() if bug["product"] in products ] return new_bugs with tqdm(total=len(new_bug_ids)) as progress_bar: for chunk in chunks: new_bugs = get_chunk(chunk) progress_bar.update(len(chunk)) db.append(BUGS_DB, new_bugs)
def download_bugs(bug_ids: Iterable[int], security: bool = False) -> List[BugDict]: old_bug_count = 0 new_bug_ids_set = set(int(bug_id) for bug_id in bug_ids) for bug in get_bugs(include_invalid=True): old_bug_count += 1 new_bug_ids_set.discard(int(bug["id"])) print(f"Loaded {old_bug_count} bugs.") new_bug_ids = sorted(list(new_bug_ids_set)) chunks = ( new_bug_ids[i:(i + Bugzilla.BUGZILLA_CHUNK_SIZE)] for i in range(0, len(new_bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE)) @tenacity.retry( stop=tenacity.stop_after_attempt(7), wait=tenacity.wait_exponential(multiplier=1, min=16, max=64), ) def get_chunk(chunk: List[int]) -> List[BugDict]: new_bugs = get(chunk) if not security: new_bugs = [ bug for bug in new_bugs.values() if len(bug["groups"]) == 0 ] return new_bugs all_new_bugs = [] with tqdm(total=len(new_bug_ids)) as progress_bar: for chunk in chunks: new_bugs = get_chunk(chunk) progress_bar.update(len(chunk)) db.append(BUGS_DB, new_bugs) all_new_bugs += new_bugs return all_new_bugs
def download_bugs(bug_ids, products=None, security=False): old_bug_count = 0 old_bugs = [] new_bug_ids = set(int(bug_id) for bug_id in bug_ids) for bug in get_bugs(): old_bug_count += 1 if int(bug["id"]) in new_bug_ids: old_bugs.append(bug) new_bug_ids.remove(bug["id"]) print(f"Loaded {old_bug_count} bugs.") new_bug_ids = sorted(list(new_bug_ids)) CHUNK_SIZE = 100 chunks = ( new_bug_ids[i : (i + CHUNK_SIZE)] for i in range(0, len(new_bug_ids), CHUNK_SIZE) ) with tqdm(total=len(new_bug_ids)) as progress_bar: for chunk in chunks: new_bugs = _download(chunk) progress_bar.update(len(chunk)) if not security: new_bugs = { bug_id: bug for bug_id, bug in new_bugs.items() if len(bug["groups"]) == 0 } if products is not None: new_bugs = { bug_id: bug for bug_id, bug in new_bugs.items() if bug["product"] in products } db.append(BUGS_DB, new_bugs.values())
def download_revisions(rev_ids: Collection[int]) -> None: old_rev_count = 0 new_rev_ids = set(int(rev_id) for rev_id in rev_ids) for rev in get_revisions(): old_rev_count += 1 if rev["id"] in new_rev_ids: new_rev_ids.remove(rev["id"]) print(f"Loaded {old_rev_count} revisions.") new_rev_ids_list = sorted(list(new_rev_ids)) rev_ids_groups = (new_rev_ids_list[i:i + 100] for i in range(0, len(new_rev_ids_list), 100)) with tqdm(total=len(new_rev_ids)) as progress_bar: for rev_ids_group in rev_ids_groups: revisions = get(rev_ids_group) progress_bar.update(len(rev_ids_group)) db.append(REVISIONS_DB, revisions)
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json"), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) logger.info(f"push data nodes: {len(push_data)}") HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["rev"] else: last_node = None past_failures = shelve.open( "data/past_failures.shelve", protocol=pickle.HIGHEST_PROTOCOL, writeback=True, ) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] key = f"{type_}${task}$" for item in items: full_key = key + item if full_key not in past_failures: cur = past_failures[full_key] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0) else: cur = past_failures[full_key] value = cur[push_num] values_total.append(value) values_prev_7.append(value - cur[push_num - 7]) values_prev_14.append(value - cur[push_num - 14]) values_prev_28.append(value - cur[push_num - 28]) values_prev_56.append(value - cur[push_num - 56]) if is_regression: cur[push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num commits_with_data = set() saved_nodes = set() # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if len(commits_with_data) % 1000 == 0: past_failures.sync() if node == last_node: can_start = True continue if not can_start: continue if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue is_regression = (task in commit_push_data[1] or task in commit_push_data[2]) ( total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures, ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression) ( total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures, ) = get_and_update_past_failures("type", task, commit_data["types"], push_num, is_regression) ( total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures, ) = get_and_update_past_failures("file", task, commit_data["files"], push_num, is_regression) ( total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures, ) = get_and_update_past_failures( "directory", task, commit_data["directories"], push_num, is_regression, ) ( total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures, ) = get_and_update_past_failures( "component", task, commit_data["components"], push_num, is_regression, ) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } # We no longer need the push data for this node, we can free 
the memory. del push_data[node] push_num += 1 logger.info( f"commits linked to push data: {len(commits_with_data)}") logger.info(f"saved push data nodes: {len(saved_nodes)}") db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) past_failures["push_num"] = push_num past_failures.close() zstd_compress("data/past_failures.shelve")
def find_bug_introducing_commits( self, bug_fixing_commits, commits_to_ignore, tokenized ): if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB repo_dir = self.tokenized_git_repo_dir else: db_path = BUG_INTRODUCING_COMMITS_DB repo_dir = self.git_repo_dir def git_to_mercurial(rev): if tokenized: return self.tokenized_git_to_mercurial[rev] else: return vcs_map.git_to_mercurial(rev) def mercurial_to_git(rev): if tokenized: return self.mercurial_to_tokenized_git[rev] else: return vcs_map.mercurial_to_git(rev) logger.info("Download previously found bug-introducing commits...") if db.is_old_version(db_path) or not db.exists(db_path): db.download(db_path, force=True) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits ) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines( "{}\n".format(mercurial_to_git(commit["rev"])) for commit in commits_to_ignore if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git ) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) # Analyze up to 500 commits at a time, to avoid the task running out of time. done = True if len(bug_fixing_commits) > 500: bug_fixing_commits = bug_fixing_commits[-500:] done = False with open("done", "w") as f: f.write(str(1 if done else 0)) def _init(git_repo_dir): global GIT_REPO GIT_REPO = GitRepository(git_repo_dir) def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["rev"])) git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"]) commit = GIT_REPO.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: return [None] bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines( commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore") ) logger.info( "Found {} for {}".format( bug_introducing_modifications, bug_fixing_commit["rev"] ) ) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values(): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append( { "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": git_to_mercurial( bug_introducing_hash ), } ) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). 
if not str(e).startswith("Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. if len(bug_introducing_commits) == 0: bug_introducing_commits.append( { "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", } ) return bug_introducing_commits with concurrent.futures.ThreadPoolExecutor( initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1 ) as executor: bug_introducing_commits = executor.map(find_bic, bug_fixing_commits) bug_introducing_commits = tqdm( bug_introducing_commits, total=len(bug_fixing_commits) ) bug_introducing_commits = list( itertools.chain.from_iterable(bug_introducing_commits) ) total_results_num = len(bug_introducing_commits) bug_introducing_commits = list(filter(None, bug_introducing_commits)) logger.info( f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big" ) db.append(db_path, bug_introducing_commits) compress_file(db_path)
def download_commits( repo_dir: str, rev_start: str = None, revs: List[bytes] = None, save: bool = True, use_single_process: bool = False, include_no_bug: bool = False, include_backouts: bool = False, include_ignored: bool = False, ) -> Tuple[CommitDict, ...]: assert revs is not None or rev_start is not None with hglib.open(repo_dir) as hg: if revs is None: revs = get_revs(hg, rev_start) if len(revs) == 0: logger.info("No commits to analyze") return tuple() first_pushdate = get_first_pushdate(repo_dir) logger.info(f"Mining {len(revs)} commits...") if not use_single_process: logger.info(f"Using {os.cpu_count()} processes...") commits = hg_log_multi(repo_dir, revs) else: commits = hg_log(hg, revs) if save or not os.path.exists("data/component_mapping.lmdb"): logger.info("Downloading file->component mapping...") download_component_mapping() set_commits_to_ignore(hg, repo_dir, commits) commits_num = len(commits) logger.info(f"Mining {commits_num} patches...") global code_analysis_server code_analysis_server = rust_code_analysis_server.RustCodeAnalysisServer( ) if not use_single_process: with concurrent.futures.ProcessPoolExecutor( initializer=_init_process, initargs=(repo_dir, )) as executor: commits = executor.map(_transform, commits, chunksize=64) commits = tqdm(commits, total=commits_num) commits = tuple(commits) else: get_component_mapping() commits = tuple(transform(hg, repo_dir, c) for c in commits) close_component_mapping() code_analysis_server.terminate() calculate_experiences(commits, first_pushdate, save) logger.info("Applying final commits filtering...") commits = tuple(commit.to_dict() for commit in commits) if save: db.append(COMMITS_DB, commits) return tuple( filter_commits( commits, include_no_bug=include_no_bug, include_backouts=include_backouts, include_ignored=include_ignored, ))
def find_bug_introducing_commits(self, repo_dir, tokenized): from pydriller import GitRepository from pydriller.domain.commit import ModificationType logger.info("Download commits to ignore...") assert db.download(IGNORED_COMMITS_DB) commits_to_ignore = list(db.read(IGNORED_COMMITS_DB)) logger.info("Download bug-fixing classifications...") assert db.download(BUG_FIXING_COMMITS_DB) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB) if bug_fixing_commit["type"] in ["r", "d"] ] if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB else: db_path = BUG_INTRODUCING_COMMITS_DB def git_to_mercurial(rev): if tokenized: return self.tokenized_git_to_mercurial[rev] else: return vcs_map.git_to_mercurial(repo_dir, rev) def mercurial_to_git(rev): if tokenized: return self.mercurial_to_tokenized_git[rev] else: return vcs_map.mercurial_to_git(repo_dir, rev) logger.info("Download previously found bug-introducing commits...") db.download(db_path) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines("{}\n".format(mercurial_to_git(commit["rev"])) for commit in commits_to_ignore if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) git_init_lock = threading.Lock() def _init(git_repo_dir): with git_init_lock: thread_local.git = GitRepository(git_repo_dir) # Call get_head in order to make pydriller initialize the repository. thread_local.git.get_head() def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["rev"])) git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"]) commit = thread_local.git.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. 
if len(commit.modifications) > MAX_MODIFICATION_NUMBER: logger.info("Skipping {} as it is too big".format( bug_fixing_commit["rev"])) return None def get_modification_path(mod): path = mod.new_path if (mod.change_type == ModificationType.RENAME or mod.change_type == ModificationType.DELETE): path = mod.old_path return path bug_introducing_modifications = {} for modification in commit.modifications: if (get_modification_path(modification) == "testing/web-platform/meta/MANIFEST.json"): continue bug_introducing_modifications.update( thread_local.git.get_commits_last_modified_lines( commit, modification=modification, hashes_to_ignore_path=os.path.realpath( "git_hashes_to_ignore"), )) logger.info("Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values( ): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": git_to_mercurial(bug_introducing_hash), }) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). if not str(e).startswith( "Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. if len(bug_introducing_commits) == 0: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", }) return bug_introducing_commits def compress_and_upload(): zstd_compress(db_path) db.upload(db_path) workers = os.cpu_count() + 1 logger.info( f"Analyzing {len(bug_fixing_commits)} commits using {workers} workers..." ) with concurrent.futures.ThreadPoolExecutor( initializer=_init, initargs=(repo_dir, ), max_workers=workers) as executor: def results(): start_time = time.monotonic() futures = { executor.submit(find_bic, bug_fixing_commit): bug_fixing_commit["rev"] for bug_fixing_commit in bug_fixing_commits } for future in tqdm( concurrent.futures.as_completed(futures), total=len(futures), ): exc = future.exception() if exc is not None: logger.info( f"Exception {exc} while analyzing {futures[future]}" ) for f in futures: f.cancel() result = future.result() if result is not None: yield from result if time.monotonic() - start_time >= 3600: compress_and_upload() start_time = time.monotonic() db.append(db_path, results()) compress_and_upload()
def find_bug_fixing_commits(self): logger.info("Downloading commits database...") if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) logger.info("Downloading bugs database...") if db.is_old_version( bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB): db.download(bugzilla.BUGS_DB, force=True) logger.info("Download previous classifications...") if db.is_old_version( BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB): db.download(BUG_FIXING_COMMITS_DB, force=True) logger.info("Get previously classified commits...") prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB)) prev_bug_fixing_commits_nodes = set( bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits) logger.info( f"Already classified {len(prev_bug_fixing_commits)} commits...") # TODO: Switch to the pure Defect model, as it's better in this case. logger.info("Downloading defect/enhancement/task model...") download_model("defectenhancementtask") defect_model = DefectEnhancementTaskModel.load( "defectenhancementtaskmodel") logger.info("Downloading regression model...") download_model("regression") regression_model = RegressionModel.load("regressionmodel") start_date = datetime.now() - RELATIVE_START_DATE end_date = datetime.now() - RELATIVE_END_DATE logger.info( f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..." ) commit_map = defaultdict(list) for commit in repository.get_commits(): if commit["node"] in prev_bug_fixing_commits_nodes: continue commit_date = dateutil.parser.parse(commit["pushdate"]) if commit_date < start_date or commit_date > end_date: continue commit_map[commit["bug_id"]].append(commit["node"]) logger.info( f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits" ) assert len(commit_map) > 0 def get_relevant_bugs(): return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map) bug_count = sum(1 for bug in get_relevant_bugs()) logger.info( f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing" ) known_defect_labels = defect_model.get_labels() known_regression_labels = regression_model.get_labels() bug_fixing_commits = [] def append_bug_fixing_commits(bug_id, type_): for commit in commit_map[bug_id]: bug_fixing_commits.append({"rev": commit, "type": type_}) for bug in tqdm(get_relevant_bugs(), total=bug_count): # Ignore bugs which are not linked to the commits we care about. if bug["id"] not in commit_map: continue # If we know the label already, we don't need to apply the model. if (bug["id"] in known_regression_labels and known_regression_labels[bug["id"]] == 1): append_bug_fixing_commits(bug["id"], "r") continue if bug["id"] in known_defect_labels: if known_defect_labels[bug["id"]] == "defect": append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") continue if defect_model.classify(bug)[0] == "defect": if regression_model.classify(bug)[0] == 1: append_bug_fixing_commits(bug["id"], "r") else: append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits) zstd_compress(BUG_FIXING_COMMITS_DB) bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits return [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["type"] in ["r", "d"] ]
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized): if tokenized: db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB repo_dir = self.tokenized_git_repo_dir else: db_path = BUG_INTRODUCING_COMMITS_DB repo_dir = self.git_repo_dir def git_to_mercurial(rev): if tokenized: return self.tokenized_git_to_mercurial[rev] else: return vcs_map.git_to_mercurial(rev) def mercurial_to_git(rev): if tokenized: return self.mercurial_to_tokenized_git[rev] else: return vcs_map.mercurial_to_git(rev) logger.info("Download previously found bug-introducing commits...") if db.is_old_version(db_path) or not db.exists(db_path): db.download(db_path, force=True) logger.info("Get previously found bug-introducing commits...") prev_bug_introducing_commits = list(db.read(db_path)) prev_bug_introducing_commits_nodes = set( bug_introducing_commit["bug_fixing_rev"] for bug_introducing_commit in prev_bug_introducing_commits) logger.info( f"Already classified {len(prev_bug_introducing_commits)} commits..." ) hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore) with open("git_hashes_to_ignore", "w") as f: f.writelines("{}\n".format(mercurial_to_git(commit["rev"])) for commit in commits_to_ignore if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git) logger.info(f"{len(bug_fixing_commits)} commits to analyze") # Skip already found bug-introducing commits. bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] not in hashes_to_ignore ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" ) if tokenized: bug_fixing_commits = [ bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git ] logger.info( f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash" ) def _init(git_repo_dir): thread_local.git = GitRepository(git_repo_dir) def find_bic(bug_fixing_commit): logger.info("Analyzing {}...".format(bug_fixing_commit["rev"])) git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"]) commit = thread_local.git.get_commit(git_fix_revision) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: logger.info("Skipping {} as it is too big".format( bug_fixing_commit["rev"])) return None bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines( commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")) logger.info("Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])) bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values( ): for bug_introducing_hash in bug_introducing_hashes: try: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": git_to_mercurial(bug_introducing_hash), }) except Exception as e: # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard"). if not str(e).startswith( "Missing git commit in the VCS map"): raise # Add an empty result, just so that we don't reanalyze this again. 
if len(bug_introducing_commits) == 0: bug_introducing_commits.append({ "bug_fixing_rev": bug_fixing_commit["rev"], "bug_introducing_rev": "", }) return bug_introducing_commits with concurrent.futures.ThreadPoolExecutor(initializer=_init, initargs=(repo_dir, ), max_workers=os.cpu_count() + 1) as executor: def results(): num_analyzed = 0 bug_fixing_commits_queue = bug_fixing_commits.copy() # Analyze up to 500 commits at a time, to avoid the task running out of time. while len( bug_fixing_commits_queue) != 0 and num_analyzed != 500: bug_introducing_commit_futures = [] for _ in range( min(500 - num_analyzed, len(bug_fixing_commits))): bug_introducing_commit_futures.append( executor.submit(find_bic, bug_fixing_commits.pop())) logger.info( f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits" ) for future in tqdm( concurrent.futures.as_completed( bug_introducing_commit_futures), total=len(bug_introducing_commit_futures), ): result = future.result() if result is not None: num_analyzed += 1 yield from result with open("done", "w") as f: f.write( str(1 if len(bug_fixing_commits_queue) == 0 else 0)) db.append(db_path, results()) zstd_compress(db_path)
def download_bugs_between(date_from, date_to, security=False, store=True): products = { "Add-on SDK", "Android Background Services", "Core", "Core Graveyard", "DevTools", "DevTools Graveyard", "External Software Affecting Firefox", "Firefox", "Firefox Graveyard", "Firefox Build System", "Firefox for Android", "Firefox for Android Graveyard", # 'Firefox for iOS', "Firefox Health Report", # 'Focus', # 'Hello (Loop)', "NSPR", "NSS", "Toolkit", "Toolkit Graveyard", "WebExtensions", } params = { "f1": "creation_ts", "o1": "greaterthan", "v1": date_from.strftime("%Y-%m-%d"), "f2": "creation_ts", "o2": "lessthan", "v2": date_to.strftime("%Y-%m-%d"), "product": products, } if not security: params["f3"] = "bug_group" params["o3"] = "isempty" params["count_only"] = 1 r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params) r.raise_for_status() count = r.json()["bug_count"] del params["count_only"] params["limit"] = 100 params["order"] = "bug_id" old_bug_ids = set(bug["id"] for bug in get_bugs()) all_bugs = [] with tqdm(total=count) as progress_bar: for offset in range(0, count, Bugzilla.BUGZILLA_CHUNK_SIZE): params["offset"] = offset new_bugs = _download(params) progress_bar.update(Bugzilla.BUGZILLA_CHUNK_SIZE) all_bugs += [bug for bug in new_bugs.values()] if store: db.append( BUGS_DB, (bug for bug_id, bug in new_bugs.items() if bug_id not in old_bug_ids), ) return all_bugs
def generate_test_scheduling_history(self, granularity): push_data_path = f"push_data_{granularity}.json" updated = download_check_etag( test_scheduling.PUSH_DATA_URL.format(granularity=granularity) ) if updated: zstd_decompress(push_data_path) os.remove(f"{push_data_path}.zst") assert os.path.exists(push_data_path), "Decompressed push data file exists" # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS[granularity] ) if granularity == "label": test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_LABEL_DB ) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_LABEL_DB ) elif granularity == "group": test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_GROUP_DB ) touched_together_db = os.path.join( "data", test_scheduling.TOUCHED_TOGETHER_DB ) db.download(test_scheduling_db, support_files_too=True) last_node = None for revs, _ in test_scheduling.get_test_scheduling_history(granularity): last_node = revs[0] def generate_failing_together_probabilities(push_data): # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and # `task2 failure -> task1 failure` separately, as they could be different. count_runs = collections.Counter() count_single_failures = collections.Counter() count_both_failures = collections.Counter() for revisions, tasks, likely_regressions, candidate_regressions in tqdm( push_data ): failures = set(likely_regressions + candidate_regressions) all_tasks = list(set(tasks) | failures) for task1, task2 in itertools.combinations(sorted(all_tasks), 2): count_runs[(task1, task2)] += 1 if task1 in failures: if task2 in failures: count_both_failures[(task1, task2)] += 1 else: count_single_failures[(task1, task2)] += 1 elif task2 in failures: count_single_failures[(task1, task2)] += 1 stats = {} skipped = 0 for couple, run_count in count_runs.most_common(): failure_count = count_both_failures[couple] support = failure_count / run_count if support < 1 / 700: skipped += 1 continue if failure_count != 0: confidence = failure_count / ( count_single_failures[couple] + failure_count ) else: confidence = 0.0 stats[couple] = (support, confidence) logger.info(f"{skipped} couples skipped because their support was too low") logger.info("Redundancies with the highest support and confidence:") for couple, (support, confidence) in sorted( stats.items(), key=lambda k: (-k[1][1], -k[1][0]) )[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." ) logger.info("Redundancies with the highest confidence and lowest support:") for couple, (support, confidence) in sorted( stats.items(), key=lambda k: (-k[1][1], k[1][0]) )[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." 
) failing_together = test_scheduling.get_failing_together_db() count_redundancies = collections.Counter() for couple, (support, confidence) in stats.items(): if confidence == 1.0: count_redundancies["==100%"] += 1 if confidence > 0.9: count_redundancies[">=90%"] += 1 if confidence > 0.8: count_redundancies[">=80%"] += 1 if confidence > 0.7: count_redundancies[">=70%"] += 1 if confidence < 0.7: continue failing_together[ f"{couple[0]}${couple[1]}".encode("utf-8") ] = struct.pack("ff", support, confidence) for percentage, count in count_redundancies.most_common(): logger.info(f"{count} with {percentage} confidence") test_scheduling.close_failing_together_db() def generate_all_data(): past_failures = test_scheduling.get_past_failures(granularity) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open(push_data_path, "r") as f: push_data = json.load(f) logger.info(f"push data nodes: {len(push_data)}") if granularity == "label": push_data = [ ( revisions, rename_tasks(push_tasks), rename_tasks(possible_regressions), rename_tasks(likely_regressions), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] # In the last 28 pushes, we definitely run all possible runnables. all_runnables_set = set( sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), []) ) # Filter runnables we don't need. all_runnables = filter_runnables( list(all_runnables_set), all_runnables_set, granularity ) all_runnables_set = set(all_runnables_set) logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes") push_data = [ ( revisions, filter_runnables(push_tasks, all_runnables_set, granularity), filter_runnables( possible_regressions, all_runnables_set, granularity ), filter_runnables( likely_regressions, all_runnables_set, granularity ), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] if granularity == "label": generate_failing_together_probabilities(push_data) # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False if granularity == "group": update_touched_together_gen = test_scheduling.update_touched_together() next(update_touched_together_gen) for i in tqdm(range(len(push_data))): ( revisions, push_runnables, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map ) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. 
# In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions) ) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity == "group": update_touched_together_gen.send(commits[0]["node"]) result = { "revs": revisions, "data": [], } for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result["data"].append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield result if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling_db, generate_all_data()) zstd_compress(test_scheduling_db) with open_tar_zst(past_failures_db) as tar: tar.add(past_failures_db[: -len(".tar.zst")]) if granularity == "group": with open_tar_zst(touched_together_db) as tar: tar.add(touched_together_db[: -len(".tar.zst")]) if granularity == "label": with open_tar_zst(failing_together_db) as tar: tar.add(failing_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(
    self, granularity: str, training_months: int
) -> None:
    if granularity != "config_group":
        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=training_months)

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )
    elif granularity == "config_group":
        test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB
        )

    push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
        granularity
    )

    if granularity in ("label", "config_group"):
        test_scheduling.generate_failing_together_probabilities(
            granularity, push_data_iter(), push_data_count
        )

    def generate_all_data() -> Generator[Dict[str, Any], None, None]:
        past_failures = test_scheduling.get_past_failures(granularity, False)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            commit_map[commit_data["node"]] = commit_data

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        if granularity in ("group", "config_group"):
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)

        for (
            i,
            (
                revisions,
                fix_revision,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ),
        ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            # Skip wptsync commits, since they are not like normal pushes made by developers.
            if any(repository.is_wptsync(commit) for commit in commits):
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            # - Improve shelve perf and go back to consider all files;
            # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            # - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # We consider only the runnables which run in this push, and the possible and likely regressions
            # from this push. We can't consider all runnables because we can't be sure that a task that didn't
            # run on a push would have been successful.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity in ("group", "config_group"):
                update_touched_together_gen.send(commits[0]["node"])

            result_data = []
            for data in test_scheduling.generate_data(
                granularity,
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    result_data.append(data)

            if pushdate > HISTORY_DATE_START:
                saved_nodes.add(i)
                yield {
                    "revs": revisions,
                    "data": result_data,
                }

        if granularity == "group":
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    # For the config/group granularity, we are only interested in the failing together DB.
    if granularity != "config_group":
        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)
        create_tar_zst(past_failures_db)

    if granularity == "group":
        create_tar_zst(touched_together_db)

    if granularity in ("label", "config_group"):
        create_tar_zst(failing_together_db)
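
# Illustrative sketch (not part of the original source): records appended to the
# test scheduling DBs above are dicts of the form {"revs": [...], "data": [...]}.
# A consumer could read them back as shown below; the helper name is an assumption.
def _example_read_test_scheduling_history(test_scheduling_db: str):
    for push in db.read(test_scheduling_db):
        # Each entry pairs the push's revisions with the generated runnable data.
        yield push["revs"], push["data"]
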
def generate_push_data(
    self, granularity: str, training_months: int, reretrieve: int
) -> None:
    # We'll use the past training_months months only for training the model,
    # but we use half training_months months more than that to calculate the
    # failure statistics.
    from_months = training_months + math.floor(training_months / 2)

    # We use the actual date instead of 'today-X' aliases to avoid mozci caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        progress_bar: tqdm,
        pushes: List[mozci.push.Push],
        futures: List[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        nonlocal reretrieve
        num_cached = 0
        num_pushes = len(pushes)

        for push, future in zip(pushes, futures):
            cached = future.result()

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            if cached:
                value, mozci_version = cached

                # Regenerate results which were generated with an older version of mozci.
                if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                    cached = None
                    reretrieve -= 1

            if cached:
                num_cached += 1
                value, mozci_version = cached
                assert len(value) == 5
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.label_summaries.keys()
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        tuple(push.revs),
                        push.backedoutby or push.bustage_fixed_by,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    mozci.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        mozci.config["cache"]["retention"],
                    )
                    assert len(value) == 5
                    yield value
                except mozci.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

            progress_bar.update(1)

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    def retrieve_from_cache(push):
        return mozci.config.cache.get(cache_key(push))

    total_pushes = len(
        mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )
    )

    with concurrent.futures.ThreadPoolExecutor() as executor:
        with tqdm(total=total_pushes) as progress_bar:
            # Run in batches of 7 days to avoid running out of memory (given that mozci pushes
            # consume a lot of memory, and they all have references to each other through "parent"
            # and "child" links so they are basically never released while we run this).
            while from_date < to_date:
                next_from_date = from_date + relativedelta(days=7)
                if next_from_date > to_date:
                    next_from_date = to_date

                pushes = mozci.push.make_push_objects(
                    from_date=from_date.strftime("%Y-%m-%d"),
                    to_date=next_from_date.strftime("%Y-%m-%d"),
                    branch="autoland",
                )

                futures = [
                    executor.submit(retrieve_from_cache, push) for push in pushes
                ]

                try:
                    db.append(push_data_db, generate(progress_bar, pushes, futures))
                except Exception:
                    for f in futures:
                        f.cancel()

                    raise

                from_date = next_from_date

    zstd_compress(push_data_db)
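
# Minimal standalone sketch (illustrative, not from the original source) of the
# 7-day windowing used above to keep memory bounded: iterate [from_date, to_date)
# in fixed-size date windows, clamping the last window to to_date.
from datetime import datetime
from dateutil.relativedelta import relativedelta


def _example_date_windows(from_date: datetime, to_date: datetime, days: int = 7):
    while from_date < to_date:
        # Clamp the window end so we never go past to_date.
        next_from_date = min(from_date + relativedelta(days=days), to_date)
        yield from_date, next_from_date
        from_date = next_from_date
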
def download_bugs_between(date_from, date_to, security=False):
    products = {
        "Add-on SDK",
        "Android Background Services",
        "Core",
        "Core Graveyard",
        "DevTools",
        "DevTools Graveyard",
        "External Software Affecting Firefox",
        "Firefox",
        "Firefox Graveyard",
        "Firefox Build System",
        "Firefox for Android",
        "Firefox for Android Graveyard",
        # 'Firefox for iOS',
        "Firefox Health Report",
        # 'Focus',
        # 'Hello (Loop)',
        "NSPR",
        "NSS",
        "Toolkit",
        "Toolkit Graveyard",
        "WebExtensions",
    }

    params = {
        "f1": "creation_ts",
        "o1": "greaterthan",
        "v1": date_from.strftime("%Y-%m-%d"),
        "f2": "creation_ts",
        "o2": "lessthan",
        "v2": date_to.strftime("%Y-%m-%d"),
        "product": products,
    }

    if not security:
        params["f3"] = "bug_group"
        params["o3"] = "isempty"

    params["count_only"] = 1
    r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params)
    r.raise_for_status()
    count = r.json()["bug_count"]
    del params["count_only"]

    params["limit"] = 100
    params["order"] = "bug_id"

    old_bug_ids = set(bug["id"] for bug in get_bugs())

    all_bugs = []

    with tqdm(total=count) as progress_bar:
        for offset in range(0, count, Bugzilla.BUGZILLA_CHUNK_SIZE):
            params["offset"] = offset

            new_bugs = _download(params)

            progress_bar.update(Bugzilla.BUGZILLA_CHUNK_SIZE)

            all_bugs += [bug for bug in new_bugs.values()]

            db.append(
                BUGS_DB,
                (bug for bug_id, bug in new_bugs.items() if bug_id not in old_bug_ids),
            )

    return all_bugs
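
# Illustrative usage sketch (not part of the original source): download the last
# two weeks of non-security bugs. The 14-day window is an arbitrary example value
# and the helper name is an assumption.
def _example_download_recent_bugs():
    from datetime import datetime, timedelta

    date_to = datetime.utcnow()
    date_from = date_to - timedelta(days=14)
    return download_bugs_between(date_from, date_to, security=False)
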