def test_download_missing(tmp_path):
    url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    responses.add(
        responses.GET,
        url,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    db.download(db_path)
    assert not os.path.exists(db_path)

    with pytest.raises(Exception, match="Last-Modified is not available"):
        db.last_modified(db_path)

def test_download_different_schema(tmp_path, mock_zst):
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 2)

    responses.add(responses.GET, url_version, status=200, body="1")

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd",
            "Last-Modified": "2019-04-16",
        },
    )

    tmp_zst_path = tmp_path / "prova_tmp.zst"
    mock_zst(tmp_zst_path)

    with open(tmp_zst_path, "rb") as content:
        responses.add(responses.GET, url, status=200, body=content.read())

    assert not db.download(db_path)

    with pytest.raises(db.LastModifiedNotAvailable):
        db.last_modified(db_path)

    assert not os.path.exists(db_path)
    assert not os.path.exists(db_path.with_suffix(db_path.suffix + ".zst"))
    assert not os.path.exists(db_path.with_suffix(db_path.suffix + ".zst.etag"))

def test_download_missing(tmp_path, mock_zst):
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    responses.add(
        responses.GET,
        url,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    responses.add(responses.GET, url_version, status=404)

    assert not db.download(db_path)
    assert not os.path.exists(db_path)

    with pytest.raises(LastModifiedNotAvailable):
        db.last_modified(db_path)

def test_download_zst(tmp_path, mock_zst):
    url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd",
            "Last-Modified": "2019-04-16",
        },
    )

    tmp_zst_path = tmp_path / "prova_tmp.zst"
    mock_zst(tmp_zst_path)

    with open(tmp_zst_path, "rb") as content:
        responses.add(responses.GET, url, status=200, body=content.read())

    db.download(db_path)

    assert db.last_modified(db_path) == datetime(2019, 4, 16)

    assert os.path.exists(db_path)
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".zst"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".zst.etag"))

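# The tests above rely on a `mock_zst` pytest fixture that is not defined in
# this section. A minimal sketch of what such a fixture might look like,
# assuming all it needs to do is write a small zstandard-compressed JSON
# payload to the path a test passes in (the fixture name comes from the test
# signatures; the payload below is a placeholder, not the real DB contents):
import json

import pytest
import zstandard


@pytest.fixture
def mock_zst():
    def create_zst_file(path):
        # Compress a trivial JSON document and write it to the requested path,
        # so the tests can serve it as the body of a mocked GET response.
        cctx = zstandard.ZstdCompressor()
        with open(path, "wb") as f:
            f.write(cctx.compress(json.dumps({"key": "value"}).encode("utf-8")))

    return create_zst_file
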
def retrieve_issues(self, owner: str, repo: str, state: str, retrieve_events: bool) -> None:
    last_modified = None
    db.download(github.GITHUB_ISSUES_DB)

    try:
        last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
    except Exception:
        pass

    if last_modified:
        logger.info(
            f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
        )
        data = github.fetch_issues_updated_since_timestamp(
            owner, repo, state, last_modified.isoformat(), retrieve_events
        )

        updated_ids = set(issue["id"] for issue in data)

        logger.info(
            "Deleting issues that were changed since the last run and saving updates"
        )
        github.delete_issues(lambda issue: issue["id"] in updated_ids)
        db.append(github.GITHUB_ISSUES_DB, data)

        logger.info("Updating finished")
    else:
        logger.info("Retrieving all issues since last_modified is not available")
        github.download_issues(owner, repo, state, retrieve_events)

    zstd_compress(github.GITHUB_ISSUES_DB)

def retrieve_issues(self) -> None:
    last_modified = None
    db.download(self.github.db_path)

    try:
        last_modified = db.last_modified(self.github.db_path)
    except db.LastModifiedNotAvailable:
        pass

    if last_modified:
        logger.info(
            f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
        )
        data = self.github.fetch_issues_updated_since_timestamp(
            last_modified.isoformat()
        )

        if self.retrieve_private:
            logger.info(
                "Replacing contents of auto closed public issues with private issues content"
            )
            self.replace_with_private(data)

        updated_ids = set(issue["id"] for issue in data)

        logger.info(
            "Deleting issues that were changed since the last run and saving updates"
        )
        self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
        db.append(self.github.db_path, data)

        logger.info("Updating finished")
    else:
        logger.info("Retrieving all issues since last_modified is not available")
        self.github.download_issues()

        if self.retrieve_private:
            logger.info(
                "Replacing contents of auto closed public issues with private issues content"
            )
            all_issues = list(self.github.get_issues())
            updated_issues, updated_ids = self.replace_with_private(all_issues)

            logger.info("Deleting public issues that were updated and saving updates")
            self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
            db.append(self.github.db_path, updated_issues)

    zstd_compress(self.github.db_path)

def download_modified_revisions():
    try:
        last_modified = db.last_modified(REVISIONS_DB)
    except LastModifiedNotAvailable:
        return

    modified_revisions = get(modified_start=last_modified)
    modified_revision_ids = set(rev["id"] for rev in modified_revisions)

    db.delete(REVISIONS_DB, lambda revision: revision["id"] in modified_revision_ids)
    db.append(REVISIONS_DB, modified_revisions)

def go(self, days: int) -> None:
    bugs = self.get_landed_and_filed_since(days)

    meta_bugs = self.get_meta_bugs(days)

    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(f"Deleting bugs modified since the last run on {last_modified}")
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids)

    bugs = list(set(bugs))

    test_infos = self.retrieve_test_info(days)
    test_info_bugs: List[int] = [
        bug["id"] for test_info in test_infos.values() for bug in test_info["bugs"]
    ]

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs + test_info_bugs + [FUZZING_METABUG_ID] + meta_bugs)

    logger.info(f"{len(bugs)} bugs to analyze.")

    bugs_set = set(bugs + test_info_bugs + meta_bugs)

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        # Only add to the map bugs we are interested in, and bugs that block
        # other bugs (needed for the bug_to_types call).
        if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
            bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    self.generate_landings_by_date(
        bug_map, regressor_bug_ids, bugs, self.get_blocking_of(meta_bugs)
    )

    self.generate_component_connections(bug_map, bugs)

    self.generate_component_test_stats(bug_map, test_infos)

def test_download_xz(tmp_path, mock_xz):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    url_xz = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1)

    responses.add(
        responses.HEAD,
        url_zst,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    responses.add(
        responses.GET,
        url_zst,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    responses.add(
        responses.HEAD,
        url_xz,
        status=200,
        headers={
            "ETag": "123",
            "Accept-Encoding": "xz",
            "Last-Modified": "2019-04-16",
        },
    )

    tmp_xz_path = tmp_path / "prova_tmp.xz"
    mock_xz(tmp_xz_path)

    with open(tmp_xz_path, "rb") as content:
        responses.add(responses.GET, url_xz, status=200, body=content.read())

    db.download(db_path)

    assert db.last_modified(db_path) == datetime(2019, 4, 16)

    assert os.path.exists(db_path)
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".xz"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".xz.etag"))

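# `mock_xz` is likewise a fixture that is not shown in this section. A minimal
# sketch under the same assumption (write a small xz-compressed placeholder
# JSON payload to the given path), using the standard-library lzma module:
import json
import lzma

import pytest


@pytest.fixture
def mock_xz():
    def create_xz_file(path):
        # Compress a trivial JSON document with xz/LZMA and write it to the
        # requested path, for use as the body of a mocked GET response.
        with lzma.open(path, "wb") as f:
            f.write(json.dumps({"key": "value"}).encode("utf-8"))

    return create_xz_file
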
def retrieve_bugs(self, limit=None):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    if limit:
        regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )
    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")

def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
    )

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")

def retrieve_bugs(self, limit: int = None) -> None:
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if "{}::{}".format(bug["product"], bug["component"]) not in all_components
    )
    logger.info(f"{len(deleted_component_ids)} bugs belonging to deleted components")
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(years=2, months=6)
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        )
    )
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
    # and blocked bugs.
    regression_related_ids: List[int] = list(
        set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in bugzilla.get_bugs()
                ),
                [],
            )
        )
    )
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"]
        for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow()
        )
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (
        timespan_ids
        + labelled_bug_ids
        + commit_bug_ids
        + regression_related_ids
        + test_failure_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after downloading new bugs).
    for i in range(7):
        regression_related_ids = list(
            set(
                sum(
                    (
                        bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                        for bug in new_bugs
                    ),
                    [],
                )
            )
        )
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all bugs we needed, break.
        if set(regression_related_ids).issubset(all_ids):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(2):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    # Some bugs are missing their history field; delete them so they can be
    # downloaded again on a following run. TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)