def test_download_support_file_missing(tmp_path, capfd):
    """A warning is printed when the support file returns 404 on both HEAD and GET."""
    base_url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    support_name = "support_mock.zst"
    support_url = urljoin(base_url, support_name)

    database_path = tmp_path / "prova.json"
    db.register(database_path, base_url, 1, support_files=[support_name])

    # Both the HEAD probe and the GET download fail with 404.
    responses.add(
        responses.HEAD,
        support_url,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )
    responses.add(
        responses.GET,
        support_url,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    db.download_support_file(database_path, support_name)

    out, err = capfd.readouterr()
    expected_path = os.path.join(
        os.path.dirname(database_path), f"{os.path.splitext(support_name)[0]}.zst"
    )
    # The last printed line (before the trailing newline) carries the warning.
    assert (
        out.split("\n")[-2]
        == f"{support_name} is not yet available to download for {expected_path}"
    )
def test_download_support_file_zst(tmp_path, mock_zst):
    """A zst support file is downloaded, decompressed, and its etag stored."""
    db_url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    support_name = "support.zst"
    support_url = urljoin(db_url, support_name)

    database_path = tmp_path / "prova.json"
    db.register(database_path, db_url, 1, support_files=[support_name])

    responses.add(
        responses.HEAD,
        support_url,
        status=200,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    # Serve a real zst payload generated by the fixture.
    archive_path = tmp_path / "prova_tmp.zst"
    mock_zst(archive_path)
    with open(archive_path, "rb") as archive:
        responses.add(responses.GET, support_url, status=200, body=archive.read())

    db.download_support_file(database_path, support_name)

    parent = os.path.dirname(database_path)
    stem = os.path.splitext(support_name)[0]
    # Archive, decompressed file and etag marker should all exist.
    assert os.path.exists(os.path.join(parent, support_name))
    assert os.path.exists(os.path.join(parent, stem))
    assert os.path.exists(os.path.join(parent, stem + ".zst.etag"))
def boot_worker():
    """One-time worker bootstrap: clone autoland, download DBs, preload models."""
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")
    # Each download is allowed to fail when ALLOW_MISSING_MODELS is set.
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)
    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
    ) or ALLOW_MISSING_MODELS)
    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # Exhaust the generator only to reach the last element; `commit`
        # retains the final yielded value after the loop.
        # NOTE(review): assumes the commits DB is non-empty — an empty DB
        # leaves `commit` undefined and the next line raises NameError;
        # confirm upstream guarantees at least one commit.
        for commit in repository.get_commits():
            pass
        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        commits = repository.download_commits(REPO_DIR, rev_start, use_single_process=True)

        if len(commits) > 0:
            # Update the touched together DB.
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Prime the generator so it is ready to receive revisions.
            next(update_touched_together_gen)

            update_touched_together_gen.send(commits[-1]["node"])
            try:
                # Sending None tells the generator to finalize and flush.
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
def test_download_support_file_xz(tmp_path, mock_xz):
    """Falls back to the xz artifact when the zst support file is missing."""
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    url_xz = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz"
    support_name = "support_mock.zst"
    stem = os.path.splitext(support_name)[0]
    zst_support_url = urljoin(url_zst, support_name)
    xz_support_url = urljoin(url_xz, f"{stem}.xz")

    database_path = tmp_path / "prova.json"
    db.register(database_path, url_zst, 1, support_files=[support_name])

    # The preferred zst artifact is unavailable.
    responses.add(
        responses.HEAD,
        zst_support_url,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )
    responses.add(
        responses.GET,
        zst_support_url,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    # The xz fallback exists and serves a real archive built by the fixture.
    responses.add(
        responses.HEAD,
        xz_support_url,
        status=200,
        headers={"ETag": "123", "Accept-Encoding": "xz"},
    )
    archive_path = tmp_path / "prova_tmp.xz"
    mock_xz(archive_path)
    with open(archive_path, "rb") as archive:
        responses.add(responses.GET, xz_support_url, status=200, body=archive.read())

    db.download_support_file(database_path, support_name)

    parent = os.path.dirname(database_path)
    assert os.path.exists(os.path.join(parent, f"{stem}.xz"))
    assert os.path.exists(os.path.join(parent, stem))
    assert os.path.exists(os.path.join(parent, stem + ".xz.etag"))
def test_download_support_file_missing(tmp_path, caplog):
    """A warning is logged when the support file is missing on both index hosts."""
    db_url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    support_name = "support_mock.zst"
    support_url = urljoin(db_url, support_name)
    # The old taskcluster index host is tried as a fallback.
    fallback_url = support_url.replace(
        "https://community-tc.services.mozilla.com/api/index",
        "https://index.taskcluster.net",
    )

    database_path = tmp_path / "prova.json"
    db.register(database_path, db_url, 1, support_files=[support_name])

    # 404 on both hosts, for both the HEAD probe and the GET download.
    for missing_url in (support_url, fallback_url):
        responses.add(
            responses.HEAD,
            missing_url,
            status=404,
            headers={"ETag": "123", "Accept-Encoding": "zstd"},
        )
        responses.add(
            responses.GET,
            missing_url,
            status=404,
            body=requests.exceptions.HTTPError("HTTP error"),
        )

    db.download_support_file(database_path, support_name)

    expected_path = os.path.join(
        os.path.dirname(database_path), f"{os.path.splitext(support_name)[0]}.zst"
    )
    assert (
        f"{support_name} is not yet available to download for {expected_path}"
        in caplog.text
    )
def download_models():
    """Download every model plus the support DBs required at runtime."""
    for name in MODELS_NAMES:
        utils.download_model(name)
        # Try loading the model to verify the artifact is usable.
        get_model(name)

    # (parent DB, support file) pairs fetched without extraction.
    support_dbs = (
        (test_scheduling.TEST_LABEL_SCHEDULING_DB, test_scheduling.PAST_FAILURES_LABEL_DB),
        (test_scheduling.TEST_GROUP_SCHEDULING_DB, test_scheduling.PAST_FAILURES_GROUP_DB),
        (test_scheduling.TEST_GROUP_SCHEDULING_DB, test_scheduling.TOUCHED_TOGETHER_DB),
        (repository.COMMITS_DB, repository.COMMIT_EXPERIENCES_DB),
    )
    for parent_db, support_file in support_dbs:
        db.download_support_file(parent_db, support_file, extract=False)

    db.download(repository.COMMITS_DB, extract=False)
def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
    """Set up the classifier: load the model and fetch its auxiliary data.

    Args:
        model_name: Name of the model to download and load; also selects which
            extra datasets are fetched ("regressor" or "testlabelselect").
        repo_dir: Path to the local Mercurial repository.
        git_repo_dir: If truthy, a gecko-dev git clone is created/updated there.
        method_defect_predictor_dir: If truthy, the MethodDefectPredictor repo
            is cloned there at a pinned revision.
    """
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        # Pin the external tool to a known-good revision.
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # X/y training data are zst-compressed joblib dumps; re-decompress
        # only when the etag check reports that the remote file changed.
        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        # The past-failures DB is required; this must not fail.
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label")

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
def test_download_support_file_missing(tmp_path, caplog):
    """download_support_file fails when the DB version file itself is missing."""
    db_url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    version_url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"
    support_name = "support_mock.zst"
    support_url = urljoin(db_url, support_name)

    database_path = tmp_path / "prova.json"
    db.register(database_path, db_url, 1, support_files=[support_name])

    # The version file is gone...
    responses.add(responses.GET, version_url, status=404)
    # ...and so is the support file itself.
    responses.add(
        responses.HEAD,
        support_url,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )
    responses.add(
        responses.GET,
        support_url,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    assert not db.download_support_file(database_path, support_name)
    assert (
        f"Version file is not yet available to download for {database_path}"
        in caplog.text
    )
def boot_worker():
    """One-time worker bootstrap: preload models, clone the repo, download DBs."""
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get("BUGBUG_REPO_DIR", os.path.join(tempfile.gettempdir(), "bugbug-hg"))
    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    # The download is allowed to fail when ALLOW_MISSING_MODELS is set.
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # Exhaust the generator; `commit` retains the last yielded value.
        # NOTE(review): assumes the commits DB is non-empty — an empty DB
        # leaves `commit` undefined and the next line raises NameError;
        # confirm upstream guarantees at least one commit.
        for commit in repository.get_commits():
            pass
        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
def __init__(self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir):
    """Set up the classifier: load the model and fetch its auxiliary data.

    Args:
        model_name: Name of the model to load; also selects which extra
            datasets are fetched ("regressor" or "testselect").
        cache_root: Existing directory expected to contain the
            mozilla-central clone.
        git_repo_dir: If truthy, a gecko-dev git clone is created/updated there.
        method_defect_predictor_dir: If truthy, the MethodDefectPredictor repo
            is cloned there at a pinned revision.
    """
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(
        cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = self.load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        # Pin the external tool to a known-good revision.
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # X/y training data are zst-compressed joblib dumps; download and
        # decompress only when not already present on disk.
        model_data_X_path = f"{model_name}model_data_X"
        if not os.path.exists(model_data_X_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst"))
            zstd_decompress(model_data_X_path)
            assert os.path.exists(
                model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        if not os.path.exists(model_data_y_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst"))
            zstd_decompress(model_data_y_path)
            assert os.path.exists(
                model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

    if model_name == "testselect":
        self.use_test_history = True
        # The past-failures DB is required; this must not fail.
        assert db.download_support_file(test_scheduling.TEST_SCHEDULING_DB,
                                        test_scheduling.PAST_FAILURES_DB)
        self.past_failures_data = test_scheduling.get_past_failures()

        self.backout_model = self.load_model("backout")
        assert self.backout_model is not None
def download_eval_dbs(self, extract: bool = True, ensure_exist: bool = True) -> None:
    """Fetch every evaluation DB (or support file) declared in ``self.eval_dbs``.

    Args:
        extract: Whether to decompress the downloaded archives.
        ensure_exist: When True, a failed download raises AssertionError.
    """
    for parent_db, files in self.eval_dbs.items():
        for name in files:
            if db.is_registered(name):
                # A registered DB is downloaded directly.
                downloaded = db.download(name, extract=extract)
            else:
                # Otherwise treat it as a support file of its parent DB.
                downloaded = db.download_support_file(parent_db, name, extract=extract)
            assert downloaded or not ensure_exist
def test_download_support_file(tmp_path, mock_zst):
    """A support file is downloaded, extracted (archive removed) and etag saved."""
    db_url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    version_url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.version"
    support_name = "support.zst"
    support_url = urljoin(db_url, support_name)

    database_path = tmp_path / "prova.json"
    db.register(database_path, db_url, 1, support_files=[support_name])

    responses.add(responses.GET, version_url, status=200, body="1")
    responses.add(
        responses.HEAD,
        support_url,
        status=200,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    # Serve a real zst payload produced by the fixture.
    archive_path = tmp_path / "prova_tmp.zst"
    mock_zst(archive_path)
    with open(archive_path, "rb") as archive:
        responses.add(responses.GET, support_url, status=200, body=archive.read())

    assert db.download_support_file(database_path, support_name)

    parent = os.path.dirname(database_path)
    stem = os.path.splitext(support_name)[0]
    # The compressed archive is removed after extraction...
    assert not os.path.exists(os.path.join(parent, support_name))
    # ...leaving the decompressed file and the saved etag.
    assert os.path.exists(os.path.join(parent, stem))
    assert os.path.exists(os.path.join(parent, stem + ".zst.etag"))
def download_models():
    """Download every model and the support DBs required at runtime.

    When a model artifact is missing and ALLOW_MISSING_MODELS is set, that
    model is skipped; any other failure propagates.
    """
    for model_name in MODELS_NAMES:
        utils.download_model(model_name)
        # Try loading the model to verify the downloaded artifact is usable.
        try:
            MODEL_CACHE.get(model_name)
        except FileNotFoundError:
            if ALLOW_MISSING_MODELS:
                LOGGER.info(
                    "Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
                    % model_name
                )
                # Fix: this was `return None`, which aborted the whole function
                # on the first missing model — skipping the remaining models
                # AND all of the support-file downloads below. Only this model
                # should be skipped.
                continue
            else:
                raise

    db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
        extract=False,
    )
    db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
        extract=False,
    )
    db.download_support_file(
        repository.COMMITS_DB, repository.COMMIT_EXPERIENCES_DB, extract=False
    )

    db.download(repository.COMMITS_DB, extract=False)
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    """Set up the classifier: load the model and fetch its auxiliary data.

    Args:
        model_name: Name of the model to load; also selects which extra
            datasets are fetched ("regressor" or "testlabelselect").
        repo_dir: Path to the local Mercurial repository.
        git_repo_dir: If truthy, a mozilla-central git mirror is cloned there.
        method_defect_predictor_dir: If truthy, the MethodDefectPredictor repo
            is cloned there at a pinned revision.
        use_single_process: Stored for later use when processing commits.
        skip_feature_importance: Stored for later use when explaining results.
    """
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        # The "hg::" prefix clones the Mercurial repo through git-cinnabar.
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        # Pin the external tool to a known-good revision.
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        # X/y training data are zst-compressed pickles; re-decompress only
        # when the etag check reports that the remote file changed.
        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        with open(model_data_X_path, "rb") as fb:
            self.X = to_array(pickle.load(fb))

        with open(model_data_y_path, "rb") as fb:
            self.y = to_array(pickle.load(fb))

        past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "r") as f:
            self.past_bugs_by_function = json.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        # The past-failures DB is required; this must not fail.
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        self.testfailure_model = cast(
            TestFailureModel, TestFailureModel.load(download_model("testfailure"))
        )
        assert self.testfailure_model is not None
def __init__(
    self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir
):
    """Set up the classifier: load the model and fetch its auxiliary data.

    Args:
        model_name: Name of the model to download and load; also selects which
            extra datasets are fetched ("regressor" or "testselect").
        cache_root: Existing directory expected to contain the
            mozilla-central clone.
        git_repo_dir: If truthy, a gecko-dev git clone is created/updated there.
        method_defect_predictor_dir: If truthy, the MethodDefectPredictor repo
            is cloned there at a pinned revision.
    """
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        # Pin the external tool to a known-good revision.
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # X/y training data are zst-compressed joblib dumps; re-decompress
        # only when the etag check reports that the remote file changed.
        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testselect":
        self.use_test_history = True
        # The past-failures DB is required; this must not fail.
        assert db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        self.past_failures_data = test_scheduling.get_past_failures()

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None