def select_tests(
    self,
    commits: Sequence[repository.CommitDict],
    confidence: float = 0.5,
    push_num: Optional[int] = None,
) -> Dict[str, float]:
    commit_data = commit_features.merge_commits(commits)

    past_failures_data = test_scheduling.get_past_failures(self.granularity, True)

    if push_num is None:
        push_num = past_failures_data["push_num"] + 1
    all_runnables = past_failures_data["all_runnables"]

    commit_tests = []
    for data in test_scheduling.generate_data(
        self.granularity,
        past_failures_data,
        commit_data,
        push_num,
        all_runnables,
        tuple(),
        tuple(),
    ):
        commit_test = commit_data.copy()
        commit_test["test_job"] = data
        commit_tests.append(commit_test)

    probs = self.classify(commit_tests, probabilities=True)
    selected_indexes = np.argwhere(probs[:, 1] >= confidence)[:, 0]

    return {
        commit_tests[i]["test_job"]["name"]: math.floor(probs[i, 1] * 100) / 100
        for i in selected_indexes
    }
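A minimal usage sketch for the method above, assuming a trained TestGroupSelectModel and the supporting past-failures DB are available locally; the revision hashes are hypothetical placeholders, not real pushes.

from bugbug import repository
from bugbug.models.testselect import TestGroupSelectModel

model = TestGroupSelectModel()
# Commits belonging to the push being analyzed (hypothetical revisions);
# select_tests merges them, scores every known runnable, and returns only
# those at or above the confidence cutoff.
commits = [c for c in repository.get_commits() if c["node"] in {"rev1", "rev2"}]
selected = model.select_tests(commits, confidence=0.9)
for name, conf in sorted(selected.items(), key=lambda kv: -kv[1]):
    print(f"{name}: {conf:.2f}")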
def get_runnables(granularity):
    # Note: `commit_data` and `test_selection_threshold` are defined in the
    # surrounding scope; this function is a closure over them.
    past_failures_data = test_scheduling.get_past_failures(granularity)

    push_num = past_failures_data["push_num"]
    all_runnables = past_failures_data["all_runnables"]

    commit_tests = []
    for data in test_scheduling.generate_data(
        past_failures_data, commit_data, push_num, all_runnables, [], []
    ):
        if granularity == "label" and not data["name"].startswith("test-"):
            continue

        commit_test = commit_data.copy()
        commit_test["test_job"] = data
        commit_tests.append(commit_test)

    probs = MODEL_CACHE.get(f"test{granularity}select").classify(
        commit_tests, probabilities=True
    )

    selected_indexes = np.argwhere(probs[:, 1] > test_selection_threshold)[:, 0]

    return {
        commit_tests[i]["test_job"]["name"]: math.floor(probs[i, 1] * 100) / 100
        for i in selected_indexes
    }
def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label")

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
def get_config_specific_groups(config: str) -> str:
    from bugbug_http.app import JobInfo

    job = JobInfo(get_config_specific_groups, config)
    LOGGER.info(f"Processing {job}...")

    testgroupselect_model = MODEL_CACHE.get("testgroupselect")
    equivalence_sets = testgroupselect_model._get_equivalence_sets(0.9)

    past_failures_data = test_scheduling.get_past_failures("group", True)
    all_runnables = past_failures_data["all_runnables"]

    setkey(
        job.result_key,
        orjson.dumps(
            [
                {"name": group}
                for group in all_runnables
                if any(
                    equivalence_set == {config}
                    for equivalence_set in equivalence_sets[group]
                )
            ]
        ),
        compress=True,
    )

    return "OK"
def select_tests(self, commits, confidence=0.3, push_num=None):
    commit_data = commit_features.merge_commits(commits)

    past_failures_data = test_scheduling.get_past_failures(self.granularity)

    if push_num is None:
        push_num = past_failures_data["push_num"] + 1
    all_runnables = past_failures_data["all_runnables"]

    if self.granularity == "label":
        all_runnables = tuple(r for r in all_runnables if r.startswith("test-"))

    commit_tests = []
    for data in test_scheduling.generate_data(
        past_failures_data, commit_data, push_num, all_runnables, tuple(), tuple()
    ):
        commit_test = commit_data.copy()
        commit_test["test_job"] = data
        commit_tests.append(commit_test)

    probs = self.classify(commit_tests, probabilities=True)
    selected_indexes = np.argwhere(probs[:, 1] >= confidence)[:, 0]

    return {
        commit_tests[i]["test_job"]["name"]: math.floor(probs[i, 1] * 100) / 100
        for i in selected_indexes
    }
def mock_schedule_tests_classify(monkeypatch):
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            f"test-{granularity}1",
            f"test-{granularity}2",
            "test-linux64/opt",
            "test-windows10/opt",
        ]
        past_failures_data.close()

    failing_together = test_scheduling.get_failing_together_db()
    failing_together[b"test-linux64/opt$test-windows10/opt"] = struct.pack(
        "ff", 0.1, 1.0
    )
    test_scheduling.close_failing_together_db()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        def classify(self, items, probabilities=False):
            assert probabilities
            results = []
            for item in items:
                runnable_name = item["test_job"]["name"]
                if self.granularity == "label":
                    if runnable_name in labels_to_choose:
                        results.append(
                            [
                                1 - labels_to_choose[runnable_name],
                                labels_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
                elif self.granularity == "group":
                    if runnable_name in groups_to_choose:
                        results.append(
                            [
                                1 - groups_to_choose[runnable_name],
                                groups_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
            return np.array(results)

        class MockModelCache:
            def get(self, model_name):
                if "group" in model_name:
                    return bugbug.models.testselect.TestGroupSelectModel()
                else:
                    return bugbug.models.testselect.TestLabelSelectModel()

        monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
        monkeypatch.setattr(
            bugbug.models.testselect.TestSelectModel, "classify", classify
        )

    return do_mock
def __init__(self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir): self.model_name = model_name self.cache_root = cache_root assert os.path.isdir( cache_root), f"Cache root {cache_root} is not a dir." self.repo_dir = os.path.join(cache_root, "mozilla-central") self.model = self.load_model(model_name) assert self.model is not None self.git_repo_dir = git_repo_dir if git_repo_dir: self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir) self.method_defect_predictor_dir = method_defect_predictor_dir if method_defect_predictor_dir: self.clone_git_repo( "https://github.com/lucapascarella/MethodDefectPredictor", method_defect_predictor_dir, "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd", ) if model_name == "regressor": self.use_test_history = False model_data_X_path = f"{model_name}model_data_X" if not os.path.exists(model_data_X_path): download_check_etag( URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")) zstd_decompress(model_data_X_path) assert os.path.exists( model_data_X_path), "Decompressed X dataset exists" model_data_y_path = f"{model_name}model_data_y" if not os.path.exists(model_data_y_path): download_check_etag( URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")) zstd_decompress(model_data_y_path) assert os.path.exists( model_data_y_path), "Decompressed y dataset exists" self.X = to_array(joblib.load(model_data_X_path)) self.y = to_array(joblib.load(model_data_y_path)) if model_name == "testselect": self.use_test_history = True assert db.download_support_file(test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB) self.past_failures_data = test_scheduling.get_past_failures() self.backout_model = self.load_model("backout") assert self.backout_model is not None
def mock_get_config_specific_groups(
    monkeypatch: MonkeyPatch,
) -> None:
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    past_failures_data = test_scheduling.get_past_failures("group", False)
    past_failures_data["push_num"] = 1
    past_failures_data["all_runnables"] = [
        "test-group1",
        "test-group2",
    ]
    past_failures_data.close()

    try:
        test_scheduling.close_failing_together_db("config_group")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("config_group", False)
    failing_together[b"$ALL_CONFIGS$"] = pickle.dumps(
        ["test-linux1804-64/opt-*", "test-windows10/debug-*", "test-windows10/opt-*"]
    )
    failing_together[b"$CONFIGS_BY_GROUP$"] = pickle.dumps(
        {
            "test-group1": {
                "test-linux1804-64/opt-*",
                "test-windows10/debug-*",
                "test-windows10/opt-*",
            },
            "test-group2": {
                "test-linux1804-64/opt-*",
                "test-windows10/debug-*",
                "test-windows10/opt-*",
            },
        }
    )
    failing_together[b"test-group1"] = pickle.dumps(
        {
            "test-linux1804-64/opt-*": {
                "test-windows10/debug-*": (1.0, 0.0),
                "test-windows10/opt-*": (1.0, 0.0),
            },
            "test-windows10/debug-*": {
                "test-windows10/opt-*": (1.0, 1.0),
            },
        }
    )
    test_scheduling.close_failing_together_db("config_group")

    monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
def mock_schedule_tests_classify(monkeypatch):
    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            f"test-{granularity}1",
            f"test-{granularity}2",
        ]
        past_failures_data.close()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        class Model:
            def __init__(self, name):
                self.name = name

            def classify(self, items, probabilities=False):
                assert probabilities
                results = []
                for item in items:
                    runnable_name = item["test_job"]["name"]
                    if self.name == "testlabelselect":
                        if runnable_name in labels_to_choose:
                            results.append(
                                [
                                    1 - labels_to_choose[runnable_name],
                                    labels_to_choose[runnable_name],
                                ]
                            )
                        else:
                            results.append([0.9, 0.1])
                    elif self.name == "testgroupselect":
                        if runnable_name in groups_to_choose:
                            results.append(
                                [
                                    1 - groups_to_choose[runnable_name],
                                    groups_to_choose[runnable_name],
                                ]
                            )
                        else:
                            results.append([0.9, 0.1])
                return np.array(results)

        class MockModelCache:
            def get(self, model_name):
                return Model(model_name)

        monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())

    return do_mock
def _get_equivalence_sets(self, min_redundancy_confidence: float):
    try:
        with open(
            f"equivalence_sets_{min_redundancy_confidence}.pickle", "rb"
        ) as f:
            return pickle.load(f)
    except FileNotFoundError:
        past_failures_data = test_scheduling.get_past_failures(
            self.granularity, True
        )
        all_runnables = past_failures_data["all_runnables"]

        equivalence_sets = {}
        failing_together = test_scheduling.get_failing_together_db(
            "config_group", True
        )
        all_configs = pickle.loads(failing_together[b"$ALL_CONFIGS$"])
        configs_by_group = pickle.loads(failing_together[b"$CONFIGS_BY_GROUP$"])
        for group in all_runnables:
            key = test_scheduling.failing_together_key(group)
            try:
                failing_together_stats = pickle.loads(failing_together[key])
            except KeyError:
                failing_together_stats = {}

            def load_failing_together(
                config: str,
            ) -> Dict[str, Tuple[float, float]]:
                return failing_together_stats[config]

            configs = (
                configs_by_group[group]
                if group in configs_by_group
                else all_configs
            )
            equivalence_sets[group] = self._generate_equivalence_sets(
                configs, min_redundancy_confidence, load_failing_together, True
            )

        with open(
            f"equivalence_sets_{min_redundancy_confidence}.pickle", "wb"
        ) as f:
            pickle.dump(equivalence_sets, f)

        return equivalence_sets
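An illustrative sketch of the shape this returns; the group and configuration names below are hypothetical, not taken from real data. Each group maps to a collection of config sets, and configs within the same set are considered redundant with one another at the requested confidence, which is what get_config_specific_groups above relies on when it looks for groups whose equivalence set is exactly {config}.

# Hypothetical example of the equivalence-set structure returned above.
equivalence_sets = {
    "dom/indexedDB": [
        # These two configs are redundant: scheduling either one is enough.
        {"test-linux1804-64/opt", "test-linux1804-64/debug"},
        # This config catches failures the others do not.
        {"test-windows10/opt"},
    ],
}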
def mock_schedule_tests_classify(tmpdir, monkeypatch):
    os.chdir(tmpdir)
    os.makedirs("data")

    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            f"test-{granularity}1",
            f"test-{granularity}2",
        ]
        past_failures_data.close()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        class Model:
            def __init__(self, name):
                self.name = name

            def classify(self, items, probabilities=False):
                assert probabilities
                results = []
                for item in items:
                    if self.name == "testlabelselect":
                        if item["test_job"]["name"] in labels_to_choose:
                            results.append([0.1, 0.9])
                        else:
                            results.append([0.9, 0.1])
                    elif self.name == "testgroupselect":
                        if item["test_job"]["name"] in groups_to_choose:
                            results.append([0.1, 0.9])
                        else:
                            results.append([0.9, 0.1])
                return np.array(results)

        def mock_get_model(modelname):
            return Model(modelname)

        monkeypatch.setattr(bugbug_http.models, "get_model", mock_get_model)

    return do_mock
def generate_all_data(): past_failures = test_scheduling.get_past_failures(granularity) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open(push_data_path, "r") as f: push_data = json.load(f) logger.info(f"push data nodes: {len(push_data)}") if granularity == "label": push_data = [ ( revisions, rename_tasks(push_tasks), rename_tasks(possible_regressions), rename_tasks(likely_regressions), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] # In the last 28 pushes, we definitely run all possible runnables. all_runnables_set = set( sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), []) ) # Filter runnables we don't need. all_runnables = filter_runnables( list(all_runnables_set), all_runnables_set, granularity ) all_runnables_set = set(all_runnables_set) logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes") push_data = [ ( revisions, filter_runnables(push_tasks, all_runnables_set, granularity), filter_runnables( possible_regressions, all_runnables_set, granularity ), filter_runnables( likely_regressions, all_runnables_set, granularity ), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data ] if granularity == "label": generate_failing_together_probabilities(push_data) # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False if granularity == "group": update_touched_together_gen = test_scheduling.update_touched_together() next(update_touched_together_gen) for i in tqdm(range(len(push_data))): ( revisions, push_runnables, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map ) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. 
runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions) ) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity == "group": update_touched_together_gen.send(commits[0]["node"]) result = { "revs": revisions, "data": [], } for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result["data"].append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield result if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close()
def mock_schedule_tests_classify(
    monkeypatch: MonkeyPatch,
) -> Callable[[dict[str, float], dict[str, float]], None]:
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity, False)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            "test-linux1804-64-opt-label1",
            "test-linux1804-64-opt-label2",
            "test-group1",
            "test-group2",
            "test-linux1804-64/opt",
            "test-windows10/opt",
        ]
        past_failures_data.close()

    try:
        test_scheduling.close_failing_together_db("label")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("label", False)
    failing_together[b"test-linux1804-64/opt"] = pickle.dumps(
        {
            "test-windows10/opt": (0.1, 1.0),
        }
    )
    test_scheduling.close_failing_together_db("label")

    try:
        test_scheduling.close_failing_together_db("config_group")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("config_group", False)
    failing_together[b"$ALL_CONFIGS$"] = pickle.dumps(
        ["test-linux1804-64/opt", "test-windows10/debug", "test-windows10/opt"]
    )
    failing_together[b"$CONFIGS_BY_GROUP$"] = pickle.dumps(
        {
            "test-group1": {
                "test-linux1804-64/opt",
                "test-windows10/debug",
                "test-windows10/opt",
            },
            "test-group2": {
                "test-linux1804-64/opt",
                "test-windows10/debug",
                "test-windows10/opt",
            },
        }
    )
    failing_together[b"test-group1"] = pickle.dumps(
        {
            "test-linux1804-64/opt": {
                "test-windows10/debug": (1.0, 0.0),
                "test-windows10/opt": (1.0, 1.0),
            },
            "test-windows10/debug": {
                "test-windows10/opt": (1.0, 0.0),
            },
        }
    )
    test_scheduling.close_failing_together_db("config_group")

    try:
        test_scheduling.close_touched_together_db()
    except AssertionError:
        pass
    test_scheduling.get_touched_together_db(False)
    test_scheduling.close_touched_together_db()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        def classify(self, items, probabilities=False):
            assert probabilities
            results = []
            for item in items:
                runnable_name = item["test_job"]["name"]
                if self.granularity == "label":
                    if runnable_name in labels_to_choose:
                        results.append(
                            [
                                1 - labels_to_choose[runnable_name],
                                labels_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
                elif self.granularity == "group":
                    if runnable_name in groups_to_choose:
                        results.append(
                            [
                                1 - groups_to_choose[runnable_name],
                                groups_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
            return np.array(results)

        monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
        monkeypatch.setattr(
            bugbug.models.testselect.TestSelectModel, "classify", classify
        )

    return do_mock
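A hedged sketch of how a test might drive the fixture above; the test name and chosen runnables are hypothetical. The callable returned by the fixture takes two dicts mapping runnable names to the confidence the mocked classifier should report for them.

def test_schedule_picks_high_confidence_runnables(mock_schedule_tests_classify):
    # Hypothetical example: make the mocked model confident about one label
    # and one group; everything else falls back to the default [0.9, 0.1].
    mock_schedule_tests_classify(
        {"test-linux1804-64-opt-label1": 0.9},
        {"test-group1": 0.8},
    )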
def __init__( self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir ): self.model_name = model_name self.cache_root = cache_root assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir." self.repo_dir = os.path.join(cache_root, "mozilla-central") self.model = download_and_load_model(model_name) assert self.model is not None self.git_repo_dir = git_repo_dir if git_repo_dir: self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir) self.method_defect_predictor_dir = method_defect_predictor_dir if method_defect_predictor_dir: self.clone_git_repo( "https://github.com/lucapascarella/MethodDefectPredictor", method_defect_predictor_dir, "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd", ) if model_name == "regressor": self.use_test_history = False model_data_X_path = f"{model_name}model_data_X" updated = download_check_etag( URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst") ) if updated: zstd_decompress(model_data_X_path) assert os.path.exists(model_data_X_path), "Decompressed X dataset exists" model_data_y_path = f"{model_name}model_data_y" updated = download_check_etag( URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst") ) if updated: zstd_decompress(model_data_y_path) assert os.path.exists(model_data_y_path), "Decompressed y dataset exists" self.X = to_array(joblib.load(model_data_X_path)) self.y = to_array(joblib.load(model_data_y_path)) past_bugs_by_function_path = "data/past_bugs_by_function.pickle" download_check_etag( PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst" ) zstd_decompress(past_bugs_by_function_path) assert os.path.exists(past_bugs_by_function_path) with open(past_bugs_by_function_path, "rb") as f: self.past_bugs_by_function = pickle.load(f) if model_name == "testselect": self.use_test_history = True assert db.download_support_file( test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB ) self.past_failures_data = test_scheduling.get_past_failures() self.testfailure_model = download_and_load_model("testfailure") assert self.testfailure_model is not None
def test_generate_data(granularity): past_failures = test_scheduling.get_past_failures(granularity) commits = [ { "types": ["C/C++"], "files": ["dom/file1.cpp"], "directories": ["dom"], "components": ["DOM"], }, { "types": ["C/C++"], "files": ["dom/file1.cpp", "dom/file2.cpp"], "directories": ["dom"], "components": ["DOM"], }, { "types": ["C/C++"], "files": ["layout/file.cpp"], "directories": ["layout"], "components": ["Layout"], }, { "types": ["C/C++"], "files": ["layout/file.cpp"], "directories": ["layout"], "components": ["Layout"], }, { "types": ["JavaScript", "C/C++"], "files": ["dom/file1.cpp", "dom/file1.js"], "directories": ["dom"], "components": ["DOM"], }, ] data = list( test_scheduling.generate_data(past_failures, commits[0], 1, ["runnable1", "runnable2"], [], [])) assert len(data) == 2 assert data[0] == { "failures": 0, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 0, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": False, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } assert data[1] == { "failures": 0, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 0, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": False, "name": "runnable2", "touched_together_directories": 0, "touched_together_files": 0, } data = list( test_scheduling.generate_data(past_failures, commits[1], 2, ["runnable1", "runnable2"], ["runnable1"], [])) assert len(data) == 2 assert data[0] == { "failures": 0, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 0, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": 
False, "is_possible_regression": True, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } assert data[1] == { "failures": 0, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 0, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": False, "name": "runnable2", "touched_together_directories": 0, "touched_together_files": 0, } data = list( test_scheduling.generate_data(past_failures, commits[2], 3, ["runnable1", "runnable2"], [], ["runnable2"])) assert len(data) == 2 assert data[0] == { "failures": 1, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1, "failures_past_1400_pushes": 1, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 1, "failures_past_2800_pushes": 1, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1, "failures_past_700_pushes": 1, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 1, "is_likely_regression": False, "is_possible_regression": False, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } assert data[1] == { "failures": 0, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 0, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": True, "is_possible_regression": False, "name": "runnable2", "touched_together_directories": 0, "touched_together_files": 0, } data = list( test_scheduling.generate_data(past_failures, commits[3], 4, ["runnable1"], [], [])) assert len(data) == 1 assert data[0] == { "failures": 1, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1, "failures_past_1400_pushes": 1, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 1, "failures_past_2800_pushes": 1, "failures_past_2800_pushes_in_components": 0, 
"failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1, "failures_past_700_pushes": 1, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 1, "is_likely_regression": False, "is_possible_regression": False, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } data = list( test_scheduling.generate_data( past_failures, commits[4], 1500, ["runnable1", "runnable2"], ["runnable1", "runnable2"], [], )) assert len(data) == 2 assert data[0] == { "failures": 1, "failures_in_components": 1, "failures_in_directories": 1, "failures_in_files": 1, "failures_in_types": 1, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 1, "failures_past_2800_pushes_in_components": 1, "failures_past_2800_pushes_in_directories": 1, "failures_past_2800_pushes_in_files": 1, "failures_past_2800_pushes_in_types": 1, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": True, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } assert data[1] == { "failures": 1, "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1, "failures_past_1400_pushes": 0, "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0, "failures_past_2800_pushes": 1, "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": True, "name": "runnable2", "touched_together_directories": 0, "touched_together_files": 0, } data = list( test_scheduling.generate_data( past_failures, commits[4], 2400, ["runnable1", "runnable2"], ["runnable1", "runnable2"], [], )) assert len(data) == 2 assert data[0] == { "failures": 2, "failures_in_components": 2, "failures_in_directories": 2, "failures_in_files": 3, "failures_in_types": 3, "failures_past_1400_pushes": 1, "failures_past_1400_pushes_in_components": 1, "failures_past_1400_pushes_in_directories": 1, "failures_past_1400_pushes_in_files": 2, "failures_past_1400_pushes_in_types": 2, "failures_past_2800_pushes": 2, "failures_past_2800_pushes_in_components": 2, "failures_past_2800_pushes_in_directories": 2, "failures_past_2800_pushes_in_files": 3, "failures_past_2800_pushes_in_types": 3, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": True, "name": "runnable1", "touched_together_directories": 0, "touched_together_files": 0, } assert data[1] == { 
"failures": 2, "failures_in_components": 1, "failures_in_directories": 1, "failures_in_files": 2, "failures_in_types": 3, "failures_past_1400_pushes": 1, "failures_past_1400_pushes_in_components": 1, "failures_past_1400_pushes_in_directories": 1, "failures_past_1400_pushes_in_files": 2, "failures_past_1400_pushes_in_types": 2, "failures_past_2800_pushes": 2, "failures_past_2800_pushes_in_components": 1, "failures_past_2800_pushes_in_directories": 1, "failures_past_2800_pushes_in_files": 2, "failures_past_2800_pushes_in_types": 3, "failures_past_700_pushes": 0, "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0, "is_likely_regression": False, "is_possible_regression": True, "name": "runnable2", "touched_together_directories": 0, "touched_together_files": 0, }
def generate_all_data() -> Generator[Dict[str, Any], None, None]: past_failures = test_scheduling.get_past_failures( granularity, False) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 commit_map = {} for commit_data in tqdm(repository.get_commits()): commit_map[commit_data["node"]] = commit_data # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 if granularity in ("group", "config_group"): update_touched_together_gen = test_scheduling.update_touched_together( ) next(update_touched_together_gen) for ( i, ( revisions, fix_revision, push_runnables, possible_regressions, likely_regressions, ), ) in enumerate(tqdm(push_data_iter(), total=push_data_count)): push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue # Skip wptsync commits, since they are not like normal pushes made by developers. if any(repository.is_wptsync(commit) for commit in commits): continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions)) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity in ("group", "config_group"): update_touched_together_gen.send(commits[0]["node"]) result_data = [] for data in test_scheduling.generate_data( granularity, past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result_data.append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield { "revs": revisions, "data": result_data, } if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info( f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close()
def evaluation(self) -> None: # Get a test set of pushes on which to test the model. pushes, train_push_len = self.get_pushes(False) # To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using # only failure data from the training pushes (otherwise, we'd leak training information into the test # set). print("Generate failing together DB (restricted to training pushes)") push_data_iter, push_data_count, _ = test_scheduling.get_push_data( "label" if self.granularity == "label" else "config_group" ) test_scheduling.generate_failing_together_probabilities( "label" if self.granularity == "label" else "config_group", push_data_iter(), push_data_count, pushes[train_push_len - 1]["revs"][0], ) test_pushes_list = pushes[train_push_len:] all_tasks = reduce( lambda x, y: x | y, ( set(push["failures"]) | set(push["passes"]) for push in test_pushes_list[-28:] ), ) all_revs = set(sum((push["revs"] for push in test_pushes_list), [])) test_pushes_failures = sum( 1 for push in test_pushes_list if len(push["failures"]) > 0 ) test_pushes = {push["revs"][0]: push for push in test_pushes_list} if self.granularity == "group": for ( revisions, fix_revision, push_runnables, possible_regressions, likely_regressions, ) in tqdm(push_data_iter(), total=push_data_count): if revisions[0] not in test_pushes: continue test_pushes[revisions[0]]["config_group_failures"] = ( possible_regressions + likely_regressions ) print( f"Testing on {len(test_pushes)} ({test_pushes_failures} with failures) out of {len(pushes)}. {len(all_tasks)} schedulable tasks." ) del pushes commit_map = get_commit_map(all_revs) past_failures_data = test_scheduling.get_past_failures(self.granularity, True) last_push_num = past_failures_data["push_num"] past_failures_data.close() # Select tests for all the pushes in the test set. for i, push in enumerate(tqdm(test_pushes.values())): commits = tuple( commit_map.pop(revision) for revision in push["revs"] if revision in commit_map ) if len(commits) == 0: push["all_possibly_selected"] = {} continue push_num = last_push_num - (len(test_pushes) - (i + 1)) # Note: we subtract 100 to the push number to make sure we don't use # past failure data for the push itself. # The number 100 comes from the fact that in the past failure data # generation we store past failures in batches of 100 pushes. 
push["all_possibly_selected"] = self.select_tests( commits, 0.5, push_num - 100 ) def do_eval( executor: concurrent.futures.ProcessPoolExecutor, confidence_threshold: float, reduction: Optional[float], cap: Optional[int], minimum: Optional[int], ) -> None: futures: Dict[concurrent.futures.Future, Dict[str, Any]] = {} for push in test_pushes.values(): futures[ executor.submit( eval_apply_transforms, self, push, confidence_threshold, reduction, cap, minimum, ) ] = push for future in concurrent.futures.as_completed(futures): exc = future.exception() if exc is not None: print( "Exception {} while running {}".format( exc, futures[future]["revs"][0] ) ) for f in futures: f.cancel() push = futures[future] selected, group_configs = future.result() if reduction is not None and self.granularity == "group": push["number_configs"] = len( set( sum( group_configs.values(), [], ) ) ) selected_config_groups = set( (config, group) for group, configs in group_configs.items() for config in configs ) caught_config_groups = selected_config_groups & set( push["config_group_failures"] ) push["caught_one_config_group"] = ( len(caught_config_groups) > 0 if len(push["config_group_failures"]) != 0 else None ) push["caught_percentage_config_group"] = ( len(caught_config_groups) / len(push["config_group_failures"]) if len(push["config_group_failures"]) != 0 else None ) caught = selected & set(push["failures"]) push["number_scheduled"] = len(selected) push["caught_one"] = ( len(caught) > 0 if len(push["failures"]) != 0 else None ) push["some_didnt_run"] = ( not selected.issubset(set(push["passes"]) | set(push["failures"])), ) push["caught_percentage"] = ( len(caught) / len(push["failures"]) if len(push["failures"]) != 0 else None ) min_scheduled = min( result["number_scheduled"] for result in test_pushes.values() ) max_scheduled = max( result["number_scheduled"] for result in test_pushes.values() ) average_scheduled = statistics.mean( result["number_scheduled"] for result in test_pushes.values() ) num_failing_pushes = sum( 1 for result in test_pushes.values() if result["caught_one"] is not None ) num_caught_one = sum( 1 for result in test_pushes.values() if result["caught_one"] ) num_caught_one_or_some_didnt_run = sum( 1 for result in test_pushes.values() if result["caught_one"] or (result["caught_one"] is not None and result["some_didnt_run"]) ) percentage_caught_one = 100 * num_caught_one / num_failing_pushes percentage_caught_one_or_some_didnt_run = ( 100 * num_caught_one_or_some_didnt_run / num_failing_pushes ) average_caught_percentage = 100 * statistics.mean( result["caught_percentage"] for result in test_pushes.values() if result["caught_percentage"] is not None ) reduction_str = ( f"enabled at {reduction * 100}%" if reduction is not None else "disabled" ) message = f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures." 
if reduction is not None and self.granularity == "group": average_configs = statistics.mean( result["number_configs"] for result in test_pushes.values() ) median_configs = statistics.median( result["number_configs"] for result in test_pushes.values() ) message += f" On average, we selected {average_configs} configs (a median of {median_configs} configs)." num_caught_one_config_group = sum( 1 for result in test_pushes.values() if result["caught_one_config_group"] ) percentage_caught_one_config_group = ( 100 * num_caught_one_config_group / num_failing_pushes ) average_caught_percentage_config_group = 100 * statistics.mean( result["caught_percentage_config_group"] for result in test_pushes.values() if result["caught_percentage_config_group"] is not None ) message += f" In {percentage_caught_one_config_group}% of pushes we caught at least one config/group failure. On average, we caught {average_caught_percentage_config_group}% of all seen config/group failures." print(message) with concurrent.futures.ProcessPoolExecutor( max_workers=utils.get_physical_cpu_count() ) as executor: scenarios = [ (None, None, None), (10, None, None), (None, 300, None), (None, None, 0.9), (None, None, 1.0), ] for minimum, cap, reduction in scenarios: # Pre-generate equivalence sets, so when we run the config selection in multiple processes # we don't risk concurrent writes to the equivalence sets file. if reduction is not None and self.granularity == "group": self._get_equivalence_sets(reduction) for confidence_threshold in [0.5, 0.7, 0.8, 0.85, 0.9, 0.95]: do_eval(executor, confidence_threshold, reduction, cap, minimum)
def evaluation(self): # Get a test set of pushes on which to test the model. pushes, train_push_len = self.get_pushes(False) # To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using # only failure data from the training pushes (otherwise, we'd leak training information into the test # set). if self.granularity == "label": print( "Generate failing together DB (restricted to training pushes)") push_data, _ = test_scheduling.get_push_data("label") test_scheduling.generate_failing_together_probabilities( push_data, pushes[train_push_len - 1]["revs"][0]) test_pushes = pushes[train_push_len:] all_tasks = reduce( lambda x, y: x | y, (set(push["failures"]) | set(push["passes"]) for push in test_pushes[-28:]), ) test_pushes_failures = sum(1 for push in test_pushes if len(push["failures"]) > 0) test_pushes = {push["revs"][0]: push for push in test_pushes} print( f"Testing on {len(test_pushes)} ({test_pushes_failures} with failures) out of {len(pushes)}. {len(all_tasks)} schedulable tasks." ) commit_map = get_commit_map() past_failures_data = test_scheduling.get_past_failures( self.granularity) last_push_num = past_failures_data["push_num"] past_failures_data.close() # Select tests for all the pushes in the test set. for i, (rev, push) in enumerate(tqdm(test_pushes.items())): commits = tuple(commit_map[revision] for revision in push["revs"] if revision in commit_map) if len(commits) == 0: test_pushes[rev]["all_possibly_selected"] = {} continue push_num = last_push_num - (len(test_pushes) - (i + 1)) # Note: we subtract 100 to the push number to make sure we don't use # past failure data for the push itself. # The number 100 comes from the fact that in the past failure data # generation we store past failures in batches of 100 pushes. 
test_pushes[rev]["all_possibly_selected"] = self.select_tests( commits, 0.3, push_num - 100) reductions = [None] if self.granularity == "label": reductions += [0.9, 1.0] def do_eval(confidence_threshold, reduction, cap, minimum): for rev, push in test_pushes.items(): selected = set(name for name, confidence in push["all_possibly_selected"].items() if confidence >= confidence_threshold) if minimum is not None and len(selected) < minimum: remaining = [(name, confidence) for name, confidence in push["all_possibly_selected"].items() if name not in selected] selected.update(name for name, _ in sorted( remaining, key=lambda x: -x[1])[:minimum - len(selected)]) if reduction is not None: selected = self.reduce(selected, reduction) if cap is not None and len(selected) > cap: selected = set( sorted( ((name, confidence) for name, confidence in push["all_possibly_selected"].items() if name in selected), key=lambda x: x[1], reverse=True, )[:cap]) caught = selected & set(push["failures"]) push["number_scheduled"] = len(selected) push["caught_one"] = (len(caught) > 0 if len(push["failures"]) != 0 else None) push["some_didnt_run"] = (not selected.issubset( set(push["passes"]) | set(push["failures"])), ) push["caught_percentage"] = (len(caught) / len(push["failures"]) if len(push["failures"]) != 0 else None) min_scheduled = min(result["number_scheduled"] for result in test_pushes.values()) max_scheduled = max(result["number_scheduled"] for result in test_pushes.values()) average_scheduled = statistics.mean( result["number_scheduled"] for result in test_pushes.values()) num_failing_pushes = sum(1 for result in test_pushes.values() if result["caught_one"] is not None) num_caught_one = sum(1 for result in test_pushes.values() if result["caught_one"]) num_caught_one_or_some_didnt_run = sum( 1 for result in test_pushes.values() if result["caught_one"] or (result["caught_one"] is not None and result["some_didnt_run"])) percentage_caught_one = 100 * num_caught_one / num_failing_pushes percentage_caught_one_or_some_didnt_run = ( 100 * num_caught_one_or_some_didnt_run / num_failing_pushes) average_caught_percentage = 100 * statistics.mean( result["caught_percentage"] for result in test_pushes.values() if result["caught_percentage"] is not None) reduction_str = (f"enabled at {reduction * 100}%" if reduction is not None else "disabled") print( f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, and cap at {cap}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures." ) for minimum in [None, 10]: for cap in [None, 300, 500]: for reduction in reductions: for confidence_threshold in [ 0.5, 0.7, 0.8, 0.85, 0.9, 0.95 ]: do_eval(confidence_threshold, reduction, cap, minimum)
def generate_all_data(): past_failures = test_scheduling.get_past_failures() push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open("push_data.json", "r") as f: push_data = json.load(f)[1:] logger.info(f"push data nodes: {len(push_data)}") # In the last 28 pushes, we definitely run all possible tasks. all_tasks_set = set( sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])) # Filter tasks we don't need. all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set) all_tasks_set = set(all_tasks) logger.info( f"{len(all_tasks_set)} tasks run in the last 28 pushes") # Store all tasks in the past_failures DB so it can be used in the evaluation phase. past_failures["all_tasks"] = all_tasks # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_tasks = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for i in tqdm(range(len(push_data))): ( revisions, push_tasks, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_tasks, we'd generate a huge amount of data. # So we consider only the tasks which run in this push, and the possible and likely regressions # from this push. tasks_to_consider = list( set(push_tasks + possible_regressions + likely_regressions)) tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set) if len(tasks_to_consider) == 0: skipped_no_tasks += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, tasks_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: saved_nodes.add(i) data["revs"] = revisions yield data logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)") past_failures["push_num"] = push_num past_failures.close()
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        with open(model_data_X_path, "rb") as fb:
            self.X = to_array(pickle.load(fb))

        with open(model_data_y_path, "rb") as fb:
            self.y = to_array(pickle.load(fb))

        past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "r") as f:
            self.past_bugs_by_function = json.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        self.testfailure_model = cast(
            TestFailureModel, TestFailureModel.load(download_model("testfailure"))
        )
        assert self.testfailure_model is not None
def evaluation(self): # Get a test set of pushes on which to test the model. pushes, train_push_len = self.get_pushes() test_pushes = pushes[train_push_len:] all_tasks = reduce( lambda x, y: x | y, (set(push["failures"]) | set(push["passes"]) for push in test_pushes[-28:]), ) test_pushes = {push["revs"][0]: push for push in test_pushes} print( f"Testing on {len(pushes) - train_push_len} out of {len(pushes)}. {len(all_tasks)} schedulable tasks." ) commit_map = get_commit_map() past_failures_data = test_scheduling.get_past_failures( self.granularity) last_push_num = past_failures_data["push_num"] past_failures_data.close() # Select tests for all the pushes in the test set. for i, (rev, push) in enumerate(tqdm(test_pushes.items())): commits = tuple(commit_map[revision] for revision in push["revs"] if revision in commit_map) if len(commits) == 0: continue push_num = last_push_num - (len(test_pushes) - (i + 1)) # Note: we subtract 100 to the push number to make sure we don't use # past failure data for the push itself. # The number 100 comes from the fact that in the past failure data # generation we store past failures in batches of 100 pushes. test_pushes[rev]["all_possibly_selected"] = self.select_tests( commits, 0.3, push_num - 100) reductions = [None] if self.granularity == "label": reductions += [0.7, 0.8, 0.9, 1.0] for reduction in reductions: for confidence_threshold in [0.3, 0.5, 0.7, 0.8]: for rev, push in test_pushes.items(): selected = set(name for name, confidence in push["all_possibly_selected"].items() if confidence >= confidence_threshold) if reduction is not None: selected = self.reduce(selected, reduction) caught = selected & set(push["failures"]) push["number_scheduled"] = len(selected) push["caught_one"] = (len(caught) > 0 if len(push["failures"]) != 0 else None) push["some_didnt_run"] = (not selected.issubset( set(push["passes"]) | set(push["failures"])), ) push["caught_percentage"] = (len(caught) / len(push["failures"]) if len(push["failures"]) != 0 else None) min_scheduled = min(result["number_scheduled"] for result in test_pushes.values()) max_scheduled = max(result["number_scheduled"] for result in test_pushes.values()) average_scheduled = statistics.mean( result["number_scheduled"] for result in test_pushes.values()) num_failing_pushes = sum(1 for result in test_pushes.values() if result["caught_one"] is not None) num_caught_one = sum(1 for result in test_pushes.values() if result["caught_one"]) num_caught_one_or_some_didnt_run = sum( 1 for result in test_pushes.values() if result["caught_one"] or ( result["caught_one"] is not None and result["some_didnt_run"])) percentage_caught_one = 100 * num_caught_one / num_failing_pushes percentage_caught_one_or_some_didnt_run = ( 100 * num_caught_one_or_some_didnt_run / num_failing_pushes) average_caught_percentage = 100 * statistics.mean( result["caught_percentage"] for result in test_pushes.values() if result["caught_percentage"] is not None) reduction_str = (f"enabled at {reduction * 100}%" if reduction is not None else "disabled") print( f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures." )
def test_select_configs(failing_together_config_group: LMDBDict) -> None:
    past_failures_data = test_scheduling.get_past_failures("group", False)
    past_failures_data["all_runnables"] = ["group1", "group2"]
    past_failures_data.close()

    failing_together_config_group[b"group1"] = pickle.dumps({
        "linux1804-64-asan/debug": {
            "linux1804-64/debug": (1.0, 0.0), "linux1804-64/opt": (1.0, 0.0),
            "mac/debug": (1.0, 0.0), "windows10/debug": (1.0, 0.0),
        },
        "linux1804-64/debug": {
            "linux1804-64/opt": (1.0, 1.0), "mac/debug": (1.0, 1.0),
            "windows10/debug": (1.0, 1.0),
        },
        "linux1804-64/opt": {
            "mac/debug": (1.0, 1.0), "windows10/debug": (1.0, 1.0),
        },
        "mac/debug": {"windows10/debug": (1.0, 1.0)},
    })
    failing_together_config_group[b"group2"] = pickle.dumps({
        "linux1804-64-asan/debug": {
            "linux1804-64/debug": (1.0, 1.0), "linux1804-64/opt": (1.0, 0.0),
            "mac/debug": (1.0, 0.0), "windows10/debug": (1.0, 0.0),
        },
        "linux1804-64/debug": {
            "linux1804-64/opt": (1.0, 0.0), "mac/debug": (1.0, 0.0),
            "windows10/debug": (1.0, 1.0),
        },
        "linux1804-64/opt": {
            "mac/debug": (1.0, 0.0), "windows10/debug": (1.0, 0.0),
        },
        "mac/debug": {"windows10/debug": (1.0, 0.0)},
    })
    failing_together_config_group[b"$ALL_CONFIGS$"] = pickle.dumps([
        "linux1804-64-asan/debug",
        "linux1804-64/debug",
        "linux1804-64/opt",
        "mac/debug",
        "windows10/debug",
    ])
    failing_together_config_group[b"$CONFIGS_BY_GROUP$"] = pickle.dumps({
        "group1": {
            "linux1804-64-asan/debug",
            "linux1804-64/debug",
            "linux1804-64/opt",
            "mac/debug",
            "windows10/debug",
        },
        "group2": {
            "linux1804-64-asan/debug",
            "linux1804-64/debug",
            "linux1804-64/opt",
            "mac/debug",
            "windows10/debug",
        },
    })
    test_scheduling.close_failing_together_db("config_group")

    model = TestGroupSelectModel()
    result = model.select_configs({"group1", "group2"}, 1.0)
    assert len(result) == 2
    assert set(result["group1"]) == {"linux1804-64-asan/debug", "linux1804-64/opt"}
    assert set(result["group2"]) == {
        "linux1804-64/opt",
        "mac/debug",
        "linux1804-64/debug",
    }
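# The fixture above stores, per group, a pickled mapping of config -> {other config:
# (support, confidence)-style tuple}. Reading a confidence of 1.0 as "these two
# configs are always observed to fail together" explains why only one representative
# of each such cluster needs to be selected; this is an interpretation of the fixture,
# not the actual TestGroupSelectModel.select_configs algorithm. A minimal stand-alone
# sketch of decoding such a structure, using a hypothetical in-memory store:
import pickle

store = {
    b"group1": pickle.dumps(
        {
            "linux1804-64/debug": {"linux1804-64/opt": (1.0, 1.0)},
            "linux1804-64-asan/debug": {"linux1804-64/debug": (1.0, 0.0)},
        }
    )
}

stats = pickle.loads(store[b"group1"])
always_fail_together = [
    (config_a, config_b)
    for config_a, others in stats.items()
    for config_b, (_support, confidence) in others.items()
    if confidence == 1.0
]
print(always_fail_together)  # [('linux1804-64/debug', 'linux1804-64/opt')]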
def test_generate_data(granularity: str) -> None:
    past_failures = test_scheduling.get_past_failures(granularity, False)

    commits = [
        CommitDict({"types": ["C/C++"], "files": ["dom/file1.cpp"], "directories": ["dom"], "components": ["DOM"]}),
        CommitDict({"types": ["C/C++"], "files": ["dom/file1.cpp", "dom/file2.cpp"], "directories": ["dom"], "components": ["DOM"]}),
        CommitDict({"types": ["C/C++"], "files": ["layout/file.cpp"], "directories": ["layout"], "components": ["Layout"]}),
        CommitDict({"types": ["C/C++"], "files": ["layout/file.cpp"], "directories": ["layout"], "components": ["Layout"]}),
        CommitDict({"types": ["JavaScript", "C/C++"], "files": ["dom/file1.cpp", "dom/file1.js"], "directories": ["dom"], "components": ["DOM"]}),
    ]

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[0], 1, ["runnable1", "runnable2"], [], []
        )
    )
    assert len(data) == 2
    obj = {
        "failures": 0,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 0,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": False,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj
    obj = {
        "failures": 0,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 0,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": False,
        "name": "runnable2",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[1] == obj

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[1], 2, ["runnable1", "runnable2"], ["runnable1"], []
        )
    )
    assert len(data) == 2
    obj = {
        "failures": 0,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 0,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": True,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj
    obj = {
        "failures": 0,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 0,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": False,
        "name": "runnable2",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[1] == obj

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[2], 3, ["runnable1", "runnable2"], [], ["runnable2"]
        )
    )
    assert len(data) == 2
    obj = {
        "failures": 1,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1,
        "failures_past_1400_pushes": 1,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 1,
        "failures_past_2800_pushes": 1,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1,
        "failures_past_700_pushes": 1,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 1,
        "is_likely_regression": False, "is_possible_regression": False,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj
    obj = {
        "failures": 0,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 0,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 0,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 0,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": True, "is_possible_regression": False,
        "name": "runnable2",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[1] == obj

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[3], 4, ["runnable1"], [], []
        )
    )
    assert len(data) == 1
    obj = {
        "failures": 1,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1,
        "failures_past_1400_pushes": 1,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 1,
        "failures_past_2800_pushes": 1,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1,
        "failures_past_700_pushes": 1,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 1,
        "is_likely_regression": False, "is_possible_regression": False,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[4], 1500, ["runnable1", "runnable2"], ["runnable1", "runnable2"], []
        )
    )
    assert len(data) == 2
    obj = {
        "failures": 1,
        "failures_in_components": 1, "failures_in_directories": 1, "failures_in_files": 1, "failures_in_types": 1,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 1,
        "failures_past_2800_pushes_in_components": 1, "failures_past_2800_pushes_in_directories": 1, "failures_past_2800_pushes_in_files": 1, "failures_past_2800_pushes_in_types": 1,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": True,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj
    obj = {
        "failures": 1,
        "failures_in_components": 0, "failures_in_directories": 0, "failures_in_files": 0, "failures_in_types": 1,
        "failures_past_1400_pushes": 0,
        "failures_past_1400_pushes_in_components": 0, "failures_past_1400_pushes_in_directories": 0, "failures_past_1400_pushes_in_files": 0, "failures_past_1400_pushes_in_types": 0,
        "failures_past_2800_pushes": 1,
        "failures_past_2800_pushes_in_components": 0, "failures_past_2800_pushes_in_directories": 0, "failures_past_2800_pushes_in_files": 0, "failures_past_2800_pushes_in_types": 1,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": True,
        "name": "runnable2",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[1] == obj

    data = list(
        test_scheduling.generate_data(
            granularity, past_failures, commits[4], 2400, ["runnable1", "runnable2"], ["runnable1", "runnable2"], []
        )
    )
    assert len(data) == 2
    obj = {
        "failures": 2,
        "failures_in_components": 2, "failures_in_directories": 2, "failures_in_files": 3, "failures_in_types": 3,
        "failures_past_1400_pushes": 1,
        "failures_past_1400_pushes_in_components": 1, "failures_past_1400_pushes_in_directories": 1, "failures_past_1400_pushes_in_files": 2, "failures_past_1400_pushes_in_types": 2,
        "failures_past_2800_pushes": 2,
        "failures_past_2800_pushes_in_components": 2, "failures_past_2800_pushes_in_directories": 2, "failures_past_2800_pushes_in_files": 3, "failures_past_2800_pushes_in_types": 3,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": True,
        "name": "runnable1",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[0] == obj
    obj = {
        "failures": 2,
        "failures_in_components": 1, "failures_in_directories": 1, "failures_in_files": 2, "failures_in_types": 3,
        "failures_past_1400_pushes": 1,
        "failures_past_1400_pushes_in_components": 1, "failures_past_1400_pushes_in_directories": 1, "failures_past_1400_pushes_in_files": 2, "failures_past_1400_pushes_in_types": 2,
        "failures_past_2800_pushes": 2,
        "failures_past_2800_pushes_in_components": 1, "failures_past_2800_pushes_in_directories": 1, "failures_past_2800_pushes_in_files": 2, "failures_past_2800_pushes_in_types": 3,
        "failures_past_700_pushes": 0,
        "failures_past_700_pushes_in_components": 0, "failures_past_700_pushes_in_directories": 0, "failures_past_700_pushes_in_files": 0, "failures_past_700_pushes_in_types": 0,
        "is_likely_regression": False, "is_possible_regression": True,
        "name": "runnable2",
    }
    if granularity == "group":
        obj["touched_together_directories"] = 0
        obj["touched_together_files"] = 0
    assert data[1] == obj