def get_pushes(
    self, apply_filters: bool = False
) -> Tuple[List[Dict[str, List[str]]], int]:
    """Collect, for every push in the scheduling history, which jobs failed.

    Args:
        apply_filters: When true, a push with more than ``self.failures_skip``
            failures is dropped. Nothing is dropped when ``failures_skip`` is
            falsy.

    Returns:
        A ``(pushes, count)`` pair. ``pushes`` is a list of dicts with the
        keys ``"revs"``, ``"failures"`` and ``"passes"``; ``count`` is
        ``floor(0.9 * len(pushes))``, an ``int`` (presumably the number of
        pushes reserved for training - confirm against callers).
    """
    pushes = []
    for revs, test_datas in test_scheduling.get_test_scheduling_history(
        self.granularity
    ):
        failures = []
        passes = []

        for test_data in test_datas:
            name = test_data["name"]

            # At "label" granularity only names starting with "test-" count.
            if self.granularity == "label" and not name.startswith("test-"):
                continue

            if (
                test_data["is_likely_regression"]
                or test_data["is_possible_regression"]
            ):
                failures.append(name)
            else:
                passes.append(name)

        if (
            apply_filters
            and self.failures_skip
            and len(failures) > self.failures_skip
        ):
            continue

        pushes.append({"revs": revs, "failures": failures, "passes": passes})

    # math.floor returns an int, hence the ``int`` in the return annotation.
    return pushes, math.floor(0.9 * len(pushes))
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` for each labelled push/job pair.

    ``classes`` maps ``(first revision of the push, job name)`` to a label.
    History entries whose key is not in ``classes``, or whose revisions are
    all unknown to the commits DB, are skipped. The history entry itself is
    attached to the merged commit data under ``"test_job"``.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for test_data in history:
        revs = test_data["revs"]
        key = (revs[0], test_data["name"])
        if key not in classes:
            continue

        # Keep only the revisions the commits DB knows about.
        commits = tuple(commit_map[rev] for rev in revs if rev in commit_map)
        if not commits:
            continue

        commit_data = commit_features.merge_commits(commits)
        commit_data["test_job"] = test_data
        yield commit_data, classes[key]
def get_pushes(self):
    """Return every push with its failing and passing job names.

    When ``self.use_subset`` is set, the global RNG is seeded with 0 and only
    a random tenth (rounded up) of each push's passing names is kept.

    Returns:
        ``(pushes, count)`` where ``pushes`` is a list of dicts with the keys
        ``"revs"``, ``"failures"`` and ``"passes"`` and ``count`` is
        ``floor(0.9 * len(pushes))``.
    """
    if self.use_subset:
        random.seed(0)

    pushes = []
    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for revs, test_datas in history:
        failures, passes = [], []

        for test_data in test_datas:
            name = test_data["name"]

            # At "label" granularity only names starting with "test-" count.
            if self.granularity == "label" and not name.startswith("test-"):
                continue

            regressed = (
                test_data["is_likely_regression"]
                or test_data["is_possible_regression"]
            )
            bucket = failures if regressed else passes
            bucket.append(name)

        if self.use_subset:
            subset_size = math.ceil(len(passes) / 10)
            passes = random.sample(passes, subset_size)

        pushes.append(dict(revs=revs, failures=failures, passes=passes))

    return pushes, math.floor(0.9 * len(pushes))
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` once per labelled push.

    ``classes`` maps the first revision of a push to its label. The "label"
    history contains several entries per push; only the first entry seen for
    each push is used.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    seen = set()
    for test_data in test_scheduling.get_test_scheduling_history("label"):
        revs = test_data["revs"]
        first_rev = revs[0]

        if first_rev in seen or first_rev not in classes:
            continue
        # Marked as handled even if no commit data turns out to be available.
        seen.add(first_rev)

        commits = tuple(commit_map[rev] for rev in revs if rev in commit_map)
        if not commits:
            continue

        yield commit_features.merge_commits(commits), classes[first_rev]
def get_pushes(
    self, apply_filters: bool = False
) -> tuple[list[dict[str, Any]], int]:
    """Return every push in the history with its failing and passing jobs.

    Args:
        apply_filters: When true, pushes with more than
            ``self.failures_skip`` failures are left out (no effect when
            ``failures_skip`` is falsy).

    Returns:
        ``(pushes, count)`` where each push is a dict with the keys
        ``"revs"``, ``"failures"`` and ``"passes"`` and ``count`` is
        ``floor(0.9 * len(pushes))``.
    """
    collected = []
    history = test_scheduling.get_test_scheduling_history(self.granularity)

    for revs, test_datas in history:
        failed_names = []
        passed_names = []

        for test_data in test_datas:
            name = test_data["name"]
            regressed = (
                test_data["is_likely_regression"]
                or test_data["is_possible_regression"]
            )
            target = failed_names if regressed else passed_names
            target.append(name)

        if (
            apply_filters
            and self.failures_skip
            and len(failed_names) > self.failures_skip
        ):
            continue

        collected.append(
            dict(revs=revs, failures=failed_names, passes=passed_names)
        )

    return collected, math.floor(0.9 * len(collected))
def get_labels(self):
    """Label each push 1 if any of its jobs regressed, otherwise 0.

    Returns:
        ``(classes, [0, 1])`` where ``classes`` maps the first revision of a
        push to its label.
    """
    classes = {}
    for revs, test_datas in test_scheduling.get_test_scheduling_history("label"):
        rev = revs[0]
        regressed = any(
            job["is_likely_regression"] or job["is_possible_regression"]
            for job in test_datas
        )
        classes[rev] = 1 if regressed else 0

    failed_count = sum(1 for label in classes.values() if label == 1)
    print("{} commits failed".format(failed_count))

    passed_count = sum(1 for label in classes.values() if label == 0)
    print("{} commits did not fail".format(passed_count))

    return classes, [0, 1]
def get_labels(self):
    """Label every (push, job) pair 1 if the job regressed, otherwise 0.

    The global RNG is always seeded with 0. When ``self.use_subset`` is set,
    only a random tenth (rounded up) of each push's passing jobs is kept;
    failing jobs are never dropped.

    Returns:
        ``(classes, [0, 1])`` where ``classes`` maps
        ``(first revision of the push, job name)`` to the label.
    """
    classes_by_rev = defaultdict(dict)

    random.seed(0)

    for test_data in test_scheduling.get_test_scheduling_history(
        self.granularity
    ):
        rev = test_data["revs"][0]
        name = test_data["name"]

        # At "label" granularity only names starting with "test-" count.
        if self.granularity == "label" and not name.startswith("test-"):
            continue

        if test_data["is_likely_regression"] or test_data["is_possible_regression"]:
            classes_by_rev[rev][name] = 1
        else:
            classes_by_rev[rev][name] = 0

    if self.use_subset:
        for by_name in classes_by_rev.values():
            passing_names = [name for name, val in by_name.items() if val == 0]
            if not passing_names:
                continue

            chosen_passing_names = random.sample(
                passing_names, math.ceil(len(passing_names) / 10)
            )
            assert len(chosen_passing_names) > 0

            # Drop in place so the surviving entries keep their order.
            for name in set(passing_names) - set(chosen_passing_names):
                del by_name[name]

    classes = {
        (rev, name): val
        for rev, by_name in classes_by_rev.items()
        for name, val in by_name.items()
    }

    print("{} pushes considered".format(len(classes_by_rev)))
    print(
        "{} push/jobs failed".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )
    print(
        "{} push/jobs did not fail".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def get_labels(self):
    """Label every (push, job) pair 1 if the job regressed, otherwise 0.

    When ``self.use_subset`` is set, the global RNG is seeded with 0 and only
    a random tenth (rounded up) of each push's passing jobs is kept.

    Returns:
        ``(classes, [0, 1])`` where ``classes`` maps
        ``(first revision of the push, job name)`` to the label.
    """
    pushes = {}
    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for test_data in history:
        rev = test_data["revs"][0]
        name = test_data["name"]

        # At "label" granularity only names starting with "test-" count.
        if self.granularity == "label" and not name.startswith("test-"):
            continue

        push = pushes.setdefault(rev, {"failures": [], "passes": []})
        regressed = (
            test_data["is_likely_regression"]
            or test_data["is_possible_regression"]
        )
        push["failures" if regressed else "passes"].append(name)

    if self.use_subset:
        random.seed(0)
        for push in pushes.values():
            kept = math.ceil(len(push["passes"]) / 10)
            push["passes"] = random.sample(push["passes"], kept)

    classes = {}
    for rev, push in pushes.items():
        # Failures first, then passes: a name present in both ends up as 0.
        classes.update(((rev, name), 1) for name in push["failures"])
        classes.update(((rev, name), 0) for name in push["passes"])

    print("{} pushes considered".format(len(pushes)))

    failing_pushes = sum(1 for push in pushes.values() if len(push["failures"]) > 0)
    print("{} pushes with at least one failure".format(failing_pushes))

    failed_jobs = sum(1 for label in classes.values() if label == 1)
    print("{} push/jobs failed".format(failed_jobs))

    passed_jobs = sum(1 for label in classes.values() if label == 0)
    print("{} push/jobs did not fail".format(passed_jobs))

    return classes, [0, 1]
def get_labels(self):
    """Label each revision 1 if any of its history entries regressed, else 0.

    Returns:
        ``(classes, [0, 1])`` where ``classes`` maps the first revision of a
        history entry to its label.
    """
    classes = {}
    for test_data in test_scheduling.get_test_scheduling_history():
        rev = test_data["revs"][0]
        regressed = (
            test_data["is_likely_regression"]
            or test_data["is_possible_regression"]
        )
        if regressed:
            classes[rev] = 1
        else:
            # Never downgrade a revision already marked as failed.
            classes.setdefault(rev, 0)

    failed_count = sum(1 for label in classes.values() if label == 1)
    print("{} commits failed".format(failed_count))

    passed_count = sum(1 for label in classes.values() if label == 0)
    print("{} commits did not fail".format(passed_count))

    return classes, [0, 1]
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` for each labelled push/job pair.

    ``classes`` maps ``(first revision of the push, job name)`` to a label.
    Pushes with no revision known to the commit map are skipped entirely.
    Every yielded item is a freshly merged commit dict with the job's history
    entry stored under ``"test_job"``.
    """
    commit_map = get_commit_map()

    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for revs, test_datas in history:
        commits = tuple(commit_map[rev] for rev in revs if rev in commit_map)
        if not commits:
            continue

        first_rev = revs[0]
        for test_data in test_datas:
            key = (first_rev, test_data["name"])
            if key not in classes:
                continue

            # Merged again for every job: each item carries its own "test_job".
            commit_data = commit_features.merge_commits(commits)
            commit_data["test_job"] = test_data
            yield commit_data, classes[key]
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` for each labelled push.

    ``classes`` maps the first revision of a push to its label. Pushes that
    are not labelled, or whose revisions are all unknown to the commits DB,
    are skipped.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    for revs, _test_datas in test_scheduling.get_test_scheduling_history("label"):
        first_rev = revs[0]
        if first_rev not in classes:
            continue

        commits = tuple(commit_map[rev] for rev in revs if rev in commit_map)
        if not commits:
            continue

        yield commit_features.merge_commits(commits), classes[first_rev]
def train_test_split(self, X, y):
    """Split ``X`` and ``y`` at a push boundary instead of at a row boundary.

    Counts how many history entries each push contributes, takes the first
    90% of pushes (rounded down) and cuts ``X``/``y`` after the total number
    of entries those pushes account for.

    NOTE(review): this assumes the rows of ``X`` and ``y`` are in the same
    order as the scheduling history - confirm against the caller.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    pushes = OrderedDict()
    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for test_data in history:
        rev = test_data["revs"][0]
        name = test_data["name"]

        # At "label" granularity only names starting with "test-" count.
        if self.granularity == "label" and not name.startswith("test-"):
            continue

        pushes[rev] = pushes.get(rev, 0) + 1

    train_push_len = math.floor(0.9 * len(pushes))
    jobs_per_push = list(pushes.values())
    train_len = sum(jobs_per_push[:train_push_len])
    print(
        f"{train_push_len} pushes in the training set (corresponding to {train_len} push/jobs)"
    )

    return X[:train_len], X[train_len:], y[:train_len], y[train_len:]
def get_labels(self):
    """Label every (revision, job) pair 1 if the job regressed, otherwise 0.

    Only jobs whose name starts with ``"test-"`` are labelled.

    Returns:
        ``(classes, [0, 1])`` where ``classes`` maps
        ``(first revision, job name)`` to the label.
    """
    classes = {}
    for test_data in test_scheduling.get_test_scheduling_history():
        rev = test_data["revs"][0]
        name = test_data["name"]

        if not name.startswith("test-"):
            continue

        regressed = (
            test_data["is_likely_regression"]
            or test_data["is_possible_regression"]
        )
        classes[(rev, name)] = 1 if regressed else 0

    failed_count = sum(1 for label in classes.values() if label == 1)
    print("{} commit/jobs failed".format(failed_count))

    passed_count = sum(1 for label in classes.values() if label == 0)
    print("{} commit/jobs did not fail".format(passed_count))

    return classes, [0, 1]
def items_gen(self, classes):
    """Yield ``(commit data, label)`` for each labelled revision/job pair.

    ``classes`` maps ``(first revision, job name)`` to a label. Entries that
    are not labelled, or whose first revision is unknown to the commits DB,
    are skipped.

    Each yielded item is a shallow copy of the commit's data with the history
    entry stored under ``"test_job"``. Copying matters because several jobs
    share one revision: writing ``"test_job"`` into the shared commit dict
    would overwrite the value seen by every item previously yielded for that
    revision.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    # TODO: Data from multiple commits in the same push should be merged.
    for test_data in test_scheduling.get_test_scheduling_history():
        rev = test_data["revs"][0]
        key = (rev, test_data["name"])

        if key not in classes:
            continue

        if rev not in commit_map:
            continue

        # Copy so that items for the same revision do not alias each other.
        commit_data = dict(commit_map[rev])
        commit_data["test_job"] = test_data

        yield commit_data, classes[key]
def generate_test_scheduling_history(self):
    """Append per-push, per-task failure-history rows to the test scheduling DB.

    Steps:
      1. Make sure ``push_data.json`` and the commits DB are available.
      2. Find the last revision stored by a previous run (if the test
         scheduling DB is current) so that only newer pushes are processed.
      3. For each remaining push, and for each task worth considering, count
         past failures overall and per commit type / file / directory /
         component, over the whole history and over the last 7, 14, 28 and 56
         pushes, updating the counters stored in ``data/past_failures.lmdb``.
      4. Append a row for every task of every push pushed after
         ``HISTORY_DATE_START``, then compress the DB and archive the
         counters.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    HISTORICAL_TIMESPAN = 56

    # Look-back windows, in pushes, for which a failure count is reported.
    PAST_WINDOWS = (7, 14, 28, 56)

    # (counter namespace, row-key suffix, merged-commit key holding the items).
    # The "all" namespace has the single pseudo-item "all".
    FEATURE_GROUPS = (
        ("all", "", None),
        ("type", "_in_types", "types"),
        ("file", "_in_files", "files"),
        ("directory", "_in_directories", "directories"),
        ("component", "_in_components", "components"),
    )

    # Stays None when there is nothing to resume from, including the case of
    # a downloaded but empty history (which used to raise a NameError).
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["revs"][0]

    past_failures = shelve.Shelf(
        LMDBDict("data/past_failures.lmdb"),
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return (total, past 7, past 14, past 28, past 56) failure counts.

        The counts are summed over ``items``; when ``is_regression`` is true
        each item's counter is incremented at ``push_num`` after being read.
        """
        totals = [0] * (1 + len(PAST_WINDOWS))

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            totals[0] += value
            for position, window in enumerate(PAST_WINDOWS, start=1):
                totals[position] += value - cur[push_num - window]

            if is_regression:
                cur[push_num] = value + 1

        return tuple(totals)

    def generate_data():
        """Yield one row per (push, task) for pushes newer than the cutoff."""
        nonlocal push_num
        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = {
            task
            for _, push_tasks, _, _ in push_data[-28:]
            for task in push_tasks
        }
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for i in tqdm(range(len(push_data))):
            # Entries are popped as they are consumed, so the list shrinks
            # while the loop advances.
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 20:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if len(tasks_to_consider) == 0:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            for task in tasks_to_consider:
                is_regression = (
                    task in possible_regressions or task in likely_regressions
                )

                # The counters are updated for every task, even when the row
                # ends up not being saved because the push is too old.
                row = {"revs": revisions, "name": task}
                for type_, suffix, commit_key in FEATURE_GROUPS:
                    items = (
                        ["all"] if commit_key is None else merged_commits[commit_key]
                    )
                    counts = get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )
                    row[f"failures{suffix}"] = counts[0]
                    for window, count in zip(PAST_WINDOWS, counts[1:]):
                        row[f"failures_past_{window}_pushes{suffix}"] = count
                row["is_possible_regression"] = task in possible_regressions
                row["is_likely_regression"] = task in likely_regressions

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)

                    yield row

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")
def generate_test_scheduling_history(self, granularity):
    """Append new rows to the test scheduling DB for ``granularity``.

    Args:
        granularity: ``"label"`` or ``"group"``; selects the push data file,
            the scheduling DB, the past-failures DB and the training window.
            Only ``"group"`` maintains the touched-together DB.

    The push data is downloaded (and decompressed when it changed), the last
    revision stored by a previous run is looked up so that only newer pushes
    are processed, and every row produced by
    ``test_scheduling.generate_data`` for a push newer than
    ``HISTORY_DATE_START`` is appended to the DB. The DB is then compressed
    and the auxiliary DBs are archived.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB.
    # The download is kept out of the assert so it still runs under
    # ``python -O``, which strips assert statements.
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )

    db.download(test_scheduling_db, support_files_too=True)

    # Remains None when the existing history is empty.
    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history(granularity):
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the rows to append, updating the past-failures DB on the way."""
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = {
            runnable
            for _, push_runnables, _, _ in push_data[-28:]
            for runnable in push_runnables
        }
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # Rebuild the set from the *filtered* list, so the count logged below
        # and the set used to filter each push agree with what is stored
        # under "all_runnables".
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Advance the generator to its first yield so it can accept send().
            next(update_touched_together_gen)

        for i in tqdm(range(len(push_data))):
            # Entries are popped as they are consumed, so the list shrinks
            # while the loop advances.
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # So we consider only the runnables which run in this push, and the possible and likely regressions
            # from this push.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )
            runnables_to_consider = filter_runnables(
                runnables_to_consider, all_runnables_set, granularity
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        if granularity == "group":
            # Sending None lets the generator finish; StopIteration is the
            # expected outcome.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(self):
    """Append per-commit, per-task failure-history rows to the test scheduling DB.

    Steps:
      1. Make sure ``push_data.json`` and the commits DB are available and
         index the push data by revision.
      2. Find the last revision stored by a previous run (if the test
         scheduling DB is current) so that only newer commits are processed.
      3. For each remaining commit that has push data, and for each of its
         tasks matching ``JOBS_TO_CONSIDER``, count past failures overall and
         per commit type / file / directory / component, over the whole
         history and over the last 7, 14, 28 and 56 pushes.
      4. Append a row for every such task of every commit pushed after
         ``HISTORY_DATE_START``, then compress the DB and store the counters
         in ``data/past_failures.pickle``.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    # Revision -> (all tasks, possible regressions, likely regressions)
    push_data = {row[0]: (row[1], row[2], row[3]) for row in data[1:]}

    HISTORICAL_TIMESPAN = 56

    # Look-back windows, in pushes, for which a failure count is reported.
    PAST_WINDOWS = (7, 14, 28, 56)

    # (counter namespace, row-key suffix, commit key holding the items).
    # The "all" namespace has the single pseudo-item "all".
    FEATURE_GROUPS = (
        ("all", "", None),
        ("type", "_in_types", "types"),
        ("file", "_in_files", "files"),
        ("directory", "_in_directories", "directories"),
        ("component", "_in_components", "components"),
    )

    # Stays None when there is nothing to resume from, including the case of
    # a downloaded but empty history (which used to raise a NameError).
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["rev"]

    # NOTE(review): unpickling runs code embedded in the file; this is only
    # safe if the pickle always comes from a trusted previous run - confirm.
    try:
        with open("data/past_failures.pickle", "rb") as f:
            past_failures, push_num = pickle.load(f)
    except FileNotFoundError:
        past_failures = {}
        push_num = 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return (total, past 7, past 14, past 28, past 56) failure counts.

        The counts are summed over ``items``; when ``is_regression`` is true
        each item's counter is incremented at ``push_num`` after being read.
        """
        by_item = past_failures.setdefault(type_, {}).setdefault(task, {})

        totals = [0] * (1 + len(PAST_WINDOWS))

        for item in items:
            if item not in by_item:
                by_item[item] = ExpQueue(push_num, HISTORICAL_TIMESPAN + 1, 0)
            queue = by_item[item]

            value = queue[push_num]

            totals[0] += value
            for position, window in enumerate(PAST_WINDOWS, start=1):
                totals[position] += value - queue[push_num - window]

            if is_regression:
                queue[push_num] = value + 1

        return tuple(totals)

    def generate_data():
        """Yield one row per (commit, task) for commits newer than the cutoff."""
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node == last_node:
                can_start = True
                continue

            if not can_start:
                continue

            if node not in push_data:
                continue

            commits_with_data.add(node)

            commit_push_data = push_data[node]

            for task in commit_push_data[0]:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_regression = (
                    task in commit_push_data[1] or task in commit_push_data[2]
                )

                # The counters are updated for every task, even when the row
                # ends up not being saved because the commit is too old.
                row = {"rev": node, "name": task}
                for type_, suffix, commit_key in FEATURE_GROUPS:
                    items = ["all"] if commit_key is None else commit_data[commit_key]
                    counts = get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )
                    row[f"failures{suffix}"] = counts[0]
                    for window, count in zip(PAST_WINDOWS, counts[1:]):
                        row[f"failures_past_{window}_pushes{suffix}"] = count
                row["is_possible_regression"] = task in commit_push_data[1]
                row["is_likely_regression"] = task in commit_push_data[2]

                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(node)

                    yield row

            # One step per commit that has push data.
            push_num += 1

        logger.info(f"push data nodes: {len(push_data)}")

        logger.info(f"commits linked to push data: {len(commits_with_data)}")

        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open("data/past_failures.pickle", "wb") as f:
        pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

    zstd_compress("data/past_failures.pickle")
def generate_test_scheduling_history(self, granularity):
    """Incrementally regenerate the test scheduling history DB.

    The push data dump for ``granularity`` is fetched (and decompressed when
    it changed), the commits DB and the existing scheduling DB are
    downloaded, and every push that comes after the last push already stored
    in the scheduling DB is turned into training rows which are appended to
    that DB. The updated DBs are then compressed/archived.

    Args:
        granularity: Either ``"label"`` or ``"group"``; these are the only
            two values for which the DB paths below are defined. For
            ``"label"`` the "failing together" DB is also regenerated; for
            ``"group"`` the "touched together" data is also updated.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
        os.remove(f"{push_data_path}.zst")
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB.
    # The download is done outside of the `assert` statement so that it still
    # happens when Python runs with -O (which strips assert statements).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )

    db.download(test_scheduling_db, support_files_too=True)

    # First revision of the last push already present in the scheduling DB
    # (None when the DB is empty): everything up to it is skipped below.
    last_node = None
    for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
        last_node = revs[0]

    def generate_failing_together_probabilities(push_data):
        """Compute and store how often couples of tasks fail together.

        For every couple of tasks seen in the same push, the "support"
        (pushes where both failed / pushes where both appear) and the
        "confidence" (pushes where both failed / pushes where at least one
        failed) are computed. Couples with a confidence of at least 0.7 are
        written to the failing together DB as two packed floats.
        """
        # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
        # `task2 failure -> task1 failure` separately, as they could be different.

        count_runs = collections.Counter()
        count_single_failures = collections.Counter()
        count_both_failures = collections.Counter()

        for _, tasks, possible_regressions, likely_regressions in tqdm(push_data):
            failures = set(possible_regressions + likely_regressions)
            all_tasks = set(tasks) | failures

            # Sorting makes the (task1, task2) key independent from the order
            # in which the tasks appear in a given push.
            for couple in itertools.combinations(sorted(all_tasks), 2):
                task1, task2 = couple
                count_runs[couple] += 1

                if task1 in failures:
                    if task2 in failures:
                        count_both_failures[couple] += 1
                    else:
                        count_single_failures[couple] += 1
                elif task2 in failures:
                    count_single_failures[couple] += 1

        stats = {}

        skipped = 0

        for couple, run_count in count_runs.most_common():
            failure_count = count_both_failures[couple]
            support = failure_count / run_count

            if support < 1 / 700:
                skipped += 1
                continue

            # Passing the support threshold implies failure_count > 0, so the
            # denominator below can never be zero.
            confidence = failure_count / (
                count_single_failures[couple] + failure_count
            )

            stats[couple] = (support, confidence)

        logger.info(f"{skipped} couples skipped because their support was too low")

        rankings = (
            (
                "Redundancies with the highest support and confidence:",
                lambda k: (-k[1][1], -k[1][0]),
            ),
            (
                "Redundancies with the highest confidence and lowest support:",
                lambda k: (-k[1][1], k[1][0]),
            ),
        )
        for title, sort_key in rankings:
            logger.info(title)
            for couple, (support, confidence) in sorted(
                stats.items(), key=sort_key
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

        failing_together = test_scheduling.get_failing_together_db()
        count_redundancies = collections.Counter()
        for couple, (support, confidence) in stats.items():
            # The thresholds are inclusive, matching both the ">=" labels and
            # the storage cut-off below.
            if confidence == 1.0:
                count_redundancies["==100%"] += 1
            if confidence >= 0.9:
                count_redundancies[">=90%"] += 1
            if confidence >= 0.8:
                count_redundancies[">=80%"] += 1
            if confidence >= 0.7:
                count_redundancies[">=70%"] += 1

            if confidence < 0.7:
                continue

            failing_together[
                f"{couple[0]}${couple[1]}".encode("utf-8")
            ] = struct.pack("ff", support, confidence)

        for percentage, count in count_redundancies.most_common():
            logger.info(f"{count} with {percentage} confidence")

        test_scheduling.close_failing_together_db()

    def generate_all_data():
        """Yield one ``{"revs": ..., "data": [...]}`` dict per new, recent push.

        Pushes are always fed to ``test_scheduling.generate_data`` (which is
        given the past failures DB), but only pushes newer than
        HISTORY_DATE_START are yielded.
        """
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = {
            runnable
            for _, push_runnables, _, _ in push_data[-28:]
            for runnable in push_runnables
        }
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # Rebuild the set from the *filtered* list, so the count logged below
        # and the per-push filtering both use the runnables we actually keep.
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        push_data = [
            (
                revisions,
                filter_runnables(push_tasks, all_runnables_set, granularity),
                filter_runnables(
                    possible_regressions, all_runnables_set, granularity
                ),
                filter_runnables(
                    likely_regressions, all_runnables_set, granularity
                ),
            )
            for revisions, push_tasks, possible_regressions, likely_regressions in push_data
        ]

        if granularity == "label":
            generate_failing_together_probabilities(push_data)

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)

        # Pushes are consumed from the front so that each one can be released
        # as soon as it has been handled; a deque makes that O(1) per push.
        push_data = collections.deque(push_data)

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.popleft()

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # We consider only the runnables which run in this push, and the possible and likely regressions
            # from this push. We can't consider all runnables because we can't be sure that a task that didn't
            # run on a push would have been successful.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )

            if not runnables_to_consider:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])
            should_save = pushdate > HISTORY_DATE_START

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            result = {
                "revs": revisions,
                "data": [],
            }
            # The generator is always exhausted, even for pushes that are too
            # old to be saved, since it is handed the past failures DB.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if should_save:
                    result["data"].append(data)

            if should_save:
                saved_nodes.add(i)
                yield result

        if granularity == "group":
            # Sending None signals the end of the pushes to the generator,
            # which is then expected to terminate.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    # The archive names end with ".tar.zst"; the archived path is the same
    # name without that suffix.
    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])

    if granularity == "label":
        with open_tar_zst(failing_together_db) as tar:
            tar.add(failing_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(self):
    """Incrementally regenerate the test scheduling history DB.

    The push data dump is downloaded and decompressed if it is not already
    on disk, the commits DB and the existing scheduling DB are downloaded,
    and every push that comes after the last push already stored in the
    scheduling DB is turned into training rows which are appended to that
    DB. The scheduling DB is then compressed and the past failures DB is
    archived.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL)
        zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file exists"

    # Get the commits DB.
    # The download is done outside of the `assert` statement so that it still
    # happens when Python runs with -O (which strips assert statements).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

    # First revision of the last row already present in the scheduling DB
    # (None when the DB is empty): everything up to it is skipped below.
    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history():
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the training rows of every new push that is recent enough.

        Pushes are always fed to ``test_scheduling.generate_data`` (which is
        given the past failures DB), but only rows from pushes newer than
        HISTORY_DATE_START are yielded.
        """
        past_failures = test_scheduling.get_past_failures()

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        # The first element of the dump is dropped (presumably a header row;
        # verify against the producer of push_data.json).
        with open("push_data.json", "r") as f:
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = {
            task
            for _, push_tasks, _, _ in push_data[-28:]
            for task in push_tasks
        }
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_tasks"] = all_tasks
        # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for i in tqdm(range(len(push_data))):
            # Pushes are removed from the list as they are consumed, so that
            # their data can be released while the generator is running.
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if not tasks_to_consider:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])
            should_save = pushdate > HISTORY_DATE_START

            # The generator is always exhausted, even for pushes that are too
            # old to be saved, since it is handed the past failures DB.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                tasks_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if should_save:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")
def generate_test_scheduling_history(self):
    """Incrementally regenerate the test scheduling history DB.

    The push data dump is downloaded and decompressed if it is not already
    on disk and loaded as a ``revision -> (tasks, possible regressions,
    likely regressions)`` mapping. Commits from the commits DB that come
    after the last revision already stored in the scheduling DB are then
    walked, and one row per (commit, considered task) is appended to the
    scheduling DB with the past failure counts of that task. The counts are
    persisted in the ``data/past_failures.shelve`` shelf.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    # Revision -> (all tasks, possible regressions, likely regressions)
    # The first element of the dump is dropped (presumably a header row;
    # verify against the producer of push_data.json). The parsed list is
    # deliberately not kept in a variable: holding on to it would keep every
    # row alive and defeat the per-node `del push_data[node]` done below.
    with open("push_data.json", "r") as f:
        push_data = {
            row[0]: (row[1], row[2], row[3]) for row in json.load(f)[1:]
        }

    logger.info(f"push data nodes: {len(push_data)}")

    HISTORICAL_TIMESPAN = 56

    # Revision of the last row already present in the scheduling DB. It stays
    # None when the DB has to be recreated or when it contains no rows yet.
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["rev"]

    past_failures = shelve.open(
        "data/past_failures.shelve",
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return the past failure counts of ``task`` and record a new failure.

        For each item, the value stored for ``push_num`` is read, together
        with its difference from the values stored 7, 14, 28 and 56 pushes
        earlier. When ``is_regression`` is true, the stored value for
        ``push_num`` is then incremented by one.

        Returns:
            A ``(total, past 7, past 14, past 28, past 56)`` tuple, each
            element being the sum over all ``items``.
        """
        total = prev_7 = prev_14 = prev_28 = prev_56 = 0

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            total += value
            prev_7 += value - cur[push_num - 7]
            prev_14 += value - cur[push_num - 14]
            prev_28 += value - cur[push_num - 28]
            prev_56 += value - cur[push_num - 56]

            if is_regression:
                cur[push_num] = value + 1

        return (total, prev_7, prev_14, prev_28, prev_56)

    def generate_data():
        """Yield one row per (commit, considered task) for recent commits.

        The past failure counts are updated for every new commit that has
        push data, but rows are only yielded for commits pushed after
        HISTORY_DATE_START.
        """
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node == last_node:
                can_start = True
                continue

            if not can_start:
                continue

            if node not in push_data:
                continue

            # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
            # This is checked only for commits that are actually processed:
            # skipped commits never touch the shelf, so syncing for them
            # would be wasted work.
            if len(commits_with_data) % 1000 == 0:
                past_failures.sync()

            commits_with_data.add(node)

            push_tasks, possible_regressions, likely_regressions = push_data[node]
            # Sets make the per-task membership tests below O(1).
            possible_regressions = set(possible_regressions)
            likely_regressions = set(likely_regressions)

            # The push date is a property of the commit, so it is parsed once
            # here rather than once per task.
            should_save = (
                dateutil.parser.parse(commit_data["pushdate"]) > HISTORY_DATE_START
            )

            # (past failures key prefix, output key suffix, items of the commit)
            feature_groups = (
                ("all", "", ["all"]),
                ("type", "_in_types", commit_data["types"]),
                ("file", "_in_files", commit_data["files"]),
                ("directory", "_in_directories", commit_data["directories"]),
                ("component", "_in_components", commit_data["components"]),
            )

            for task in push_tasks:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_possible_regression = task in possible_regressions
                is_likely_regression = task in likely_regressions
                is_regression = is_possible_regression or is_likely_regression

                # The counts must be updated even when the row is not saved,
                # so that later pushes see the complete failure history.
                group_counts = [
                    get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )
                    for type_, _, items in feature_groups
                ]

                if not should_save:
                    continue

                saved_nodes.add(node)

                row = {"rev": node, "name": task}
                for (_, suffix, _), counts in zip(feature_groups, group_counts):
                    total, prev_7, prev_14, prev_28, prev_56 = counts
                    row[f"failures{suffix}"] = total
                    row[f"failures_past_7_pushes{suffix}"] = prev_7
                    row[f"failures_past_14_pushes{suffix}"] = prev_14
                    row[f"failures_past_28_pushes{suffix}"] = prev_28
                    row[f"failures_past_56_pushes{suffix}"] = prev_56
                row["is_possible_regression"] = is_possible_regression
                row["is_likely_regression"] = is_likely_regression

                yield row

            # We no longer need the push data for this node, we can free the memory.
            del push_data[node]

            push_num += 1

        logger.info(f"commits linked to push data: {len(commits_with_data)}")

        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    zstd_compress("data/past_failures.shelve")