def get_labels(self):
    classes = {}

    for bug_data in bugzilla.get_bugs():
        bug_id = int(bug_data["id"])

        for entry in bug_data["history"]:
            for change in entry["changes"]:
                # Bugs that get dev-doc-needed removed from them at some point after it's been added (this suggests a false positive among human-analyzed bugs)
                if (
                    change["field_name"] == "keywords"
                    and "dev-doc-needed" in change["removed"]
                    and "dev-doc-complete" not in change["added"]
                ):
                    classes[bug_id] = 0
                # Bugs that go from dev-doc-needed to dev-doc-complete are guaranteed to be good
                # Bugs that go from not having dev-doc-needed to having dev-doc-complete are bugs
                # that were missed by previous scans through content but someone realized it
                # should have been flagged and updated the docs, found the docs already updated.
                elif change["field_name"] == "keywords" and any(
                    keyword in change["added"]
                    for keyword in ["dev-doc-needed", "dev-doc-complete"]
                ):
                    classes[bug_id] = 1

        if bug_id not in classes:
            classes[bug_id] = 0

    return classes, [0, 1]
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): if bug_data["assigned_to_detail"]["email"] in ADDRESSES_TO_EXCLUDE: continue bug_id = int(bug_data["id"]) classes[bug_id] = bug_data["assigned_to_detail"]["email"] assignee_counts = Counter(classes.values()).most_common() top_assignees = set( assignee for assignee, count in assignee_counts if count > MINIMUM_ASSIGNMENTS ) print(f"{len(top_assignees)} assignees") for assignee, count in assignee_counts: print(f"{assignee}: {count}") classes = { bug_id: assignee for bug_id, assignee in classes.items() if assignee in top_assignees } return classes, set(classes.values())
def get_labels(self): classes = {} for bug_id, category in labels.get_labels("tracking"): assert category in ["True", "False"], f"unexpected category {category}" classes[int(bug_id)] = 1 if category == "True" else 0 for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) for entry in bug_data["history"]: for change in entry["changes"]: if change["field_name"].startswith("cf_tracking_firefox"): if change["added"] in ["blocking", "+"]: classes[bug_id] = 1 elif change["added"] == "-": classes[bug_id] = 0 if bug_data["resolution"] in ["INVALID", "DUPLICATE"]: continue if bug_id not in classes: classes[bug_id] = 0 return classes, [0, 1]
def test_rollback(): model = QANeededModel() histories = {} for bug in bugzilla.get_bugs(): histories[int(bug["id"])] = bug["history"] def rollback_point(bug_id): count = 0 for history in histories[bug_id]: for change in history["changes"]: if model.rollback(change): return count count += 1 return count assert ( rollback_point(1390433) == 35 ), "A bug field should start with qawanted or qe-verify" assert ( rollback_point(1389136) == 9 ), "A bug field should start with qawanted or qe-verify" assert rollback_point(1388990) == 29 assert rollback_point(1389223) == 8
def get_labels(self): product_components = {} for bug_data in bugzilla.get_bugs(): product_components[bug_data["id"]] = ( bug_data["product"], bug_data["component"], ) self.meaningful_product_components = self.get_meaningful_product_components( ( (product, component) for product, component in product_components.values() if self.is_meaningful(product, component) ) ) classes = {} for bug_id, (product, component) in product_components.items(): component = self.filter_component(product, component) if component: classes[bug_id] = component component_counts = Counter(classes.values()).most_common() top_components = set(component for component, count in component_counts) print(f"{len(top_components)} components") for component, count in component_counts: print(f"{component}: {count}") # Assert there is at least one bug for each conflated component. for conflated_component in self.CONFLATED_COMPONENTS: assert any( conflated_component == component for component, count in component_counts ), f"There should be at least one bug matching {conflated_component}*" # Assert there is at least one bug for each component the conflated components are mapped to. for conflated_component_mapping in self.CONFLATED_COMPONENTS_MAPPING.values(): assert any( conflated_component_mapping == f"{product}::{component}" for product, component in product_components.values() ), f"There should be at least one bug in {conflated_component_mapping}" # Assert all conflated components are either in conflated_components_mapping or exist as components. for conflated_component in self.CONFLATED_COMPONENTS: assert conflated_component in self.CONFLATED_COMPONENTS_MAPPING or any( conflated_component == f"{product}::{component}" for product, component in product_components.values() ), f"It should be possible to map {conflated_component}" classes = { bug_id: component for bug_id, component in classes.items() if component in top_components } return classes, set(classes.values())
def get_labels(self):
    classes = {}

    keyword_list = list(set(keyword_dict.values()))

    for bug_data in bugzilla.get_bugs():
        target = np.zeros(len(keyword_list))

        for keyword in bug_data["keywords"]:
            target[keyword_list.index(keyword_dict[keyword])] = 1

        classes[int(bug_data["id"])] = target

    return classes, keyword_list
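# Illustrative sketch, not part of the original code: a worked example of the
# multi-label encoding above, using a hypothetical keyword_dict that groups raw
# Bugzilla keywords into coarser classes.
import numpy as np

keyword_dict = {"crash": "crash", "topcrash": "crash", "perf": "performance"}
keyword_list = list(set(keyword_dict.values()))

# A bug carrying the "topcrash" keyword gets a 1 only in the "crash" slot.
target = np.zeros(len(keyword_list))
for keyword in ["topcrash"]:
    target[keyword_list.index(keyword_dict[keyword])] = 1

assert target[keyword_list.index("crash")] == 1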
def get_inconsistencies(find_all=False):
    inconsistencies = []

    for bug in bugzilla.get_bugs():
        try:
            rollback(bug, None, False, find_all)
        except Exception as e:
            print(bug["id"])
            print(e)
            inconsistencies.append(bug["id"])

    return inconsistencies
def eval_tracking(self):
    results = []

    model = TrackingModel.load('trackingmodel')

    for bug in bugzilla.get_bugs():
        if self.is_tracking_decision_made(bug):
            continue

        if model.classify(bug)[0] == 1:
            results.append(bug['id'])

    with open('tracking.json', 'w') as f:
        json.dump(results, f)
def eval_regression(self):
    results = {}

    model = RegressionModel.load('regressionmodel')

    for bug in bugzilla.get_bugs():
        if self.is_regression(bug):
            results[bug['id']] = True
        elif self.is_feature(bug):
            results[bug['id']] = False
        else:
            results[bug['id']] = True if model.classify(bug)[0] == 1 else False

    with open('regression.json', 'w') as f:
        json.dump(results, f)
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) for entry in bug_data["history"]: for change in entry["changes"]: if any( change["added"].startswith(label) for label in ["qawanted", "qe-verify", "qaurgent"] ): classes[bug_id] = 1 if bug_id not in classes: classes[bug_id] = 0 return classes, [0, 1]
def __init__(
    self,
    k=10,
    vectorizer=TfidfVectorizer(),
    cleanup_urls=True,
    nltk_tokenizer=False,
):
    super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
    self.vectorizer = vectorizer
    self.similarity_calculator = NearestNeighbors(n_neighbors=k)

    text = []
    self.bug_ids = []

    for bug in bugzilla.get_bugs():
        text.append(self.text_preprocess(self.get_text(bug), join=True))
        self.bug_ids.append(bug["id"])

    self.vectorizer.fit(text)
    self.similarity_calculator.fit(self.vectorizer.transform(text))
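# Illustrative sketch, not part of the original class: the __init__ above only
# builds the TF-IDF index, so a query step along these lines (hypothetical method
# name) would be needed to retrieve the k most similar bugs for a given bug.
def get_similar_bug_ids(self, bug):
    # Vectorize the query bug with the already-fitted vectorizer.
    query = self.vectorizer.transform(
        [self.text_preprocess(self.get_text(bug), join=True)]
    )

    # NearestNeighbors.kneighbors returns (distances, indices) into the fitted corpus.
    _, indices = self.similarity_calculator.kneighbors(query)
    return [self.bug_ids[idx] for idx in indices[0]]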
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) for attachment in bug_data["attachments"]: for flag in attachment["flags"]: if not flag["name"].startswith("approval-mozilla-") or flag[ "status" ] not in ["+", "-"]: continue if flag["status"] == "+": classes[bug_id] = 1 elif flag["status"] == "-": classes[bug_id] = 0 return classes, [0, 1]
def get_labels(self):
    classes = {}

    for bug_data in bugzilla.get_bugs():
        bug_id = int(bug_data['id'])

        component = self.filter_component(bug_data)

        if component:
            classes[bug_id] = component

    component_counts = Counter(classes.values()).most_common()
    top_components = set(component for component, count in component_counts)

    print(f'{len(top_components)} components')
    for component, count in component_counts:
        print(f'{component}: {count}')

    return {
        bug_id: component
        for bug_id, component in classes.items()
        if component in top_components
    }
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) for attachment in bug_data["attachments"]: for flag in attachment["flags"]: if not flag["name"].startswith("approval-mozilla-") or flag[ "status" ] not in ["+", "-"]: continue if flag["status"] == "+": classes[bug_id] = 1 elif flag["status"] == "-": classes[bug_id] = 0 return classes
def get_labels(self):
    product_components = {}
    for bug_data in bugzilla.get_bugs():
        product_components[bug_data['id']] = (bug_data['product'], bug_data['component'])

    def is_meaningful(product, component):
        return product in self.PRODUCTS and component not in ['General', 'Untriaged']

    product_component_counts = Counter(
        (product, component)
        for product, component in product_components.values()
        if is_meaningful(product, component)
    ).most_common()

    max_count = product_component_counts[0][1]
    threshold = max_count / 100

    self.meaningful_product_components = set(
        product_component
        for product_component, count in product_component_counts
        if count > threshold
    )

    classes = {}
    for bug_id, (product, component) in product_components.items():
        component = self.filter_component(product, component)

        if component:
            classes[bug_id] = component

    component_counts = Counter(classes.values()).most_common()
    top_components = set(component for component, count in component_counts)

    print(f'{len(top_components)} components')
    for component, count in component_counts:
        print(f'{component}: {count}')

    # Assert there is at least one bug for each conflated component.
    for conflated_component in self.CONFLATED_COMPONENTS:
        assert any(
            conflated_component == component
            for component, count in component_counts
        ), f'There should be at least one bug matching {conflated_component}*'

    # Assert there is at least one bug for each component the conflated components are mapped to.
    for conflated_component_mapping in self.CONFLATED_COMPONENTS_MAPPING.values():
        assert any(
            conflated_component_mapping == f'{product}::{component}'
            for product, component in product_components.values()
        ), f'There should be at least one bug in {conflated_component_mapping}'

    # Assert all conflated components are either in conflated_components_mapping or exist as components.
    for conflated_component in self.CONFLATED_COMPONENTS:
        assert conflated_component in self.CONFLATED_COMPONENTS_MAPPING or any(
            conflated_component == f'{product}::{component}'
            for product, component in product_components.values()
        ), f'It should be possible to map {conflated_component}'

    return {
        bug_id: component
        for bug_id, component in classes.items()
        if component in top_components
    }
def get_labels(self):
    classes = {}

    for bug_data in bugzilla.get_bugs():
        bug_id = int(bug_data['id'])

        for entry in bug_data['history']:
            for change in entry['changes']:
                if change['added'].startswith('qawanted'):
                    classes[bug_id] = 1
                elif 'flags' in entry:
                    for flag in entry['flags']:
                        if flag['name'].startswith('qe-verify'):
                            classes[bug_id] = 1

        if bug_id not in classes:
            classes[bug_id] = 0

    return classes
def get_labels(self):
    classes = {}

    for bug_data in bugzilla.get_bugs():
        bug_id = int(bug_data['id'])

        for attachment in bug_data['attachments']:
            for flag in attachment['flags']:
                if (
                    not flag['name'].startswith('approval-mozilla-')
                    or flag['status'] not in ['+', '-']
                ):
                    continue

                if flag['status'] == '+':
                    classes[bug_id] = 1
                elif flag['status'] == '-':
                    classes[bug_id] = 0

    return classes
def get_labels(self): classes = {} for bug_id, category in labels.get_labels("tracking"): assert category in ["True", "False"], f"unexpected category {category}" classes[int(bug_id)] = 1 if category == "True" else 0 for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) flag_found = False tracking_flags = [ flag for flag in bug_data.keys() if flag.startswith("cf_tracking_firefox") ] for tracking_flag in tracking_flags: if bug_data[tracking_flag] in ["blocking", "+"]: classes[bug_id] = 1 flag_found = True elif bug_data[tracking_flag] == "-": classes[bug_id] = 0 flag_found = True if not flag_found: for entry in bug_data["history"]: for change in entry["changes"]: if change["field_name"].startswith( "cf_tracking_firefox"): if change["added"] in ["blocking", "+"]: classes[bug_id] = 1 elif change["added"] == "-": classes[bug_id] = 0 if bug_data["resolution"] in ["INVALID", "DUPLICATE"]: continue if bug_id not in classes: classes[bug_id] = 0 return classes, [0, 1]
def classify_bugs(model_name, classifier):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    for bug in bugzilla.get_bugs():
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            feature_names = model.get_human_readable_feature_names()

            model.print_feature_importances(
                importance["importances"], feature_names, class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")

        input()
def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []
    self.bug_ids = []

    for bug in bugzilla.get_bugs():
        self.corpus.append(self.text_preprocess(self.get_text(bug)))
        self.bug_ids.append(bug["id"])

    indexes = list(range(len(self.corpus)))
    random.shuffle(indexes)
    self.corpus = [self.corpus[idx] for idx in indexes]
    self.bug_ids = [self.bug_ids[idx] for idx in indexes]

    self.model = BM25(self.corpus)
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(include_invalid=True): bug_id = bug_data["id"] # Legitimate bugs if bug_data["resolution"] == "FIXED": classes[bug_id] = 0 # Spam bugs elif (bug_data["product"] == "Invalid Bugs" and bug_data["component"] == "General"): classes[bug_id] = 1 print("{} bugs are classified as non-spam".format( sum(1 for label in classes.values() if label == 0))) print("{} bugs are classified as spam".format( sum(1 for label in classes.values() if label == 1))) return classes, [0, 1]
def get_labels(self):
    bug_fix_times = []

    for bug in bugzilla.get_bugs():
        fix_time = bug_features.get_time_to_fix(bug)
        if fix_time is None:
            continue

        bug_fix_times.append((bug["id"], fix_time))

    def _quantiles(n):
        return statistics.quantiles(
            (fix_time for bug_id, fix_time in bug_fix_times), n=n
        )

    quantiles = _quantiles(2)

    logger.info(
        f"Max fix time: {max(fix_time for bug_id, fix_time in bug_fix_times)}"
    )
    logger.info(f"Fix time quantiles: {quantiles}")
    logger.info(f"Fix time quartiles: {_quantiles(4)}")
    logger.info(f"Fix time deciles: {_quantiles(10)}")

    classes = {}
    for bug_id, fix_time in bug_fix_times:
        for i, quantile in enumerate(quantiles):
            if fix_time <= quantile:
                classes[bug_id] = i
                break

        if bug_id not in classes:
            classes[bug_id] = i + 1

    for i in range(len(quantiles) + 1):
        logger.info(
            f"{sum(1 for label in classes.values() if label == i)} bugs are in the {i}th quantile"
        )

    return classes, list(range(len(quantiles) + 1))
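# Worked micro-example, not part of the original code, of the quantile bucketing
# above (illustrative fix times, not real data). With n=2, statistics.quantiles
# returns a single cut point (the median), so bugs fall into two classes.
import statistics

fix_times = [1.0, 2.0, 3.0, 10.0, 20.0, 100.0]
cut_points = statistics.quantiles(fix_times, n=2)  # [6.5]

labels = {}
for fix_time in fix_times:
    # Class 0 for fix times up to the median, class 1 above it.
    labels[fix_time] = 0 if fix_time <= cut_points[0] else 1

assert labels[3.0] == 0 and labels[100.0] == 1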
def get_landed_and_filed_since(self, days: int) -> List[int]:
    since = datetime.utcnow() - timedelta(days=days)

    commits = [
        commit
        for commit in repository.get_commits()
        if dateutil.parser.parse(commit["pushdate"]) >= since and commit["bug_id"]
    ]

    bug_ids = set(commit["bug_id"] for commit in commits)
    bug_ids.update(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None) >= since
        and bug["resolution"]
        not in [
            "INVALID",
            "WONTFIX",
            "INACTIVE",
            "DUPLICATE",
            "INCOMPLETE",
            "MOVED",
        ]
    )

    return list(bug_ids)
def get_similar_bugs(self, bug):
    similar_bug_ids = self.search_similar_bugs(bug)

    if self.duplicatemodel:
        similar_bugs = [
            bug for bug in bugzilla.get_bugs() if bug["id"] in similar_bug_ids
        ]

        bug_couples = [(bug, similar_bug) for similar_bug in similar_bugs]
        probs_bug_couples = sorted(
            zip(
                self.duplicatemodel.classify(bug_couples, probabilities=True),
                bug_couples,
            ),
            key=lambda v: -v[0][1],
        )

        similar_bug_ids = [
            similar_bug["id"]
            for prob, (bug, similar_bug) in probs_bug_couples
            if prob[1] > self.confidence_threshold
        ]

    return similar_bug_ids
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) found_dev_doc = False if any( keyword in bug_data["keywords"] for keyword in ["dev-doc-needed", "dev-doc-complete"] ): classes[bug_id] = 1 found_dev_doc = True if not found_dev_doc: for entry in bug_data["history"]: for change in entry["changes"]: # Bugs that get dev-doc-needed removed from them at some point after it's been added (this suggests a false positive among human-analyzed bugs) if ( change["field_name"] == "keywords" and "dev-doc-needed" in change["removed"] and "dev-doc-complete" not in change["added"] ): classes[bug_id] = 0 # Bugs that go from dev-doc-needed to dev-doc-complete are guaranteed to be good # Bugs that go from not having dev-doc-needed to having dev-doc-complete are bugs # that were missed by previous scans through content but someone realized it # should have been flagged and updated the docs, found the docs already updated. elif change["field_name"] == "keywords" and any( keyword in change["added"] for keyword in ["dev-doc-needed", "dev-doc-complete"] ): classes[bug_id] = 1 if bug_id not in classes: classes[bug_id] = 0 return classes, [0, 1]
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []

    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assign unique integer IDs to all words.
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Convert the corpus to bag-of-words vectors.
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initialize and apply the TF-IDF transformation model on the same corpus;
    # the resulting corpus has the same dimensions.
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform the TF-IDF corpus to a latent 300-dimensional space via Latent Semantic Indexing.
    self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, num_topics=300)
    corpus_lsi = self.lsi[corpus_tfidf]

    # Index the corpus.
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
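# Illustrative sketch, not part of the original class: how the LSI index built
# above might be queried for a new bug (hypothetical method name). Note that
# __init__ keeps the dictionary, the LSI model, and the index on self, but not
# the TF-IDF model, so this sketch applies LSI directly to the bag-of-words query.
def search_similar_bugs(self, bug):
    query_bow = self.dictionary.doc2bow(self.text_preprocess(self.get_text(bug)))
    query_lsi = self.lsi[query_bow]

    # The Similarity index returns one cosine similarity per indexed document.
    sims = self.index[query_lsi]

    # Pair scores with bug IDs and return the best matches first.
    ranked = sorted(
        zip((bug_id for bug_id, _ in self.corpus), sims), key=lambda x: -x[1]
    )
    return [bug_id for bug_id, score in ranked[:10]]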
def go(self, days: int) -> None:
    bugs = self.get_landed_and_filed_since(days)

    meta_bugs = self.get_blocking_of(self.get_meta_bugs(days))

    bugs += meta_bugs.keys()
    bugs += sum(meta_bugs.values(), [])
    bugs = list(set(bugs))

    test_infos = self.retrieve_test_info(days)
    test_info_bugs: List[int] = [
        bug["id"] for test_info in test_infos.values() for bug in test_info["bugs"]
    ]

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs + test_info_bugs)

    logger.info(f"{len(bugs)} bugs to analyze.")

    bugs_set = set(bugs + test_info_bugs)

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        # Only add to the map bugs we are interested in, and bugs that block other bugs (needed for the bug_to_types call).
        if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
            bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs, meta_bugs)

    self.generate_component_connections(bug_map, bugs)

    self.generate_component_test_stats(bug_map, test_infos)
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): if "cf_has_str" in bug_data: if bug_data["cf_has_str"] == "no": classes[int(bug_data["id"])] = 0 elif bug_data["cf_has_str"] == "yes": classes[int(bug_data["id"])] = 1 elif "stepswanted" in bug_data["keywords"]: classes[int(bug_data["id"])] = 0 else: for entry in bug_data["history"]: for change in entry["changes"]: if change["removed"].startswith("stepswanted"): classes[int(bug_data["id"])] = 1 print("{} bugs have no steps to reproduce".format( sum(1 for label in classes.values() if label == 0))) print("{} bugs have steps to reproduce".format( sum(1 for label in classes.values() if label == 1))) return classes, [0, 1]
def test_rollback(): model = QANeededModel() histories = {} for bug in bugzilla.get_bugs(): histories[int(bug["id"])] = bug["history"] def rollback_point(bug_id): count = 0 for history in histories[bug_id]: for change in history["changes"]: if model.rollback(change): return count count += 1 return count assert (rollback_point(1390433) == 35 ), "A bug field should start with qawanted or qe-verify" assert (rollback_point(1389136) == 9 ), "A bug field should start with qawanted or qe-verify" assert rollback_point(1388990) == 29 assert rollback_point(1389223) == 8
def get_labels(self):
    classes = {}

    for bug_data in bugzilla.get_bugs():
        if bug_data['assigned_to_detail']['email'] in ADDRESSES_TO_EXCLUDE:
            continue

        bug_id = int(bug_data['id'])
        classes[bug_id] = bug_data['assigned_to_detail']['email']

    assignee_counts = Counter(classes.values()).most_common()
    top_assignees = set(
        assignee
        for assignee, count in assignee_counts
        if count > MINIMUM_ASSIGNMENTS
    )

    print(f'{len(top_assignees)} assignees')
    for assignee, count in assignee_counts:
        print(f'{assignee}: {count}')

    return {
        bug_id: assignee
        for bug_id, assignee in classes.items()
        if assignee in top_assignees
    }
def evaluation(self):
    total_r = 0
    hits_r = 0
    total_p = 0
    hits_p = 0

    for bug in bugzilla.get_bugs():
        if duplicates[bug["id"]]:
            similar_bugs = self.get_similar_bugs(bug)

            # Recall
            for item in duplicates[bug["id"]]:
                total_r += 1
                if item in similar_bugs:
                    hits_r += 1

            # Precision
            for element in similar_bugs:
                total_p += 1
                if element in duplicates[bug["id"]]:
                    hits_p += 1

    print(f"Recall: {hits_r/total_r * 100}%")
    print(f"Precision: {hits_p/total_p * 100}%")
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) found_qa = False if any( keyword.startswith(label) for keyword in bug_data["keywords"] for label in ["qawanted", "qe-verify", "qaurgent"]): classes[bug_id] = 1 found_qa = True if not found_qa: for entry in bug_data["history"]: for change in entry["changes"]: if any(change["added"].startswith(label) for label in ["qawanted", "qe-verify", "qaurgent"]): classes[bug_id] = 1 if bug_id not in classes: classes[bug_id] = 0 return classes, [0, 1]
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(include_invalid=True): bug_id = bug_data["id"] # Skip bugs filed by Mozillians, since we are sure they are not spam. if "@mozilla" in bug_data["creator"]: continue # Legitimate bugs if bug_data["resolution"] == "FIXED": classes[bug_id] = 0 # Spam bugs elif bug_data["product"] == "Invalid Bugs": classes[bug_id] = 1 print("{} bugs are classified as non-spam".format( sum(1 for label in classes.values() if label == 0))) print("{} bugs are classified as spam".format( sum(1 for label in classes.values() if label == 1))) return classes, [0, 1]
def main(args):
    if args.algorithm == "elasticsearch":
        model = similarity.model_name_to_class[args.algorithm]()
    else:
        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

        if not os.path.exists(model_file_name):
            logger.info(f"{model_file_name} does not exist. Downloading the model....")
            try:
                download_check_etag(URL.format(model_file_name))
            except requests.HTTPError:
                logger.error(
                    "A pre-trained model is not available, you will need to train it yourself using the trainer script"
                )
                raise SystemExit(1)

            zstd_decompress(model_file_name)
            assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

        model = similarity.model_name_to_class[args.algorithm].load(
            f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
        )

    bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

    bugs = {}
    for bug in bugzilla.get_bugs():
        if bug["id"] in bug_ids or bug["id"] == args.bug_id:
            bugs[bug["id"]] = bug

    print("{}: {}".format(args.bug_id, bugs[args.bug_id]["summary"]))
    for bug_id in bug_ids:
        print("{}: {}".format(bug_id, bugs[bug_id]["summary"]))
def get_labels(self): classes = {} for bug_data in bugzilla.get_bugs(): bug_id = int(bug_data["id"]) if "regressionwindow-wanted" in bug_data["keywords"]: classes[bug_id] = 0 elif "cf_has_regression_range" in bug_data: if bug_data["cf_has_regression_range"] == "yes": classes[bug_id] = 1 elif bug_data["cf_has_regression_range"] == "no": classes[bug_id] = 0 print( "{} bugs have regression range".format( sum(1 for label in classes.values() if label == 1) ) ) print( "{} bugs don't have a regression range".format( sum(1 for label in classes.values() if label == 0) ) ) return classes, [0, 1]
def bugs():
    return (bug for bug in bugzilla.get_bugs() if bug['id'] in classes)
def get_inconsistencies(bugs):
    inconsistencies = []

    for bug in bugs:
        try:
            rollback(bug, do_assert=True)
        except Exception as e:
            print(bug["id"])
            print(e)
            inconsistencies.append(bug)

    return inconsistencies


if __name__ == "__main__":
    import argparse

    from tqdm import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", help="Verbose mode", action="store_true")
    args = parser.parse_args()

    for bug in tqdm(bugzilla.get_bugs()):
        if args.verbose:
            print(bug["id"])

        rollback(bug, do_assert=True)
elif args.goal == "regressionrange": from bugbug.models.regression import RegressionModel model = RegressionModel.load("regressionmodel") file_path = os.path.join("bugbug", "labels", f"{args.goal}.csv") with open(file_path, "r") as f: reader = csv.reader(f) next(reader) labeled_comments = [(int(r[0]), int(r[1]), r[2]) for r in reader] already_done = set((c[0], c[1]) for c in labeled_comments) bugs = [] for bug in bugzilla.get_bugs(): # For the str and regressionrange problems, we don't care about test failures, if ( "intermittent-failure" in bug["keywords"] or "stockwell" in bug["whiteboard"] or "permafail" in bug["summary"].lower() ): continue # bugs filed from Socorro, if ( "this bug was filed from the socorro interface" in bug["comments"][0]["text"].lower() ): continue
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst"
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"

        print(f"{pred_class} {probability}")
        input()
def get_bugbug_labels(self, kind="bug"): assert kind in ["bug", "regression", "defect_enhancement_task"] classes = {} for bug_id, category in labels.get_labels("bug_nobug"): assert category in ["True", "False"], f"unexpected category {category}" if kind == "bug": classes[int(bug_id)] = 1 if category == "True" else 0 elif kind == "regression": if category == "False": classes[int(bug_id)] = 0 elif kind == "defect_enhancement_task": if category == "True": classes[int(bug_id)] = "defect" for bug_id, category in labels.get_labels("regression_bug_nobug"): assert category in [ "nobug", "bug_unknown_regression", "bug_no_regression", "regression", ], f"unexpected category {category}" if kind == "bug": classes[int(bug_id)] = 1 if category != "nobug" else 0 elif kind == "regression": if category == "bug_unknown_regression": continue classes[int(bug_id)] = 1 if category == "regression" else 0 elif kind == "defect_enhancement_task": if category != "nobug": classes[int(bug_id)] = "defect" defect_enhancement_task_e = { bug_id: category for bug_id, category in labels.get_labels("defect_enhancement_task_e") } defect_enhancement_task_p = { bug_id: category for bug_id, category in labels.get_labels("defect_enhancement_task_p") } defect_enhancement_task_s = { bug_id: category for bug_id, category in labels.get_labels("defect_enhancement_task_s") } defect_enhancement_task_h = { bug_id: category for bug_id, category in labels.get_labels("defect_enhancement_task_h") } defect_enhancement_task_common = ( (bug_id, category) for bug_id, category in defect_enhancement_task_p.items() if ( bug_id not in defect_enhancement_task_e or defect_enhancement_task_e[bug_id] == defect_enhancement_task_p[bug_id] ) and ( bug_id not in defect_enhancement_task_s or defect_enhancement_task_s[bug_id] == defect_enhancement_task_p[bug_id] ) and ( bug_id not in defect_enhancement_task_h or defect_enhancement_task_h[bug_id] == defect_enhancement_task_p[bug_id] ) ) for bug_id, category in itertools.chain( labels.get_labels("defect_enhancement_task"), defect_enhancement_task_common ): assert category in ["defect", "enhancement", "task"] if kind == "bug": classes[int(bug_id)] = 1 if category == "defect" else 0 elif kind == "regression": if category in ["enhancement", "task"]: classes[int(bug_id)] = 0 elif kind == "defect_enhancement_task": classes[int(bug_id)] = category # Augment labes by using bugs marked as 'regression' or 'feature', as they are basically labelled. # And also use the new bug type field. bug_ids = set() for bug in bugzilla.get_bugs(): bug_id = int(bug["id"]) bug_ids.add(bug_id) if bug_id in classes: continue if any( keyword in bug["keywords"] for keyword in ["regression", "talos-regression"] ) or ( "cf_has_regression_range" in bug and bug["cf_has_regression_range"] == "yes" ): if kind in ["bug", "regression"]: classes[bug_id] = 1 else: classes[bug_id] = "defect" elif any(keyword in bug["keywords"] for keyword in ["feature"]): if kind in ["bug", "regression"]: classes[bug_id] = 0 else: classes[bug_id] = "enhancement" elif kind == "regression": for history in bug["history"]: for change in history["changes"]: if change["field_name"] == "keywords": if "regression" in change["removed"].split(","): classes[bug_id] = 0 elif "regression" in change["added"].split(","): classes[bug_id] = 1 # The conditions to use the 'defect' type are more restricted. 
can_use_type = False can_use_defect_type = False # We can use the type as a label for all bugs after the migration (https://bugzilla.mozilla.org/show_bug.cgi?id=1524738), if they are not defects. if bug["id"] > 1_540_807: can_use_type = True # And we can use the type as a label for bugs whose type has been modified. # For 'defects', we can't use them as labels unless resulting from a change, because bugs are filed by default as 'defect' and so they could be mistakes. if not can_use_type or bug["type"] == "defect": for history in bug["history"]: for change in history["changes"]: if change["field_name"] == "type": can_use_type = can_use_defect_type = True if can_use_type: if bug["type"] == "enhancement": if kind == "bug": classes[int(bug_id)] = 0 elif kind == "regression": classes[int(bug_id)] = 0 elif kind == "defect_enhancement_task": classes[int(bug_id)] = "enhancement" elif bug["type"] == "task": if kind == "bug": classes[int(bug_id)] = 0 elif kind == "regression": classes[int(bug_id)] = 0 elif kind == "defect_enhancement_task": classes[int(bug_id)] = "task" elif bug["type"] == "defect" and can_use_defect_type: if kind == "bug": classes[int(bug_id)] = 1 elif kind == "defect_enhancement_task": classes[int(bug_id)] = "defect" # Remove labels which belong to bugs for which we have no data. return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
def evaluation(self):
    # A map from bug ID to its duplicate IDs
    duplicates = defaultdict(set)

    all_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if bug["creator"] not in REPORTERS_TO_IGNORE
        and "dupeme" not in bug["keywords"]
    )

    for bug in bugzilla.get_bugs():
        dupes = [entry for entry in bug["duplicates"] if entry in all_ids]
        if bug["dupe_of"] in all_ids:
            dupes.append(bug["dupe_of"])

        duplicates[bug["id"]].update(dupes)
        for dupe in dupes:
            duplicates[dupe].add(bug["id"])

    total_r = 0
    hits_r = 0
    total_p = 0
    hits_p = 0

    recall_rate_1 = 0
    recall_rate_5 = 0
    recall_rate_10 = 0
    precision_rate_1 = 0
    precision_rate_5 = 0
    precision_rate_10 = 0

    queries = 0
    apk = []
    for bug in tqdm(bugzilla.get_bugs()):
        if duplicates[bug["id"]]:
            score = 0
            num_hits = 0
            queries += 1
            similar_bugs = self.get_similar_bugs(bug)[:10]

            # Recall
            for idx, item in enumerate(duplicates[bug["id"]]):
                total_r += 1
                if item in similar_bugs:
                    hits_r += 1
                    if idx == 0:
                        recall_rate_1 += 1
                    if idx < 5:
                        recall_rate_5 += 1
                    if idx < 10:
                        recall_rate_10 += 1

            # Precision
            for idx, element in enumerate(similar_bugs):
                total_p += 1
                if element in duplicates[bug["id"]]:
                    hits_p += 1
                    if idx == 0:
                        precision_rate_1 += 1
                    if idx < 5:
                        precision_rate_5 += 1 / 5
                    if idx < 10:
                        precision_rate_10 += 1 / 10

                    num_hits += 1
                    score += num_hits / (idx + 1)

            apk.append(score / min(len(duplicates[bug["id"]]), 10))

    print(f"Recall @ 1: {recall_rate_1/total_r * 100}%")
    print(f"Recall @ 5: {recall_rate_5/total_r * 100}%")
    print(f"Recall @ 10: {recall_rate_10/total_r * 100}%")
    print(f"Precision @ 1: {precision_rate_1/queries * 100}%")
    print(f"Precision @ 5: {precision_rate_5/queries * 100}%")
    print(f"Precision @ 10: {precision_rate_10/queries * 100}%")
    print(f"Recall: {hits_r/total_r * 100}%")
    print(f"Precision: {hits_p/total_p * 100}%")
    print(f"MAP@k : {np.mean(apk) * 100}%")
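# Reference sketch, not part of the original code: the per-query term of the
# MAP@k metric accumulated incrementally in the loop above, written as a
# standalone function using the standard average-precision-at-k definition.
def average_precision_at_k(retrieved, relevant, k=10):
    score = 0.0
    num_hits = 0
    for idx, item in enumerate(retrieved[:k]):
        if item in relevant:
            num_hits += 1
            # Precision at this cut-off, counted only at relevant positions.
            score += num_hits / (idx + 1)
    return score / min(len(relevant), k)

# Example: two of three relevant items retrieved, at ranks 1 and 3.
assert average_precision_at_k(["a", "x", "b"], {"a", "b", "c"}) == (1 / 1 + 2 / 3) / 3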
    assert len(bug['comments']) >= 1

    return bug


def get_inconsistencies(find_all=False):
    inconsistencies = []

    for bug in bugzilla.get_bugs():
        try:
            rollback(bug, None, False, find_all)
        except Exception as e:
            print(bug['id'])
            print(e)
            inconsistencies.append(bug['id'])

    return inconsistencies


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', help='Verbose mode', action='store_true')
    args = parser.parse_args()

    for i, bug in enumerate(bugzilla.get_bugs()):
        if args.verbose:
            print(bug['id'])
            print(i)

        rollback(bug, None, False)
    return bug


def get_inconsistencies(find_all=False):
    inconsistencies = []

    for bug in bugzilla.get_bugs():
        try:
            rollback(bug, None, False, find_all)
        except Exception as e:
            print(bug["id"])
            print(e)
            inconsistencies.append(bug["id"])

    return inconsistencies


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", help="Verbose mode", action="store_true")
    args = parser.parse_args()

    for i, bug in enumerate(bugzilla.get_bugs()):
        if args.verbose:
            print(bug["id"])
            print(i)

        rollback(bug, None, False)
def go(self, bugs: List[int], meta_bugs: Optional[List[int]] = None) -> None:
    if meta_bugs is not None:
        bugs += meta_bugs + self.get_blocking_of(meta_bugs)

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs)

    component_team_mapping = bugzilla.get_component_team_mapping()

    bugs_set = set(bugs)

    commits = [
        commit
        for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    commit_map = {commit["node"]: commit for commit in commits}
    hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

    logger.info(f"{len(commits)} commits to analyze.")

    logger.info(f"{len(bugs_set)} bugs to analyze.")

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    logger.info("Retrieve Phabricator revisions linked to commits...")
    revision_ids = set(
        filter(None, (repository.get_revision_id(commit) for commit in commits))
    )

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    if meta_bugs is not None:
        blocker_to_meta = collections.defaultdict(set)
        for meta_bug in meta_bugs:
            if meta_bug not in bug_map:
                continue

            for blocker_bug_id in bugzilla.find_blocking(
                bug_map, bug_map[meta_bug]
            ):
                blocker_to_meta[blocker_bug_id].add(meta_bug)

    def _download_past_bugs(url: str) -> dict:
        path = os.path.join("data", os.path.basename(url)[:-4])
        download_check_etag(url, path=f"{path}.zst")
        zstd_decompress(path)
        assert os.path.exists(path)
        with open(path, "r") as f:
            return json.load(f)

    past_regressions_by = {}
    past_fixed_bugs_by = {}
    past_regression_blocked_bugs_by = {}
    past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )

    path_to_component = repository.get_component_mapping()

    def get_full_component(bug):
        return "{}::{}".format(bug["product"], bug["component"])

    def histogram(components: List[str]) -> Dict[str, float]:
        counter = collections.Counter(components)
        return {
            component: count / len(components)
            for component, count in counter.most_common()
        }

    def component_histogram(bugs: List[dict]) -> Dict[str, float]:
        return histogram([bug["component"] for bug in bugs])

    def find_risk_band(risk: float) -> str:
        for name, start, end in self.risk_bands:
            if start <= risk <= end:
                return name

        assert False

    def get_prev_bugs(
        past_bugs_by: dict,
        commit: repository.CommitDict,
        component: str = None,
    ) -> List[dict]:
        paths = [
            path
            for path in commit["files"]
            if component is None
            or (
                path.encode("utf-8") in path_to_component
                and path_to_component[path.encode("utf-8")]
                == component.encode("utf-8")
            )
        ]

        past_bugs = []

        for path, f_group in commit["functions"].items():
            if path not in paths:
                continue

            if path not in past_bugs_by["function"]:
                continue

            found = False
            for f in f_group:
                if f[0] not in past_bugs_by["function"][path]:
                    continue

                found = True
                past_bugs += past_bugs_by["function"][path][f[0]]

            if found:
                paths.remove(path)

        for path in paths:
            if path in past_bugs_by["file"]:
                past_bugs += past_bugs_by["file"][path]
                paths.remove(path)

        for path, directories in zip(paths, repository.get_directories(paths)):
            found = False
            for directory in directories:
                if directory in past_bugs_by["directory"]:
                    found = True
                    past_bugs += past_bugs_by["directory"][directory]

            if found:
                paths.remove(path)

        components = [
            path_to_component[path.encode("utf-8")].tobytes().decode("utf-8")
            for path in paths
            if path.encode("utf-8") in path_to_component
        ]

        for component in components:
            if component in past_bugs_by["component"]:
                past_bugs += past_bugs_by["component"][component]

        return past_bugs

    def get_prev_bugs_stats(
        commit_group: dict,
        commit_list: List[repository.CommitDict],
        component: str = None,
    ) -> None:
        # Find previous regressions that occurred in the same files as those touched by these commits.
        # And find previous bugs that were fixed by touching the same files as these commits.
        # And find previous bugs that were blocked by regressions that occurred in the same files as those touched by these commits.
        # And find previous bugs that were blocked by bugs that were fixed by touching the same files as those touched by these commits.
        prev_regressions: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_regressions_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_fixed_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_fixed_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_regression_blocked_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_regression_blocked_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_fixed_bug_blocked_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )

        prev_regressions = _deduplicate(prev_regressions)
        prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
        prev_regression_blocked_bugs = _deduplicate(prev_regression_blocked_bugs)
        prev_fixed_bug_blocked_bugs = _deduplicate(prev_fixed_bug_blocked_bugs)

        regression_components = component_histogram(prev_regressions)
        fixed_bugs_components = component_histogram(prev_fixed_bugs)
        regression_blocked_bug_components = component_histogram(
            prev_regression_blocked_bugs
        )
        fixed_bug_blocked_bug_components = component_histogram(
            prev_fixed_bug_blocked_bugs
        )

        commit_group["most_common_regression_components"] = regression_components
        # These are only used for component connections for the time being.
        if component:
            commit_group["prev_regressions"] = prev_regressions[-3:]
            commit_group["prev_fixed_bugs"] = prev_fixed_bugs[-3:]
            commit_group["prev_regression_blocked_bugs"] = prev_regression_blocked_bugs[-3:]
            commit_group["prev_fixed_bug_blocked_bugs"] = prev_fixed_bug_blocked_bugs[-3:]
            commit_group["most_common_fixed_bugs_components"] = fixed_bugs_components
            commit_group["most_common_regression_blocked_bug_components"] = regression_blocked_bug_components
            commit_group["most_common_fixed_bug_blocked_bug_components"] = fixed_bug_blocked_bug_components

    def get_commit_data(commit_list: List[repository.CommitDict]) -> List[dict]:
        if len(commit_list) == 0:
            return []

        # Evaluate the risk of commits associated with this bug.
        probs = self.regressor_model.classify(commit_list, probabilities=True)

        commits_data = []
        for i, commit in enumerate(commit_list):
            revision_id = repository.get_revision_id(commit)
            if revision_id in revision_map:
                testing = phabricator.get_testing_project(revision_map[revision_id])
                if testing is None:
                    testing = "missing"
            else:
                testing = None

            commits_data.append(
                {
                    "id": commit["node"],
                    "testing": testing,
                    "risk": float(probs[i][1]),
                    "backedout": bool(commit["backedoutby"]),
                    "author": commit["author_email"],
                    "reviewers": commit["reviewers"],
                    "coverage": [
                        commit["cov_added"],
                        commit["cov_covered"],
                        commit["cov_unknown"],
                    ],
                }
            )

        return commits_data

    # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
    commits.sort(key=lambda x: x["bug_id"])

    bug_to_commits = {}
    for bug_id, commit_iter in itertools.groupby(commits, lambda x: x["bug_id"]):
        # TODO: Figure out what to do with bugs we couldn't download (security bugs).
        if bug_id not in bug_map:
            continue

        bug_to_commits[bug_id] = sorted(
            commit_iter, key=lambda x: hash_to_rev[x["node"]]
        )

    bug_summaries = []
    for bug_id in bugs:
        if bug_id not in bug_map:
            continue

        commit_list = bug_to_commits.get(bug_id, [])
        commit_data = get_commit_data(commit_list)

        bug = bug_map[bug_id]

        bug_summary = {
            "id": bug_id,
            "regressor": bug_id in regressor_bug_ids,
            "regression": len(bug["regressed_by"]) > 0
            or any(
                keyword in bug["keywords"]
                for keyword in ["regression", "talos-regression"]
            )
            or (
                "cf_has_regression_range" in bug
                and bug["cf_has_regression_range"] == "yes"
            ),
            "whiteboard": bug["whiteboard"],
            "assignee": bug["assigned_to"]
            if bug["assigned_to"] != "*****@*****.**"
            else None,
            "versions": bugzilla.get_fixed_versions(bug),
            "component": get_full_component(bug),
            "team": bugzilla.component_to_team(
                component_team_mapping, bug["product"], bug["component"]
            ),
            "summary": bug["summary"],
            "types": bug_to_types(bug),
            "severity": bug["severity"],
            "creation_date": dateutil.parser.parse(bug["creation_time"]).strftime(
                "%Y-%m-%d"
            ),
            "date": max(
                dateutil.parser.parse(commit["pushdate"]) for commit in commit_list
            ).strftime("%Y-%m-%d")
            if len(commit_list) > 0
            else None,
            "commits": commit_data,
            "meta_ids": list(blocker_to_meta[bug_id]),
            "risk_band": find_risk_band(
                max(commit["risk"] for commit in commit_data)
            )
            if len(commit_data) > 0
            else None,
        }

        get_prev_bugs_stats(bug_summary, commit_list)

        bug_summaries.append(bug_summary)

    landings_by_date = collections.defaultdict(list)
    for bug_summary in bug_summaries:
        landings_by_date[bug_summary["creation_date"]].append(bug_summary)

    with open("landings_by_date.json", "w") as f:
        output: dict = {
            "summaries": landings_by_date,
        }
        if meta_bugs is not None:
            output["featureMetaBugs"] = [
                {"id": meta_bug, "summary": bug_map[meta_bug]["summary"]}
                for meta_bug in meta_bugs
            ]

        json.dump(output, f)

    # Retrieve components of test failures that occurred when landing patches to fix bugs in specific components.
    component_failures = collections.defaultdict(list)

    push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
        "group"
    )

    for revisions, _, _, possible_regressions, likely_regressions in tqdm(
        push_data_iter(), total=push_data_count
    ):
        commit_list = [
            commit_map[revision] for revision in revisions if revision in commit_map
        ]
        if len(commit_list) == 0:
            continue

        commit_bugs = [
            bug_map[commit["bug_id"]]
            for commit in commit_list
            if commit["bug_id"] in bug_map
        ]

        components = list(set(get_full_component(bug) for bug in commit_bugs))

        groups = [
            group
            for group in list(set(possible_regressions + likely_regressions))
            if group.encode("utf-8") in path_to_component
        ]

        for group in groups:
            for component in components:
                component_failures[component].append(
                    path_to_component[group.encode("utf-8")].tobytes().decode("utf-8")
                )

    # Filter out commits for which we have no bugs.
    commits = [commit for commit in commits if commit["bug_id"] in bug_map]

    # Sort commits by bug component, so we can use itertools.groupby to group them by bug component.
    commits.sort(key=lambda x: get_full_component(bug_map[x["bug_id"]]))

    commit_groups = []
    for component, commit_iter in itertools.groupby(
        commits, lambda x: get_full_component(bug_map[x["bug_id"]])
    ):
        commit_group = {
            "component": component,
            "most_common_test_failure_components": histogram(
                component_failures[component]
            )
            if component in component_failures
            else {},
        }
        get_prev_bugs_stats(commit_group, list(commit_iter), component)
        commit_groups.append(commit_group)

    with open("component_connections.json", "w") as f:
        json.dump(commit_groups, f)

    repository.close_component_mapping()
def test_bug_snapshot():
    for i, bug in enumerate(bugzilla.get_bugs()):
        print(bug["id"])
        print(i)

        rollback(bug, None, False)