def get_landed_and_filed_since(self, days: int) -> List[int]:
    since = datetime.utcnow() - timedelta(days=days)

    commits = []
    last_commit_by_bug: Dict[int, datetime] = {}
    for commit in repository.get_commits():
        if not commit["bug_id"]:
            continue

        push_date = dateutil.parser.parse(commit["pushdate"])

        # Keep a recent commit if its bug is new, if the bug's previous commit
        # landed less than ~3 months earlier, or if the commit touches
        # non-test files.
        if push_date >= since and (
            commit["bug_id"] not in last_commit_by_bug
            or push_date - last_commit_by_bug[commit["bug_id"]] < timedelta(days=91)
            or not all(repository.is_test(p) for p in commit["files"])
        ):
            commits.append(commit)

        last_commit_by_bug[commit["bug_id"]] = push_date

    logger.info(f"Retrieving bug IDs since {days} days ago")
    timespan_ids = bugzilla.get_ids_between(since, resolution=["---", "FIXED"])

    return list(set(commit["bug_id"] for commit in commits) | set(timespan_ids))

def get_landed_and_filed_since(self, days: int) -> List[int]:
    since = datetime.utcnow() - timedelta(days=days)

    commits = [
        commit
        for commit in repository.get_commits()
        if dateutil.parser.parse(commit["pushdate"]) >= since and commit["bug_id"]
    ]

    logger.info(f"Retrieving bug IDs since {days} days ago")
    timespan_ids = bugzilla.get_ids_between(since, datetime.utcnow())
    bugzilla.download_bugs(timespan_ids)

    bug_ids = set(commit["bug_id"] for commit in commits)
    bug_ids.update(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None) >= since
        and bug["resolution"]
        not in [
            "INVALID",
            "WONTFIX",
            "INACTIVE",
            "DUPLICATE",
            "INCOMPLETE",
            "MOVED",
            "WORKSFORME",
        ]
    )

    return list(bug_ids)

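# For orientation, a minimal, hypothetical call site for the helper above.
# Only get_landed_and_filed_since and bugzilla.download_bugs come from the
# snippets themselves; the BugRetriever class name is an assumption.
retriever = BugRetriever()
bug_ids = retriever.get_landed_and_filed_since(90)
bugzilla.download_bugs(bug_ids)
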
def main(args):
    similarity_model = (
        similarity.download_and_load_similarity_model(args.similaritymodel)
        if args.similaritymodel
        else None
    )
    duplicate_model = DuplicateModel.load("duplicatemodel")

    try:
        with open("duplicate_test_bugs.json", "r") as f:
            test_bugs = json.load(f)
    except FileNotFoundError:
        test_bug_ids = bugzilla.get_ids_between(
            datetime.now() - timedelta(days=21), datetime.now()
        )
        test_bugs = bugzilla.get(test_bug_ids)
        test_bugs = [
            bug
            for bug in test_bugs.values()
            if bug["creator"] not in REPORTERS_TO_IGNORE
        ]
        with open("duplicate_test_bugs.json", "w") as f:
            json.dump(test_bugs, f)

    with open("duplicate_predictions.csv", "w") as csvfile:
        spamwriter = csv.writer(csvfile)
        spamwriter.writerow(
            ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
        )

        if similarity_model:
            # Only compare each test bug against its similarity-model neighbours.
            bug_tuples = []
            for test_bug in test_bugs:
                similar_bug_ids = similarity_model.get_similar_bugs(test_bug)
                similar_bugs = bugzilla.get(similar_bug_ids)
                bug_tuples += [
                    (test_bug, similar_bug) for similar_bug in similar_bugs.values()
                ]
        else:
            # Materialize the combinations so they can be iterated twice
            # (once by classify, once by the zip below).
            bug_tuples = list(combinations(test_bugs, 2))

        probs = duplicate_model.classify(bug_tuples, probabilities=True)

        # Without a similarity model there is no model-provided threshold;
        # the 0.8 fallback is an assumed cutoff, not a value from the model.
        threshold = similarity_model.confidence_threshold if similarity_model else 0.8

        for bug_tuple, prob in zip(bug_tuples, probs):
            if prob[1] > threshold:
                spamwriter.writerow(
                    [
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                        bug_tuple[0]["summary"],
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                        bug_tuple[1]["summary"],
                        prob[1],
                    ]
                )

def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)

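# Example invocation of generate_sheet above. This is a sketch: the token
# value is a placeholder, and the model name and threshold are arbitrary
# choices, not values taken from the snippet.
generate_sheet("defect", token="<bugzilla-api-token>", days=7, threshold=0.7)
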
def fetch_untriaged(args):
    from bugbug import bugzilla

    # Set the Bugzilla token and download bugs.
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.get_ids_between(date.today() - timedelta(days=args.days_back))
    bugs = bugzilla.get(bug_ids)

    # Collect bugs that were moved out of the Untriaged component.
    # Note: a bug is appended once per matching history change, so the
    # result can contain duplicates.
    untriaged_bugs = []
    for bug in bugs.values():
        for history in bug["history"]:
            for change in history["changes"]:
                if (
                    change["field_name"] == "component"
                    and change["removed"] == "Untriaged"
                ):
                    untriaged_bugs.append(bug)

    # Note: "%s" (epoch seconds) is a platform-specific strftime directive;
    # it works with glibc but is not guaranteed everywhere.
    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs

def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)

import csv
import itertools
import json
from datetime import datetime, timedelta

from bugbug import bugzilla
from bugbug.models.duplicate import DuplicateModel

m = DuplicateModel.load("duplicatemodel")

REPORTERS_TO_IGNORE = {"*****@*****.**", "*****@*****.**"}

try:
    with open("duplicate_test_bugs.json", "r") as f:
        test_bugs = json.load(f)
except FileNotFoundError:
    test_bug_ids = bugzilla.get_ids_between(
        datetime.now() - timedelta(days=21), datetime.now()
    )
    test_bugs = bugzilla.get(test_bug_ids)
    test_bugs = [
        bug for bug in test_bugs.values() if bug["creator"] not in REPORTERS_TO_IGNORE
    ]
    with open("duplicate_test_bugs.json", "w") as f:
        json.dump(test_bugs, f)

bug_tuples = list(itertools.combinations(test_bugs, 2))

# Warning: Classifying all the test bugs takes a while.
probs = m.classify(bug_tuples, probabilities=True)

with open("duplicate_predictions.csv", "w") as csvfile:
    spamwriter = csv.writer(csvfile)
    # Write one row per candidate pair, mirroring the header/row layout
    # used in the companion script earlier in this section.
    spamwriter.writerow(
        ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
    )
    for bug_tuple, prob in zip(bug_tuples, probs):
        spamwriter.writerow(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                bug_tuple[0]["summary"],
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                bug_tuple[1]["summary"],
                prob[1],
            ]
        )

def retrieve_bugs(self, limit=None):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    if limit:
        regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )
    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )

        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)

        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")

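# The "detect, delete, re-download" loop above recurs in several of these
# retrieval functions. A minimal sketch of how it could be factored out,
# assuming the same bugzilla and bug_snapshot APIs used in the snippets
# (the helper name itself is hypothetical):
def redownload_inconsistent_bugs(max_passes: int = 3) -> None:
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for _ in range(max_passes):
        # Narrow the candidate set to the bugs that are still inconsistent.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)
        if not inconsistent_bug_ids:
            break
        # Drop the stale copies and fetch fresh ones.
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)
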
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier
    )

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
            )

            if model.calculate_importance:
                probas, importance = model.classify(
                    bug, probabilities=True, importances=True
                )

                feature_names = model.get_human_readable_feature_names()

                # Renamed the loop variable so it no longer shadows the
                # "importance" dict it iterates over.
                for i, (imp, index, is_positive) in enumerate(
                    importance["importances"]
                ):
                    print(
                        f"{i + 1}. '{feature_names[int(index)]}' ({'+' if is_positive else '-'}{imp})"
                    )
            else:
                probas = model.classify(bug, probabilities=True, importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")

            input()

    if args.generate_sheet:
        assert (
            args.token is not None
        ), "A Bugzilla token should be set in order to download bugs"

        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bug_ids = bugzilla.get_ids_between(a_week_ago, today)
        bugs = bugzilla.get(bug_ids)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs.values():
            p = model.classify(bug, probabilities=True)
            rows.append(
                [
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                    "y" if p[0][1] >= 0.7 else "n",
                    "",
                    bug["summary"],
                ]
            )

        os.makedirs("sheets", exist_ok=True)
        with open(
            os.path.join(
                "sheets",
                f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)

def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
    )

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )

        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)

        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")

def retrieve_bugs(self, limit: Optional[int] = None) -> None:
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if "{}::{}".format(bug["product"], bug["component"]) not in all_components
    )
    logger.info(f"{len(deleted_component_ids)} bugs belonging to deleted components")
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        )
    )
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions
    # (useful for the regressor model), and blocked bugs.
    regression_related_ids: List[int] = list(
        set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in bugzilla.get_bugs()
                ),
                [],
            )
        )
    )
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"]
        for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow()
        )
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (
        timespan_ids
        + labelled_bug_ids
        + commit_bug_ids
        + regression_related_ids
        + test_failure_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after downloading new bugs).
    for i in range(7):
        regression_related_ids = list(
            set(
                sum(
                    (
                        bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                        for bug in new_bugs
                    ),
                    [],
                )
            )
        )
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all bugs we needed, break.
        if set(regression_related_ids).issubset(all_ids):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(2):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )

        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)

        bugzilla.download_bugs(inconsistent_bug_ids)

    # Delete bugs for which we couldn't retrieve the history, so they are
    # re-downloaded on the next run.
    # TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)

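# Example: a smoke-test run of the retriever with a small limit, mirroring
# what the XXX comment above suggests the integration tests do. The
# Retriever class name is an assumption.
retriever = Retriever()
retriever.retrieve_bugs(limit=500)
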