def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline and the XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    # Per-bug features collected by the extractor.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.priority(),
        bug_features.bug_reporter(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]

    bug_extractor = bug_features.BugExtractor(
        feature_extractors,
        cleanup_functions,
        rollback=True,
        rollback_when=self.rollback,
    )

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(stop_words="english"), column)
        for column in ("title", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline and the XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    # These keywords would completely skew the ML model, so they are passed
    # to the keywords extractor to be ignored (they are going to be used as
    # 100% rules in the evaluation phase instead).
    ignored_keywords = {"regression", "talos-regression", "feature"}

    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(ignored_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_synonyms,
    ]

    bug_extractor = bug_features.BugExtractor(feature_extractors, cleanup_functions)

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(stop_words="english"), column)
        for column in ("title", "first_comment", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the commit/test-scheduling pipeline and the XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.required_dbs = [
        repository.COMMITS_DB,
        test_scheduling.TEST_SCHEDULING_DB,
    ]

    # Training configuration flags.
    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    # Commit-level features come first, then the test-scheduling ones.
    feature_extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        test_scheduling_features.name(),
        test_scheduling_features.platform(),
        test_scheduling_features.chunk(),
        test_scheduling_features.suite(),
        test_scheduling_features.prev_failures(),
    ]

    # No cleanup functions are passed to the commit extractor.
    commit_extractor = commit_features.CommitExtractor(feature_extractors, [])
    data_union = ColumnTransformer([("data", DictVectorizer(), "data")])

    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_extractor),
            ("union", data_union),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline and the XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    # Training configuration flags.
    self.cross_validation_enabled = False
    self.calculate_importance = False

    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]

    bug_extractor = bug_features.BugExtractor(
        feature_extractors, cleanup_functions, rollback=True
    )

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(min_df=0.0001), column)
        for column in ("title", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def go(self, model_name: str) -> None:
    """Download and load a model, run its check, and exit with 1 on failure.

    Args:
        model_name: Name passed to ``download_model`` to fetch the model.
    """
    model = Model.load(download_model(model_name))

    # Nothing more to do when the model's own check passes.
    if model.check():
        return

    logger.warning(
        f"Check of model {model.__class__!r} failed, check the output for reasons why"
    )
    sys.exit(1)
def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline, sampler and XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"qawanted"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]

    bug_extractor = bug_features.BugExtractor(feature_extractors, cleanup_functions)

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance (default settings) for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(), column) for column in ("title", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline, sampler and XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.sampler = InstanceHardnessThreshold(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_hex,
        bug_features.cleanup_dll,
        bug_features.cleanup_synonyms,
        bug_features.cleanup_crash,
    ]

    bug_extractor = bug_features.BugExtractor(
        feature_extractors,
        cleanup_functions,
        rollback=True,
        rollback_when=self.rollback,
    )

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(min_df=0.0001), column)
        for column in ("title", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
    """Set up databases, feature pipeline and classifier for a granularity.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
        granularity: One of "label", "group" or "config_group"; selects the
            scheduling database and the extra feature extractors. Any other
            value adds neither.
        failures_skip: Stored as-is on ``self.failures_skip``.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.failures_skip = failures_skip

    # The commits DB is always needed for training; its evaluation entry
    # maps to the commits and commit experiences DBs.
    self.training_dbs = [repository.COMMITS_DB]
    self.eval_dbs[repository.COMMITS_DB] = (
        repository.COMMITS_DB,
        repository.COMMIT_EXPERIENCES_DB,
    )

    # Register the scheduling DB matching the requested granularity.
    if granularity == "label":
        scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif granularity == "group":
        scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )
    elif granularity == "config_group":
        scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    # Training configuration flags.
    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    # Previous failures are always used; the rest depends on granularity.
    feature_extractors = [test_scheduling_features.prev_failures()]
    if granularity == "label":
        feature_extractors.extend(
            [
                test_scheduling_features.platform(),
                # test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        )
    elif granularity in ("group", "config_group"):
        feature_extractors.extend(
            [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]
        )

    # No cleanup functions are passed to the commit extractor.
    commit_extractor = commit_features.CommitExtractor(feature_extractors, [])
    data_union = ColumnTransformer([("data", DictVectorizer(), "data")])

    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_extractor),
            ("union", data_union),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the bug feature pipeline, sampler and XGBoost classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    # These keywords would completely skew the ML model, so they are passed
    # to the keywords extractor to be ignored (they are going to be used as
    # 100% rules in the evaluation phase instead).
    ignored_keywords = {"regression", "talos-regression", "feature"}

    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(ignored_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    # Text cleanup functions handed to the extractor, in this order.
    cleanup_functions = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_synonyms,
    ]

    bug_extractor = bug_features.BugExtractor(feature_extractors, cleanup_functions)

    # One dict vectorizer for the structured data, plus an independent text
    # vectorizer instance for every textual column.
    transformers = [("data", DictVectorizer(), "data")]
    transformers += [
        (column, self.text_vectorizer(min_df=0.001), column)
        for column in ("title", "first_comment", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_extractor),
            ("union", ColumnTransformer(transformers)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
MODELS_NAMES = [ "defectenhancementtask", "component", "regression", "stepstoreproduce", "spambug", "testlabelselect", "testgroupselect", ] DEFAULT_EXPIRATION_TTL = 7 * 24 * 3600 # A week redis = Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost/0")) MODEL_CACHE: ReadthroughTTLCache[str, Model] = ReadthroughTTLCache( timedelta(hours=1), lambda m: Model.load(f"{m}model") ) MODEL_CACHE.start_ttl_thread() cctx = zstandard.ZstdCompressor(level=10) def setkey(key: str, value: bytes, compress: bool = False) -> None: LOGGER.debug(f"Storing data at {key}: {value!r}") if compress: value = cctx.compress(value) redis.set(key, value) redis.expire(key, DEFAULT_EXPIRATION_TTL) def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -> str:
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    """Load the model and download the data needed to classify commits.

    Args:
        model_name: Name of the model to download and load. "regressor" and
            "testlabelselect" trigger additional model-specific downloads.
        repo_dir: Stored as-is on ``self.repo_dir``.
        git_repo_dir: When truthy, mozilla-central is cloned into this
            directory.
        method_defect_predictor_dir: When truthy, MethodDefectPredictor is
            cloned into this directory at a pinned revision.
        use_single_process: Stored as-is on ``self.use_single_process``.
        skip_feature_importance: Stored as-is on
            ``self.skip_feature_importance``.

    Raises:
        AssertionError: If the model cannot be loaded or an expected
            downloaded file is missing.
    """
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        # Fetch the X dataset; only decompress again when the remote
        # archive changed.
        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        # Same for the y dataset.
        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        # NOTE(review): pickle.load executes arbitrary code if the
        # downloaded artifact is tampered with; this relies on the download
        # source being trusted.
        with open(model_data_X_path, "rb") as fb:
            self.X = to_array(pickle.load(fb))

        with open(model_data_y_path, "rb") as fb:
            self.y = to_array(pickle.load(fb))

        past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "r") as f:
            self.past_bugs_by_function = json.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True

        # Perform the download outside of the assert statement: asserts are
        # stripped under ``python -O``, which would silently skip the
        # download itself.
        downloaded = db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        assert downloaded

        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        self.testfailure_model = cast(
            TestFailureModel, TestFailureModel.load(download_model("testfailure"))
        )
        assert self.testfailure_model is not None
def __init__(self, lemmatization=False, granularity="label", use_subset=False):
    """Set up databases, feature pipeline and classifier for a granularity.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
        granularity: "label" or "group"; selects the scheduling database
            and the extra feature extractors. Any other value adds neither.
        use_subset: Stored on ``self.use_subset`` (see the comment below).
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity

    # This is useful for development purposes, it avoids using too much memory
    # by using a subset of the dataset (dropping some passing runnables).
    self.use_subset = use_subset

    # The commits DB is always required, plus the scheduling DB matching
    # the requested granularity.
    self.required_dbs = [repository.COMMITS_DB]
    if granularity == "label":
        self.required_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
    elif granularity == "group":
        self.required_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)

    # Training configuration flags.
    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    # Features shared by every granularity.
    feature_extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        test_scheduling_features.name(),
        test_scheduling_features.prev_failures(),
    ]

    # Granularity-specific features.
    if granularity == "label":
        feature_extractors.extend(
            [
                test_scheduling_features.platform(),
                test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        )
    elif granularity == "group":
        feature_extractors.extend(
            [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]
        )

    # No cleanup functions are passed to the commit extractor.
    commit_extractor = commit_features.CommitExtractor(feature_extractors, [])
    data_union = ColumnTransformer([("data", DictVectorizer(), "data")])

    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_extractor),
            ("union", data_union),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")