def __init__(self, lemmatization=False, historical=False):
    """Build the feature-extraction pipeline and the one-vs-rest classifier.

    Args:
        lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        historical: Accepted by the signature; not read inside this method.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Note carried over from the original: these keywords would completely
    # skew the ML, so they are applied as 100% rules in the evaluation phase.
    # NOTE(review): presumably the set handed to ``keywords`` is the list to
    # leave out of extraction -- confirm against ``bug_features.keywords``.
    rule_keywords = set(KEYWORD_DICT.keys())

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(rule_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = bug_features.BugExtractor(extractors, cleaners)

    # One dict-vectorized column, then one text vectorizer per text column.
    column_specs = [("data", DictVectorizer(), "data")]
    for column in ("title", "first_comment", "comments"):
        column_specs.append(
            (column, self.text_vectorizer(min_df=0.001), column)
        )

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", extractor_step),
            ("union", ColumnTransformer(column_specs)),
        ]
    )

    base_estimator = xgboost.XGBClassifier(n_jobs=16)
    self.clf = OneVsRestClassifier(base_estimator)
def __init__(self, lemmatization=False):
    """Configure the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    # Note carried over from the original: these keywords would completely
    # skew the ML, so they are applied as 100% rules in the evaluation phase.
    # NOTE(review): presumably the set handed to ``keywords`` is the list to
    # leave out of extraction -- confirm against ``bug_features.keywords``.
    rule_keywords = {'regression', 'talos-regression', 'feature'}

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(rule_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    # The cleanup helpers are passed as references here, not called.
    cleaners = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_synonyms,
    ]

    extractor_step = bug_features.BugExtractor(extractors, cleaners)

    # One dict-vectorized column followed by the three text columns, each
    # with its own freshly created text vectorizer.
    text_columns = ('title', 'first_comment', 'comments')
    column_specs = [('data', DictVectorizer(), 'data')]
    column_specs.extend(
        (column, self.text_vectorizer(min_df=0.001), column)
        for column in text_columns
    )

    self.extraction_pipeline = Pipeline([
        ('bug_extractor', extractor_step),
        ('union', ColumnTransformer(column_specs)),
    ])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    # Explicitly select the 'cpu_predictor' setting on the classifier.
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False, historical=False,
             rca_subcategories_enabled=False):
    """Set up RCA label lists, the extraction pipeline and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        historical: Accepted by the signature; not read inside this method.
        rca_subcategories_enabled: When true, the label types are
            ``RCA_SUBCATEGORIES + RCA_CATEGORIES``; otherwise only
            ``RCA_CATEGORIES`` is used.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.rca_subcategories_enabled = rca_subcategories_enabled

    # Should we consider only the main category or all sub categories?
    if rca_subcategories_enabled:
        self.RCA_TYPES = RCA_SUBCATEGORIES + RCA_CATEGORIES
    else:
        self.RCA_TYPES = RCA_CATEGORIES

    # De-duplicated, sorted view of the label types.
    unique_types = set(self.RCA_TYPES)
    self.RCA_LIST = sorted(unique_types)

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        # The whiteboard extractor is intentionally left out: per the
        # original note, whiteboards would make the ML completely skewed.
        bug_features.patches(),
        bug_features.landings(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = bug_features.BugExtractor(extractors, cleaners)

    # One dict-vectorized column, then one text vectorizer per text column.
    column_specs = [("data", DictVectorizer(), "data")]
    for column in ("title", "first_comment", "comments"):
        vectorizer = self.text_vectorizer(min_df=0.001)
        column_specs.append((column, vectorizer, column))

    union_step = ColumnTransformer(column_specs)

    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor_step), ("union", union_step)]
    )

    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False, historical=False):
    """Configure the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        historical: When truthy, the ``had_severity_enhancement`` extractor
            is added after the standard extractors.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    # Note carried over from the original: these keywords would completely
    # skew the ML, so they are applied as 100% rules in the evaluation phase.
    # NOTE(review): presumably the set handed to ``keywords`` is the list to
    # leave out of extraction -- confirm against ``bug_features.keywords``.
    rule_keywords = {"regression", "talos-regression", "feature"}

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(rule_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    # The extra extractor is only built when historical mode is requested.
    if historical:
        extractors.append(bug_features.had_severity_enhancement())

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = bug_features.BugExtractor(extractors, cleaners)

    # One dict-vectorized column followed by the three text columns, each
    # with its own freshly created text vectorizer.
    column_specs = [("data", DictVectorizer(), "data")]
    column_specs += [
        (column, self.text_vectorizer(min_df=0.001), column)
        for column in ("title", "first_comment", "comments")
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", extractor_step),
            ("union", ColumnTransformer(column_specs)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    # Explicitly select the "cpu_predictor" setting on the classifier.
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Configure the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        historical: Accepted by the signature; not read inside this method.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    # Note carried over from the original: these keywords would completely
    # skew the ML, so they are applied as 100% rules in the evaluation phase.
    # NOTE(review): presumably the set handed to ``keywords`` is the list to
    # leave out of extraction -- confirm against ``bug_features.keywords``.
    rule_keywords = set(keyword_dict.keys())

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(rule_keywords),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = bug_features.BugExtractor(extractors, cleaners)

    # One dict-vectorized column, then one text vectorizer per text column.
    column_specs = [("data", DictVectorizer(), "data")]
    for column in ("title", "first_comment", "comments"):
        column_specs.append(
            (column, self.text_vectorizer(min_df=0.001), column)
        )

    steps = [
        ("bug_extractor", extractor_step),
        ("union", ColumnTransformer(column_specs)),
    ]
    self.extraction_pipeline = Pipeline(steps)

    base_estimator = xgboost.XGBClassifier(n_jobs=16)
    self.clf = OneVsRestClassifier(base_estimator)