def __init__(self, lemmatization=False):
    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_features.BugExtractor([], cleanup_functions)),
            (
                "union",
                ColumnTransformer([("text", self.text_vectorizer(), "text")]),
            ),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
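# Usage sketch (an assumption, not from the original source): the extraction
# pipeline above is a regular sklearn Pipeline, so training reduces to
# something like
#     X = self.extraction_pipeline.fit_transform(couples)
#     self.clf.fit(X, y)
# where `couples` and `y` are produced by the base BugCoupleModel.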
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    feature_extractors = [
        bug_features.is_same_product(),
        bug_features.is_same_component(),
        bug_features.is_same_platform(),
        bug_features.is_same_version(),
        bug_features.is_same_os(),
        bug_features.is_same_target_milestone(),
        bug_features.is_first_affected_same(),
        bug_features.couple_common_words_comments(),
        bug_features.couple_delta_creation_date(),
        bug_features.couple_common_keywords(),
        bug_features.couple_common_whiteboard_keywords(),
        bug_features.couple_common_words_summary(),
    ]

    cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    if cleanup_urls:
        cleanup_functions.append(feature_cleanup.url())

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors, cleanup_functions, rollback=True
                ),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("text", self.text_vectorizer(), "text"),
                        ("couple_data", DictVectorizer(), "couple_data"),
                    ]
                ),
            ),
        ]
    )

    self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
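# Minimal sketch (assumption, not in the original source): each
# feature_cleanup.* entry is a plain callable mapping raw text to cleaned
# text, so the configured list can be folded over any comment string.
text = "Crash at 0x7ffee4a2 >quoted reply\nsee https://example.com/report"
for cleanup in cleanup_functions:
    text = cleanup(text)
# `text` now has hex addresses, quoted replies and (if cleanup_urls is set)
# URLs normalized into placeholder tokens.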
def __init__(self, cleanup_urls=True):
    self.cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())
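# Hypothetical helper (not part of the original snippet) showing how the
# configured cleanup_functions are typically applied, in order:
def _clean(self, text):
    for cleanup in self.cleanup_functions:
        text = cleanup(text)
    return text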
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    self.cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())

    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold
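# Hypothetical sketch (assumption): nltk_tokenizer would toggle between
# NLTK's word_tokenize and a plain split wherever the preprocessor
# tokenizes text, along the lines of
#     from nltk.tokenize import word_tokenize
#     tokens = word_tokenize(text) if self.nltk_tokenizer else text.split()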
def test_responses():
    tests = [
        (
            "A response can be of the form>This is the comment\n",
            "A response can be of the form \n",
        ),
        (
            "Multiline responses can be>This is line 1\n>This is line2\n end of response",
            "Multiline responses can be \n \n end of response",
        ),
        (
            "Code snippet example is > + import bugbug\n",
            "Code snippet example is \n",
        ),
        (
            "Random responses >this is line1\n>this is line2\n>this is the final line",
            "Random responses \n \n ",
        ),
    ]
    for orig_text, cleaned_text in tests:
        assert feature_cleanup.responses()(orig_text) == cleaned_text
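# A minimal sketch (assumption) of the same table-driven style covering
# chained cleanups, e.g. responses() followed by url():
def test_chained_cleanups():
    text = ">quoted reply\nsee https://example.com\n"
    for cleanup in [feature_cleanup.responses(), feature_cleanup.url()]:
        text = cleanup(text)
    # assumes url() replaces the literal URL with a placeholder token
    assert "https://example.com" not in text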
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    feature_extractors = [bug_features.is_same_product()]

    cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    if cleanup_urls:
        cleanup_functions.append(feature_cleanup.url())

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors, cleanup_functions),
            ),
            (
                "union",
                ColumnTransformer([("text", self.text_vectorizer(), "text")]),
            ),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(
    self,
    cleanup_urls=True,
    nltk_tokenizer=False,
    confidence_threshold=0.8,
    end_to_end=False,
):
    self.cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())

    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold
    self.duplicatemodel = (
        DuplicateModel.load("duplicatemodel") if end_to_end else None
    )
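# Hypothetical downstream use (assumption): with end_to_end enabled, the
# loaded model's probability output would be gated on confidence_threshold,
# along the lines of
#     probs = self.duplicatemodel.classify(couples, probabilities=True)
#     is_duplicate = probs[:, 1] >= self.confidence_threshold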
try:
    import nltk
    from gensim import models, similarities
    from gensim.corpora import Dictionary
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
except ImportError:
    raise ImportError(OPT_MSG_MISSING)

nltk.download("stopwords")

REPORTERS_TO_IGNORE = {"*****@*****.**", "*****@*****.**"}

cleanup_functions = [
    feature_cleanup.responses(),
    feature_cleanup.hex(),
    feature_cleanup.dll(),
    feature_cleanup.fileref(),
    feature_cleanup.url(),
    feature_cleanup.synonyms(),
    feature_cleanup.crash(),
]

# A map from bug ID to its duplicate IDs
duplicates = defaultdict(set)

all_ids = set(
    bug["id"]
    for bug in bugzilla.get_bugs()
    if bug["creator"] not in REPORTERS_TO_IGNORE and "dupeme" not in bug["keywords"]
)

for bug in bugzilla.get_bugs():
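    # Hypothetical loop body (assumption; the original snippet is truncated
    # here): a symmetric duplicates map built from each bug's "dupe_of" field.
    if bug["dupe_of"] in all_ids:
        duplicates[bug["id"]].add(bug["dupe_of"])
        duplicates[bug["dupe_of"]].add(bug["id"])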