def __init__(self, lemmatization=False):
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.whiteboard(),
        bug_features.product(),
        # TODO: We would like to use the component at the time of filing too,
        # but we can't because the rollback script doesn't support changes to
        # components yet.
        # bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
        bug_features.priority(),
        bug_features.version(),
        bug_features.target_milestone(),
        bug_features.has_attachment(),
        bug_features.platform(),
        bug_features.op_sys(),
    ]

    cleanup_functions = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors, cleanup_functions, rollback=True
                ),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("data", DictVectorizer(), "data"),
                        ("title", self.text_vectorizer(min_df=0.0001), "title"),
                        (
                            "comments",
                            self.text_vectorizer(min_df=0.0001),
                            "comments",
                        ),
                    ]
                ),
            ),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
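# A minimal, self-contained sketch (not the project's actual training loop,
# which lives in the base Model class) of how the sampler/classifier pair set
# up above is typically used: BorderlineSMOTE oversamples the minority class
# on the already-vectorized features, and the XGBoost classifier is then fit
# on the resampled data. The toy arrays below are purely illustrative
# stand-ins for the output of extraction_pipeline.
import numpy as np
import xgboost
from imblearn.over_sampling import BorderlineSMOTE

rng = np.random.default_rng(0)
X = rng.random((200, 10))           # stand-in for the vectorized bug features
y = np.array([0] * 180 + [1] * 20)  # imbalanced binary labels

sampler = BorderlineSMOTE(random_state=0)
X_res, y_res = sampler.fit_resample(X, y)  # synthesize extra minority samples

clf = xgboost.XGBClassifier(n_jobs=4)
clf.fit(X_res, y_res)
print(clf.predict_proba(X[:5]))  # class probabilities for a few bugs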
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.short_desc_maxlen = 20
    self.short_desc_vocab_size = 25000
    self.short_desc_emb_sz = 300
    self.long_desc_maxlen = 100
    self.long_desc_vocab_size = 25000
    self.long_desc_emb_sz = 300
    self.cross_validation_enabled = False

    self.params = [
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 256,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.5,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 256,
            "long_desc_encoded_dropout": 0.5,
            "long_desc_encoded_recurrent_dropout": 0.55,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 25,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.45,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 50,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.45,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 100,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.5,
            "tfidf_word_dense_units": 600,
            "tfidf_word_dropout_rate": 0.5,
            "tfidf_char_inp_dense_unit": 500,
            "tfidf_char_inp_dropout_rate": 0.5,
            "x_dense_unit": 2000,
            "x_dropout_rate": 0.6,
        },
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 250,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.45,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 250,
            "long_desc_encoded_dropout": 0.45,
            "long_desc_encoded_recurrent_dropout": 0.45,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 30,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.4,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 55,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.4,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 110,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.45,
            "tfidf_word_dense_units": 610,
            "tfidf_word_dropout_rate": 0.45,
            "tfidf_char_inp_dense_unit": 510,
            "tfidf_char_inp_dropout_rate": 0.5,
            "x_dense_unit": 1970,
            "x_dropout_rate": 0.5,
        },
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 266,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.45,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 266,
            "long_desc_encoded_dropout": 0.45,
            "long_desc_encoded_recurrent_dropout": 0.55,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 35,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.45,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 60,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.45,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 120,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.45,
            "tfidf_word_dense_units": 620,
            "tfidf_word_dropout_rate": 0.5,
            "tfidf_char_inp_dense_unit": 520,
            "tfidf_char_inp_dropout_rate": 0.45,
            "x_dense_unit": 1950,
            "x_dropout_rate": 0.5,
        },
    ]

    feature_extractors = [
        bug_features.bug_reporter(),
        bug_features.platform(),
        bug_features.op_sys(),
    ]

    cleanup_functions = []

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors, cleanup_functions),
            ),
            (
                "union",
                StructuredColumnTransformer(
                    [
                        (
                            "platform",
                            make_pipeline(DictExtractor("platform"), OrdinalEncoder()),
                            "data",
                        ),
                        (
                            "op_sys",
                            make_pipeline(DictExtractor("op_sys"), OrdinalEncoder()),
                            "data",
                        ),
                        (
                            "bug_reporter",
                            make_pipeline(
                                DictExtractor("bug_reporter"),
                                MissingOrdinalEncoder(),
                            ),
                            "data",
                        ),
                        (
                            "title_sequence",
                            KerasTextToSequences(
                                self.short_desc_maxlen, self.short_desc_vocab_size
                            ),
                            "title",
                        ),
                        (
                            "first_comment_sequence",
                            KerasTextToSequences(
                                self.long_desc_maxlen, self.long_desc_vocab_size
                            ),
                            "first_comment",
                        ),
                        (
                            "title_char_tfidf",
                            TfidfVectorizer(
                                strip_accents="unicode",
                                analyzer="char",
                                stop_words="english",
                                ngram_range=(2, 4),
                                max_features=25000,
                                sublinear_tf=True,
                            ),
                            "title",
                        ),
                        (
                            "title_word_tfidf",
                            TfidfVectorizer(
                                strip_accents="unicode",
                                min_df=0.0001,
                                max_df=0.1,
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                stop_words="english",
                                ngram_range=(2, 4),
                                max_features=30000,
                                sublinear_tf=True,
                            ),
                            "title",
                        ),
                    ]
                ),
            ),
        ]
    )

    kwargs = {
        "short_desc_maxlen": self.short_desc_maxlen,
        "short_desc_vocab_size": self.short_desc_vocab_size,
        "short_desc_emb_sz": self.short_desc_emb_sz,
        "long_desc_maxlen": self.long_desc_maxlen,
        "long_desc_vocab_size": self.long_desc_vocab_size,
        "long_desc_emb_sz": self.long_desc_emb_sz,
    }

    estimators = []
    for i, params in enumerate(self.params):
        kwargs["params"] = params
        estimator = ComponentNNClassifier(**kwargs)
        estimators.append(("model_{}".format(i), estimator))

    self.clf = VotingClassifier(
        estimators=estimators, voting="soft", weights=[1, 1, 1]
    )
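# A minimal, self-contained sketch of the soft-voting pattern used above:
# each estimator predicts class probabilities, the ensemble averages them
# (equal weights here), and the class with the highest mean probability wins.
# LogisticRegression stands in for the three ComponentNNClassifier instances;
# the synthetic dataset is purely illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(
    n_samples=300, n_features=20, n_classes=3, n_informative=10, random_state=0
)

estimators = [
    ("model_{}".format(i), LogisticRegression(max_iter=1000, C=c))
    for i, c in enumerate([0.1, 1.0, 10.0])
]
clf = VotingClassifier(estimators=estimators, voting="soft", weights=[1, 1, 1])
clf.fit(X, y)
print(clf.predict_proba(X[:3]))  # averaged per-class probabilities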
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.short_desc_maxlen = 20
    self.short_desc_vocab_size = 25000
    self.short_desc_emb_sz = 300
    self.long_desc_maxlen = 100
    self.long_desc_vocab_size = 25000
    self.long_desc_emb_sz = 300
    self.cross_validation_enabled = False

    feature_extractors = [
        bug_features.bug_reporter(),
        bug_features.platform(),
        bug_features.op_sys(),
    ]

    cleanup_functions = []

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors, cleanup_functions),
            ),
            (
                "union",
                StructuredColumnTransformer(
                    [
                        (
                            "platform",
                            make_pipeline(DictExtractor("platform"), OrdinalEncoder()),
                            "data",
                        ),
                        (
                            "op_sys",
                            make_pipeline(DictExtractor("op_sys"), OrdinalEncoder()),
                            "data",
                        ),
                        (
                            "bug_reporter",
                            make_pipeline(
                                DictExtractor("bug_reporter"), OrdinalEncoder()
                            ),
                            "data",
                        ),
                        (
                            "title_sequence",
                            KerasTextToSequences(
                                self.short_desc_maxlen, self.short_desc_vocab_size
                            ),
                            "title",
                        ),
                        (
                            "first_comment_sequence",
                            KerasTextToSequences(
                                self.long_desc_maxlen, self.long_desc_vocab_size
                            ),
                            "first_comment",
                        ),
                        (
                            "title_char_tfidf",
                            TfidfVectorizer(
                                strip_accents="unicode",
                                analyzer="char",
                                stop_words="english",
                                ngram_range=(2, 4),
                                max_features=25000,
                                sublinear_tf=True,
                            ),
                            "title",
                        ),
                        (
                            "title_word_tfidf",
                            TfidfVectorizer(
                                strip_accents="unicode",
                                min_df=0.0001,
                                max_df=0.1,
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                stop_words="english",
                                ngram_range=(2, 4),
                                max_features=30000,
                                sublinear_tf=True,
                            ),
                            "title",
                        ),
                    ]
                ),
            ),
        ]
    )

    kwargs = {
        "short_desc_maxlen": self.short_desc_maxlen,
        "short_desc_vocab_size": self.short_desc_vocab_size,
        "short_desc_emb_sz": self.short_desc_emb_sz,
        "long_desc_maxlen": self.long_desc_maxlen,
        "long_desc_vocab_size": self.long_desc_vocab_size,
        "long_desc_emb_sz": self.long_desc_emb_sz,
    }

    self.clf = ComponentNNClassifier(**kwargs)
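# A minimal sketch of why the plain OrdinalEncoder used for "bug_reporter" in
# this variant is fragile: by default it raises on categories unseen at fit
# time (e.g. a reporter who files their first bug after training), which is
# the situation the MissingOrdinalEncoder used in the ensemble variant above
# is meant to tolerate. Toy values only; the real inputs come from the
# extraction pipeline.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
enc.fit(np.array([["alice"], ["bob"], ["carol"]]))
print(enc.transform(np.array([["bob"]])))  # -> [[1.]]
try:
    enc.transform(np.array([["dave"]]))  # reporter not seen during fit
except ValueError as exc:
    print("unseen category:", exc)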