def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)

    # Check that we can't instantiate pipelines with objects without a fit
    # method
    assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])

    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that, apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)
    for x in pipe.get_params(deep=False):
        params.pop(x)
    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)

    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([("cls", LinearRegression())])
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake="nope")

    # nested model check
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake__estimator="nope")
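The tests above all exercise the same convention: `set_params` routes a `"<step>__<param>"` keyword to the named step and raises `ValueError` for names that match no step. A minimal runnable sketch of that convention, assuming imblearn and scikit-learn are installed; the step names and toy data are illustrative, not taken from the tests:

# Illustrative sketch only -- step names and data are not from the tests above.
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC

X = np.random.RandomState(0).randn(20, 3)
y = np.array([0] * 15 + [1] * 5)

pipe = Pipeline([("os", RandomOverSampler()), ("svc", SVC())])
# "<step name>__<param name>" is routed to the matching step...
pipe.set_params(os__random_state=42, svc__C=0.1)
# ...while a name that matches no step raises ValueError, as asserted above.
try:
    pipe.set_params(bogus__C=1.0)
except ValueError as exc:
    print(exc)
pipe.fit(X, y)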
def train_imbalance(
    descr_series: Series,
    classes_codes: Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> tuple:
    """Trains a model with the supplied settings.

    Parameters
    ----------
    descr_series: description series.
    classes_codes: series with classes' codes.
    TFIDF_: vectorizer.
    IMB_: SMOTE instance.
    FS_: ranking terms method.
    req_percentage: percentage to be taken from the ranked list.
    CLF_: classifier.
    model_name: model's name.

    Returns
    -------
    Two dicts keyed by the model name: the trained model and its best parameters.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_), ("fs", transformer), ("clf", CLF_)])

    best_params = get_best_params(clf_model, descr_series, classes_codes)
    print(f"{model_name}:{best_params}")

    clf_model.set_params(
        fs__percentile=req_percentage,
        clf__C=best_params["clf__C"],
        clf__gamma=best_params["clf__gamma"],
    ).fit(descr_series, classes_codes)

    return {model_name: clf_model}, {model_name: best_params}
def get_text_transformer(method: str, pca: bool, ngram_range):
    from imblearn.pipeline import Pipeline
    from sklearn.decomposition import TruncatedSVD

    steps = [('vect', CountVectorizer(tokenizer=TextTokenizer().preprocess)),
             ('tfidf', TfidfTransformer())]

    if method != "none":
        steps.append(("os", get_balancing_step(method)))
    if pca:
        steps.append(('pca', TruncatedSVD()))

    pipeline = Pipeline(steps)
    pipeline.set_params(vect__ngram_range=ngram_range, tfidf__use_idf=True)
    return pipeline
def train_imbalance(
    descr_series: pd.Series,
    classes_codes: pd.Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> dict:
    """Trains a model with the supplied settings.

    Parameters
    ----------
    descr_series: description series;
    classes_codes: series with classes' codes;
    TFIDF_: vectorizer;
    IMB_: SMOTE instance;
    FS_: ranking terms method;
    req_percentage: percentage to be taken from the ranked list;
    CLF_: classifier;
    model_name: model's name.

    Returns
    -------
    The trained model keyed by its model name.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_), ("fs", transformer), ("clf", CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(descr_series, classes_codes)

    return {model_name: clf_model}
def training_imbalance(descr_series, classes_codes, TFIDF_, IMB_, FS_,
                       req_percentage, CLF_, model_path):
    """Trains a model with the supplied settings and saves it as a .sav object.

    Parameters:
        descr_series(Series): description series;
        classes_codes(Series): series with classes' codes;
        TFIDF_: vectorizer;
        IMB_: SMOTE method;
        FS_: ranking terms method;
        req_percentage(int): percentage to be taken from the ranked list;
        CLF_: classifier;
        model_path(str): the path to the model.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_), ('fs', transformer), ('clf', CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(descr_series, classes_codes)

    dump(clf_model, open(model_path + '.sav', 'wb'))
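Since the function above serializes the fitted pipeline to `<model_path>.sav`, a typical follow-up is loading it back and scoring new descriptions. A hedged sketch, assuming `dump` above is `pickle.dump`; the file path and the sample text are hypothetical:

# Hypothetical usage of the model saved above; the path and text are examples only.
import pickle

with open("models/priority.sav", "rb") as fh:
    clf_model = pickle.load(fh)

# The restored object is the full pipeline (tfidf -> imba -> fs -> clf); the
# sampler step is skipped at prediction time, so raw text can be passed in.
print(clf_model.predict(["description of a new bug report"]))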
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([('mock', transf1)])
    assert pipeline.named_steps['mock'] is transf1

    # Directly setting attr
    pipeline.steps = [('mock2', transf2)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is transf2
    assert [('mock2', transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[('mock', transf1)])
    assert [('mock', transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [('mock', transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[('junk', ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
def resampling(X, Y, r):
    # print(sorted(Counter(Y).items()))
    smote_enn = TomekLinks()
    X_resampled, y_resampled = smote_enn.fit_resample(X, Y)
    # print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled


# pipeline
pipeline = Pipeline([
    ('und', RandomUnderSampler()),
    # ('power', preprocessing.PowerTransformer()),
    ('standardize', preprocessing.StandardScaler()),
    ('normalizer', preprocessing.Normalizer()),
    ('lda', LinearDiscriminantAnalysis()),
    # ('logistic', sk.linear_model.SGDClassifier(loss="hinge", eta0=1, learning_rate="constant", penalty='l2'))
    ('svm', LinearSVC(verbose=0, max_iter=3000, class_weight='balanced')),
])

com_values = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]
for c in com_values:
    pipeline.set_params(svm__C=c, und__random_state=42).fit(X_train, Y_train)
    # clf = CalibratedClassifierCV(base_estimator=pipeline, cv=10).fit(X, Y)
    y_p = pipeline.decision_function(X_dev)
    y_pred = pipeline.predict(X_dev)

    print("With:", c)
    print("Confusion matrix:\n", sk.metrics.confusion_matrix(Y_dev, y_pred))
    one = sk.metrics.recall_score(Y_dev, y_pred, pos_label=0)
    two = sk.metrics.recall_score(Y_dev, y_pred, pos_label=1)
    print("UAR:", (one + two) / 2, "\n")
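The manual loop above refits the pipeline once per C value and evaluates on a fixed dev split. An equivalent search can be written with GridSearchCV, which several of the later snippets also use; this is a sketch under the same step names, not part of the original script, and `recall_macro` stands in for the UAR computed above:

# Sketch only: cross-validated alternative to the manual C loop above.
from sklearn.model_selection import GridSearchCV

pipeline.set_params(und__random_state=42)
grid = GridSearchCV(pipeline,
                    param_grid={'svm__C': [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]},
                    scoring='recall_macro',  # macro recall == UAR for two classes
                    cv=5)
# The imblearn Pipeline applies the undersampler only to the training folds.
grid.fit(X_train, Y_train)
print(grid.best_params_, grid.best_score_)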
class Model:

    def training_model(self, model_path, data_path, colums, resolution,
                       sw=text.ENGLISH_STOP_WORDS, log=0):
        try:
            self.creater = ConfigCreator()
            smt = SMOTE(ratio='minority', random_state=0, kind='borderline1', n_jobs=4)
            svm_imb = SVC(gamma=2, C=1, probability=True, class_weight='balanced')
            tfidf_imb = StemmedTfidfVectorizer(norm='l2', sublinear_tf=True,
                                               stop_words=sw, analyzer='word',
                                               max_df=0.5, max_features=500)
            anova = feature_selection.f_classif
            chi2 = feature_selection.chi2

            if not os.path.exists(model_path):
                os.makedirs(model_path)

            self.data = pandas.read_pickle(data_path)
            binary_logger = BynaryTrainLogger()
            multiple_logger = MultipleTrainLogger()
            filter = Filter()

            self.creater.create_config('single_mod.ini',
                                       section_name='single_mod.ini'.split('.')[0])

            # models training
            self.creater.update_setting('single_mod.ini',
                                        'single_mod.ini'.split('.')[0],
                                        'binary_col_class', ','.join(['0', '1']))

            # areas
            # to exclude columns with quite low percent of 1
            clear_columns = [
                column for column in colums
                if self.data[column][self.data[column] == 1].size / self.data[column].size > 0.005
            ]
            self.creater.update_setting('single_mod.ini',
                                        'single_mod.ini'.split('.')[0],
                                        'columns', ','.join(clear_columns))

            for col in clear_columns:
                self.data['Description_tr'] = self.data['Description_tr']
                Model.training_imbalance_kf(self, self.data['Description_tr'],
                                            self.data[col], tfidf_imb, smt, chi2,
                                            50, svm_imb, secure_filename(col),
                                            model_path)
                if log == 1:
                    binary_logger.log('{}.log'.format(session.sid),
                                      self.data['Description_tr'],
                                      self.data[col], col, model_path)

            # priority
            # to exclude rows with quite low percent of any categorical value
            self.valid_set = [
                el for el in self.data['Priority'].unique().tolist()
                if self.data['Priority'][self.data['Priority'] == el].size / self.data['Priority'].size > 0.005
            ]
            self.data_after_filter = self.data[self.data['Priority'].isin(self.valid_set)]
            if self.data.shape[0] > self.data_after_filter.shape[0]:
                self.data_after_filter = filter.reindex_data(self.data_after_filter)
            self.data_after_filter['Priority_ord'] = self.data_after_filter['Priority'].astype("category")
            self.data_after_filter['Priority_ord_codes'] = pandas.Categorical(
                self.data_after_filter['Priority_ord']).codes

            # train
            Model.training_imbalance_kf(self,
                                        self.data_after_filter['Description_tr'],
                                        self.data_after_filter['Priority_ord_codes'],
                                        tfidf_imb, smt, chi2, 50, svm_imb,
                                        'priority', model_path)
            self.creater.update_setting(
                'single_mod.ini', 'single_mod.ini'.split('.')[0],
                'prior_col_class',
                ','.join(self.data_after_filter['Priority'].unique().tolist()))

            # log
            if log == 1:
                # print('priority')
                # start = time.clock()
                multiple_logger.log('{}.log'.format(session.sid),
                                    self.data_after_filter['Description_tr'],
                                    self.data_after_filter['Priority_ord_codes'],
                                    'priority', model_path)
                # print(time.clock()-start)

            bins = 4
            self.ldis = [i for i in range(1, bins + 1)]

            # ttr
            # to exclude rows with quite low percent of any categorical value
            self.data['temp_ttr_class'] = pandas.qcut(self.data['ttr_tr'], bins,
                                                      labels=self.ldis,
                                                      duplicates='drop')
            self.valid_set = [
                el for el in self.data['temp_ttr_class'].unique().tolist()
                if self.data['temp_ttr_class'][self.data['temp_ttr_class'] == el].size / self.data['temp_ttr_class'].size > 0.005
            ]
            self.data_after_filter = self.data[self.data['temp_ttr_class'].isin(self.valid_set)]
            if self.data.shape[0] > self.data_after_filter.shape[0]:
                self.data_after_filter = filter.reindex_data(self.data_after_filter)

            # train
            Model.training_imbalance_kf(self,
                                        self.data_after_filter['Description_tr'],
                                        self.data_after_filter['temp_ttr_class'],
                                        tfidf_imb, smt, chi2, 50, svm_imb,
                                        'ttr', model_path)
            self.ttr_col_classTemp = pandas.qcut(self.data_after_filter['ttr_tr'],
                                                 4, duplicates='drop').unique()
            self.creater.update_setting(
                'single_mod.ini', 'single_mod.ini'.split('.')[0],
                'ttr_col_class',
                ','.join([
                    str(Model.ifZero(self, self.ttr_col_classTemp[el].left)) + '-' +
                    str(Model.ifZero(self, self.ttr_col_classTemp[el].right))
                    for el in range(3)
                ] + [
                    '>' + str(Model.ifZero(self, self.ttr_col_classTemp[range(3)[-1]].right))
                ]))

            # log
            if log == 1:
                multiple_logger.log('{}.log'.format(session.sid),
                                    self.data_after_filter['Description_tr'],
                                    self.data_after_filter['temp_ttr_class'],
                                    'ttr', model_path)

            # resolution
            # resolution may have values which can't be correctly processed by the system, like "Won't Fix",
            # therefore we have to convert them to a model name via the secure_filename() function
            self.bin_data = pandas.get_dummies(self.data,
                                               prefix=list(resolution.keys()),
                                               columns=list(resolution.keys()))

            # to exclude columns with quite low percent of 1
            clear_columns = {}
            for key in resolution:
                if isinstance(resolution[key], list):
                    resolutions = []
                    for rez in resolution[key]:
                        if self.bin_data[key + '_' + rez][self.bin_data[key + '_' + rez] == 1].size / self.bin_data[key + '_' + rez].size > 0.005:
                            resolutions.append(rez)
                    if len(resolutions) == 1:
                        clear_columns[key] = resolutions[0]
                    if len(resolutions) > 1:
                        clear_columns[key] = resolutions
                else:
                    if self.bin_data[key + '_' + resolution[key]][self.bin_data[key + '_' + resolution[key]] == 1].size / self.bin_data[key + '_' + resolution[key]].size > 0.005:
                        clear_columns[key] = resolution[key]

            for key in clear_columns:
                if isinstance(clear_columns[key], list):
                    for res in clear_columns[key]:
                        Model.training_imbalance_kf(self,
                                                    self.bin_data['Description_tr'],
                                                    self.bin_data[key + '_' + res],
                                                    tfidf_imb, smt, chi2, 50, svm_imb,
                                                    secure_filename(res), model_path)
                        self.creater.update_setting(
                            'single_mod.ini', 'single_mod.ini'.split('.')[0],
                            res + '_col_class', ','.join(['not ' + res, res]))
                        if log == 1:
                            binary_logger.log('{}.log'.format(session.sid),
                                              self.bin_data['Description_tr'],
                                              self.bin_data[key + '_' + res],
                                              secure_filename(res), model_path)
                else:
                    Model.training_imbalance_kf(self,
                                                self.bin_data['Description_tr'],
                                                self.bin_data[key + '_' + clear_columns[key]],
                                                tfidf_imb, smt, chi2, 50, svm_imb,
                                                secure_filename(clear_columns[key]),
                                                model_path)
                    self.creater.update_setting(
                        'single_mod.ini', 'single_mod.ini'.split('.')[0],
                        clear_columns[key] + '_col_class',
                        ','.join(['not ' + clear_columns[key], clear_columns[key]]))
                    if log == 1:
                        binary_logger.log('{}.log'.format(session.sid),
                                          self.bin_data['Description_tr'],
                                          self.bin_data[key + '_' + clear_columns[key]],
                                          secure_filename(clear_columns[key]),
                                          model_path)

            self.data.to_pickle(data_path)
        except FileNotFoundError:
            raise

    def training_imbalance_kf(self, X_, Y_, TFIDF_, IMB_, FS_, pers_, CLF_, name_, model_path):
        self.transform = feature_selection.SelectPercentile(FS_)
        self.clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_),
                                   ('fs', self.transform), ('clf', CLF_)])
        kf = KFold(n_splits=10)
        kf.get_n_splits(X_)
        # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_, Y_, train_size=.8, stratify=Y_)
        for train_index, test_index in kf.split(X_):
            self.X_train, self.X_test = X_[train_index], X_[test_index]
            self.y_train, self.y_test = Y_[train_index], Y_[test_index]
            self.clf_model.set_params(fs__percentile=pers_).fit(self.X_train, self.y_train)
        pickle.dump(self.clf_model, open(model_path + name_ + '.sav', 'wb'))
        # y_pred = clf_model.predict(X_test)
        # print(classification_report(y_test, y_pred))

    def ifZero(self, val):
        if val < 0:
            return 0
        else:
            return val

    def proc_text(self, text, col_class, name_model, model_path):
        self.test_pro = text
        try:
            with open('regularExpression.csv') as csv_data:
                for i in [
                        re.compile(el1) for el in csv.reader(csv_data, delimiter=',', quotechar='"')
                        for el1 in el if el1
                ]:
                    self.test_pro = re.sub(i, ' ', self.test_pro)
            self.proba = {}
            sys.path.append("..")
            self.load_model_test = pickle.load(open(model_path + name_model + '.sav', 'rb'))
            self.proba_ = list(
                numpy.array(numpy.around(self.load_model_test.predict_proba([self.test_pro])[0], 3),
                            dtype=float).flatten())
            self.proba_dic = dict(zip(col_class, self.proba_))
            return self.proba_dic
        except Exception:
            raise

    def create_top_terms_file(self, frame, resolution):
        checker = Checker()
        chi2 = feature_selection.chi2
        SW = text.ENGLISH_STOP_WORDS
        config_reader = SettingProvider('single_mod.ini')
        resol_all = []
        for el in checker.get_resolutions(resolution):
            resol_all += config_reader.get_setting(
                section='single_mod',
                setting="{el}_col_class".format(el=el),
                evaluate=False).split(',')
        resol = [el for el in resol_all if 'not' not in el]
        prio = config_reader.get_setting(section='single_mod',
                                         setting='prior_col_class',
                                         evaluate=False).split(',')
        areas = config_reader.get_setting(section='single_mod',
                                          setting='columns',
                                          evaluate=False).split(',')
        all_terms = prio + resol + areas
        all_mass = []
        bin_data = pandas.get_dummies(frame,
                                      prefix=list(resolution.keys()) + ['Priority'],
                                      columns=list(resolution.keys()) + ['Priority'])
        with open('top_terms.csv', 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(all_terms)
            for el in all_terms:
                if el in prio:
                    prior = self.top_terms(bin_data, 'Priority_' + el, chi2, SW)
                    all_mass.append(prior)
                if el in resol:
                    key = None
                    for key1 in resolution:
                        if isinstance(resolution[key1], list):
                            if el in resolution[key1]:
                                key = key1
                        else:
                            if el == resolution[key1]:
                                key = key1
                    resol = self.top_terms(bin_data, key + '_' + el, chi2, SW)
                    all_mass.append(resol)
                if el in areas:
                    area = self.top_terms(bin_data, el, chi2, SW)
                    all_mass.append(area)
            rows = zip_longest(*all_mass)
            for row in rows:
                csvwriter.writerow(row)

    def top_terms(self, data, field, func, SW):
        tfidf = StemmedTfidfVectorizer(norm='l2', sublinear_tf=True, min_df=1,
                                       stop_words=SW, analyzer='word',
                                       max_features=1000)
        # bidata = pandas.get_dummies(data, prefix=field+'_', columns=field)
        multithreaded = Multithreaded()
        clear_data = ClearData()
        parall_data = multithreaded.parallelize(data['Description_tr'], clear_data.clean_descr)
        tfs = tfidf.fit_transform(parall_data)
        y = data[field]
        selector = SelectKBest(score_func=func, k='all')
        selector.fit_transform(tfs, y)
        X_new = dict(zip(tfidf.get_feature_names(), selector.scores_))
        temp_dict = sorted(X_new.items(), key=lambda x: x[1], reverse=True)
        rez = []
        mean = []
        for el in temp_dict[:]:
            if el[1] > 1:
                rez.append(el)
                mean.append(el[1])
        import numpy
        return [el[0] for el in rez if el[1] > numpy.mean(mean)]
def get_feature_pipeline(params: SingleBaseParams):
    from imblearn.pipeline import Pipeline
    from sklearn.decomposition import TruncatedSVD

    feature_name = params.get_feature_name()

    if feature_name == publication_year_key:
        from features.text_processor import PublicationYearTransformer
        steps = [('t', PublicationYearTransformer())]
    else:
        steps = [
            ('vect', CountVectorizer(tokenizer=TextTokenizer().preprocess)),
            ('tfidf', TfidfTransformer()),
        ]
        if params.method != "none":
            steps.append(("os", get_balancing_step(params.method)))
        if params.pca:
            steps.append(('pca', TruncatedSVD()))

    from classifier.models import get
    parameters = {}
    if params.classifier.startswith('SVC'):
        parameters['probability'] = True
    if params.get_balanced():
        parameters['class_weight'] = 'balanced'
    steps.append(('clf', get(params.classifier, params=parameters)))

    pipeline = Pipeline(steps)

    if feature_name == publication_year_key:
        pass
    elif feature_name in [publication_type_key, mesh_headings_key]:
        pipeline.set_params(vect__ngram_range=(1, 1))
        pipeline.set_params(tfidf__use_idf=True)
    elif feature_name in [
            title_key, title_most_replaced_key,
            title_disease_or_syndrome_replaced_key, journal_title_key
    ]:
        pipeline.set_params(vect__ngram_range=(1, 2))
        pipeline.set_params(tfidf__use_idf=True)
    else:
        pipeline.set_params(vect__ngram_range=(1, 4))
        pipeline.set_params(tfidf__use_idf=True)

    return pipeline
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()

    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = ("Last step of Pipeline should implement fit or be the "
                   "string 'passthrough'")
    with raises(TypeError, match=error_regex):
        Pipeline([("clf", NoFit())])

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma="scale")
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = "implement fit and transform or fit_resample"
    with raises(TypeError, match=error_regex):
        Pipeline([("t", NoTrans()), ("svc", clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert not pipe.named_steps["svc"] is pipe2.named_steps["svc"]

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2
def optimize(self, samp, clf, tag):
    """Run a single experiment."""
    # Print the header info
    os.system('cls||clear')
    self.tag = tag
    tqdm.write("""{div}\n Stack Expert Model Optimization
    {indent}--Seed: {seed}
    {indent}--Sampling Method: {samp}
    {indent}--Classification Method: {clf}
    {indent}--Tag: {tag}\n{div}
    """.format(
        **{
            'div': '*' * 50,
            'seed': self.seed,
            'indent': '\b' * 6,
            'tag': self.tag,
            'samp': type(samp).__name__,
            'clf': type(clf).__name__
        }))

    # Load the data
    data, target, ratio = load_data(self.tag)
    tqdm.write(
        "[INFO] Data loads complete. Expert ratio:{:.2f}%\t{:s}".format(
            100 * ratio, cur()))

    # Set the random seed
    self.seed = int(time())
    self.validator.random_state = self.seed
    self._check(type(clf).__name__)
    samp.set_params(**{"random_state": self.seed})
    if 'random_state' in clf.get_params().keys():
        clf.set_params(**{"random_state": self.seed})

    # Build a pipeline of the oversampler and the classifier
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])
    tqdm.write(
        "[INFO] Model load completed. Start grid search...\t{:s}".format(cur()))

    # Run the grid search
    for ind, (key, value) in enumerate(
            tqdm(self._get_grid(pipeline, ratio).items())):
        tqdm.write('-' * 15 + 'Epoch {:d}'.format(ind) + '-' * 15)
        # Set the default parameters
        pipeline.set_params(**self._get_params(pipeline, ratio))
        # Build the grid-search object
        grid_opti = GridSearchCV(estimator=pipeline,
                                 param_grid={key: value},
                                 cv=self.validator,
                                 **self.params['GridSearchCV'])
        tqdm.write("[EP{:d}] Search Parameter: {:}\t{:s}".format(ind, key, cur()))
        tqdm.write("[EP{:d}] Search Grid: {:}\t{:s}".format(
            ind, str(value) + " Fitting...", cur()))
        # Fit the model
        grid_opti.fit(data.to_numpy(), target)
        # Report the best parameters and the corresponding metrics
        df_res = pd.DataFrame(grid_opti.cv_results_)
        df_res = df_res.loc[df_res['mean_test_{:s}'.format(self.scoring)].idxmax()]
        tqdm.write("[EP{:d}] Fit complete. Current Score: {:}\t{:s}".format(
            ind, df_res['mean_test_{:s}'.format(self.scoring)], cur()))
        tqdm.write("\r[EP{:d}] Best: {:}\t{:s}".format(ind, df_res['params'], cur()))
        if '{}__sampling_strategy'.format(type(samp).__name__) in df_res['params']:
            df_res['params']['{}__sampling_strategy'.format(type(samp).__name__)] /= ratio
        # Update the stored parameters
        self._set_params(df_res['params'])
        # Record the experiment results
        self._rec(type(clf).__name__, type(samp).__name__,
                  df_res.filter(regex=r'^mean_test', axis=0))

    # Grid search finished; one experiment is done
    tqdm.write("{:s}\n[INFO] Grid search complete.{:}\t{:s}\n".format(
        '=' * 50, "", cur()))
    del data, target
def train_models(self, X_train, y_train, tuning='hyperopt'):
    """Hyperparameter tuning.

    Iterate over each model and find the best parameter combination using the
    'tuning' method and cross validation. Finally fits a voting classifier
    with all the optimized models.
    """
    valid_tuning = ['random', 'grid', 'hyperopt']
    if tuning not in valid_tuning:
        raise ValueError(
            'train_models: tuning must be one of {}'.format(valid_tuning))

    start = datetime.now()
    self.best_estimators = {}
    self.best_f1_scores = {}
    self.scores_all = []

    for model, estimator in self.models_selector(tuning):
        self.best = 0
        model_pipeline = Pipeline([('imputer', self.imputer),
                                   ('scaling', self.scaler),
                                   ('rus', self.rus),
                                   (model, estimator['est'])])

        if tuning == 'random':
            search = RandomizedSearchCV(model_pipeline,
                                        param_distributions=estimator['params'],
                                        scoring='f1',
                                        n_iter=100,
                                        cv=10,
                                        verbose=False,
                                        n_jobs=-1,
                                        iid=False)
            self.models_fit(model, search, X_train, y_train, tuning)
        elif tuning == 'grid':
            search = GridSearchCV(model_pipeline,
                                  param_grid=estimator['params'],
                                  scoring='f1',
                                  cv=10,
                                  verbose=True,
                                  n_jobs=-1)
            self.models_fit(model, search, X_train, y_train, tuning)
        elif tuning == 'hyperopt':
            hyperopt_objective = partial(self.raw_hyperopt_objective,
                                         model_pipeline, X_train, y_train)
            trials = Trials()
            best_params = fmin(fn=hyperopt_objective,
                               space=estimator['params'],
                               algo=tpe.suggest,
                               max_evals=20,
                               trials=trials)
            best_params_actual = space_eval(estimator['params'], trials.argmin)
            model_pipeline.set_params(**best_params_actual)
            self.models_fit(model, model_pipeline, X_train, y_train, tuning, trials)

    # Training a Voting classifier with all above estimators
    self.vot_model = VotingClassifier(estimators=list(self.best_estimators.items()),
                                      voting='hard')
    scores = cross_val_score(self.vot_model, X_train, y_train, scoring='f1', cv=10)
    self.vot_model.fit(X_train, y_train)
    self.best_estimators['voting'] = self.vot_model
    print('Voting Classifier F1 score mean: {:.4f}, stddev: {:.4f}'.format(
        scores.mean(), scores.std()))

    end = datetime.now()
    print('Train time: {}'.format(end - start))
def manual_partial_model_selection(main_path, dataset_type, dataset_sub_type,
                                   sampling, sampling_timing, fs_step_name,
                                   classifier_step_name, chromosome):
    print("##### Experiment Info #####")
    print("Chromosome:", chromosome)
    print("Dataset type:", dataset_type)
    print("Dataset subtype:", dataset_sub_type)
    print("Sampling:", sampling)
    print("Sampling timing:", sampling_timing)
    print("Filter FS:", fs_step_name)
    print("Classifier:", classifier_step_name)
    print()

    print("Loading variable names...")
    print()
    with open(main_path + dataset_type + '/' + dataset_sub_type + '/' +
              chromosome + '/train_train.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            variable_names = np.array(list(row))
            break
    variable_names = variable_names[1:]
    print("Variable names size:", len(variable_names))
    print()

    sampling_seeds = [123, 456, 789]

    print("Loading training data...")
    print()
    train_data = load_dataset(main_path + dataset_type + '/' + dataset_sub_type +
                              '/' + chromosome + '/train_train.csv')
    X_train = train_data[:, 1:]
    print("X_train shape:", X_train.shape)
    print()
    y_train = train_data[:, 0]
    print("y_train shape:", y_train.shape)
    print()

    experiment_results = dict()

    print("Creating pipeline...")
    print()
    pipe = Pipeline([("imputer", Imputer(missing_values=-1)),
                     ("variance", VarianceThreshold()),
                     ("scaler", StandardScaler())])

    if fs_step_name == "anova":
        filter = SelectPercentile(f_classif, percentile=2)

    if sampling_timing == "sampling_before_fs":
        if sampling == "down_sample":
            pipe.steps.append((sampling, RandomUnderSampler(random_state=sampling_seeds[0])))
        elif sampling == "up_sample":
            pipe.steps.append((sampling, RandomOverSampler(random_state=sampling_seeds[1])))
        elif sampling == "smote_sample":
            pipe.steps.append((sampling, SMOTE(n_jobs=-1, random_state=sampling_seeds[2])))
        pipe.steps.append((fs_step_name, filter))
    elif sampling_timing == "sampling_after_fs":
        pipe.steps.append((fs_step_name, filter))
        if sampling == "down_sample":
            pipe.steps.append((sampling, RandomUnderSampler(random_state=sampling_seeds[0])))
        elif sampling == "up_sample":
            pipe.steps.append((sampling, RandomOverSampler(random_state=sampling_seeds[1])))
        elif sampling == "smote_sample":
            pipe.steps.append((sampling, SMOTE(n_jobs=-1, random_state=sampling_seeds[2])))

    classifier = SVC(kernel='linear', random_state=123456, probability=True,
                     class_weight='balanced')
    pipe.steps.append((classifier_step_name, classifier))

    print("Performing manual gridsearch...")
    print()
    C_OPTIONS = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    cv = StratifiedKFold(n_splits=5, random_state=123456)

    f1_cv = []
    mean_test_score = []
    std_test_score = []
    for C in C_OPTIONS:
        pipe.set_params(linear_svm__C=C)
        for train_indexes, validation_indexes in cv.split(X_train, y_train):
            pipe.fit(X_train[train_indexes, :], y_train[train_indexes])
            y_pred = pipe.predict(X_train[validation_indexes, :])
            f1 = f1_score(y_train[validation_indexes], y_pred, average='weighted')
            f1_cv.append(f1)
        mean_test_score.append(np.mean(f1_cv))
        std_test_score.append(np.std(f1_cv))

    cv_results = dict()
    cv_results['mean_test_score'] = mean_test_score
    cv_results['std_test_score'] = std_test_score
    cv_results['params'] = C_OPTIONS
    experiment_results['cv_results'] = cv_results

    print("Manual gridsearch results:")
    print()
    print(cv_results['mean_test_score'])
    print()
    print(cv_results['std_test_score'])
    print()
    print("Best parameters set found on development set:")
    print()
    print(C_OPTIONS[np.argmax(mean_test_score)])
    print()
params_space = {
    'undersampler__n_neighbors': quniform_int('n_neighbors', 2, 10, 1),
    'xgb__max_depth': quniform_int('max_depth', 10, 30, 1),
    'xgb__min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'xgb__subsample': hp.uniform('subsample', 0.8, 1),
    'xgb__n_estimators': quniform_int('n_estimators', 1000, 10000, 50),
    'xgb__learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
    'xgb__gamma': hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
    'xgb__colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05)
}

# set_params expects keyword arguments, so the dict must be unpacked with **
model.set_params(**{
    'xgb__colsample_bytree': 0.55,
    'xgb__n_estimators': 1000,
    'xgb__subsample': 0.81758885827,
    'xgb__min_child_weight': 2.0,
    'xgb__learning_rate': 0.0091861014503,
    'xgb__gamma': 1.19618674618,
    'undersampler__n_neighbors': 7,
    'xgb__max_depth': 21
})
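A search space like the one above is normally consumed by hyperopt's `fmin`: each trial samples a dict of `"step__param"` values, which `set_params` routes into the pipeline before cross-validated scoring. The following is a hedged sketch of that loop, not part of the original snippet; `X`, `y`, the cv settings, and the integer casting are assumptions (the `quniform_int` helper above presumably already casts to int, as its name suggests).

# Sketch only: hyperopt objective that applies sampled params via set_params.
from hyperopt import fmin, tpe, Trials, space_eval
from sklearn.model_selection import cross_val_score

INT_PARAMS = {'undersampler__n_neighbors', 'xgb__max_depth', 'xgb__n_estimators'}

def objective(sampled):
    # hp.quniform yields floats, so cast the parameters that must be integers.
    sampled = {k: int(v) if k in INT_PARAMS else v for k, v in sampled.items()}
    model.set_params(**sampled)
    score = cross_val_score(model, X, y, scoring='f1', cv=3).mean()
    return -score  # fmin minimizes, so negate the score

trials = Trials()
best = fmin(fn=objective, space=params_space, algo=tpe.suggest,
            max_evals=50, trials=trials)

# Re-apply the best configuration to the pipeline with the same casting.
best_params = {k: int(v) if k in INT_PARAMS else v
               for k, v in space_eval(params_space, best).items()}
model.set_params(**best_params)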
filts = [
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=25, random_state=random_state)),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=10, random_state=random_state)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=random_state)),
    ('LogisticRegression', LogisticRegression(solver='lbfgs', random_state=random_state,
                                              multi_class='auto', max_iter=750)),
    ('MLPClassifier', MLPClassifier(random_state=random_state, max_iter=2000))
]

# set up model
znorm = StandardScaler()
ownmethod = MBKMeansFilter_reversed()
rfc = RandomForestClassifier(n_estimators=500, random_state=random_state)
clf = Pipeline([('ZNorm', znorm), ('OwnMethod2', ownmethod), ('RFC', rfc)])
clf.set_params(**params)

## model
clf.fit(X, y, **{'OwnMethod2__filters': filts})
pickle.dump(clf, open(MODELS_PATH + 'near_final_clf_.pkl', 'wb'))

## model2
znorm = StandardScaler()
rfc = RandomForestClassifier(n_estimators=250, random_state=random_state, n_jobs=-1)
clf = Pipeline([('ZNorm', znorm), ('RFC', rfc)])
clf.fit(X, y)
pickle.dump(clf, open(MODELS_PATH + 'final_RFC.pkl', 'wb'))

# ---------------------------------------------------------------------------- #
# Cross Spatial Validation
# ---------------------------------------------------------------------------- #
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()

    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert not pipe.named_steps['svc'] is pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2