def test_pipeline_ducktyping():
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(None)
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')
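# The duck-typing test above relies on small mock estimators defined elsewhere
# in the test module. A minimal sketch of what they could look like -- the
# names come from the test, but the bodies below are assumptions, not the
# originals.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Transf(BaseEstimator, TransformerMixin):
    """Mock transformer exposing both transform and inverse_transform."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


class NoInvTransf(BaseEstimator, TransformerMixin):
    """Mock transformer without inverse_transform."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


class Mult(BaseEstimator, TransformerMixin):
    """Mock transformer/predictor that multiplies its input by a constant."""

    def __init__(self, mult=1):
        self.mult = mult

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult

    def inverse_transform(self, X):
        return np.asarray(X) / self.mult

    def predict(self, X):
        return (np.asarray(X) * self.mult).sum(axis=1)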
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert pipeline.memory is memory
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert pipeline.memory is None
    finally:
        shutil.rmtree(cachedir)
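# DummyTransf above is another test fixture; a plausible sketch building on
# the Transf mock defined earlier. The fit-timestamp attribute is an
# assumption -- the memory test only needs a transformer whose fit work can
# be cached.
import time


class DummyTransf(Transf):
    """Mock transformer that records when it was fitted."""

    def fit(self, X, y=None):
        self.timestamp_ = time.time()
        return self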
def svm_benchmark(isoform_list):
    start = time.time()
    columns_ = ['ACC', 'BA', 'ROC-AUC', 'PR-AUC', 'MCC',
                'SN', 'SP', 'PR', 'F1', 'CK']
    df = pd.DataFrame(columns_)
    isoform_list_ = isoform_list
    for isoform_ in isoform_list_:
        # Load the train/validation/test splits for this isoform
        X_train = np.load("./data/{}/train_data.npy".format(isoform_))
        X_val = np.load("./data/{}/val_data.npy".format(isoform_))
        X_test = np.load("./data/{}/test_data.npy".format(isoform_))
        y_train = np.load("./data/{}/train_label.npy".format(isoform_))
        y_val = np.load("./data/{}/val_label.npy".format(isoform_))
        y_test = np.load("./data/{}/test_label.npy".format(isoform_))

        # Hyperparameter grid
        my_C = [0.001, 0.01, 0.1, 1, 10, 100]
        my_gamma = [0.001, 0.01, 0.1, 1, 10, 100]
        pred_val_list = []
        para_list = []
        for p1 in my_C:
            for p2 in my_gamma:
                para_list.append((p1, p2))
                my_classifier = make_pipeline(
                    VarianceThreshold(threshold),
                    SVC(C=p1, gamma=p2, probability=True))
                pred_val = my_classifier.fit(
                    X_train, y_train).predict_proba(X_val)[:, 1]
                pred_val_list.append(list(pred_val))

        # Pick the parameter pair with the best validation ROC-AUC
        auc_val_list = []
        for pred in pred_val_list:
            auc = roc_auc_score(y_val, pred)
            auc_val_list.append(auc)
        i = np.argmax(auc_val_list)

        best_C, best_gamma = para_list[i]
        tuned_classifier = make_pipeline(
            VarianceThreshold(threshold),
            SVC(C=best_C, gamma=best_gamma, probability=True))
        pred_test = tuned_classifier.fit(
            X_train, y_train).predict_proba(X_test)[:, 1]

        # Score the tuned model on the held-out test split
        metric = printPerformance(y_test, pred_test)
        df1 = pd.DataFrame(metric)
        df = pd.concat([df, df1], axis=1)

    df.columns = ["Metrics"] + isoform_list_
    df.to_csv("svm_benchmark.csv", index=None)
    end = time.time()
    processing_time = end - start
    print("Processing time: {}".format(processing_time))
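# svm_benchmark above (and rf_benchmark below) call a printPerformance helper
# and read a module-level `threshold` for VarianceThreshold, neither of which
# is shown. A hedged sketch of printPerformance returning the ten metrics in
# the order of `columns_`; the 0.5 decision cutoff and the threshold value
# are assumptions, not the originals.
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             roc_auc_score, average_precision_score,
                             matthews_corrcoef, precision_score, f1_score,
                             cohen_kappa_score, confusion_matrix)

threshold = 0.0  # assumed default: drop only zero-variance features


def printPerformance(y_true, y_prob, cutoff=0.5):
    y_pred = (np.asarray(y_prob) >= cutoff).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return [accuracy_score(y_true, y_pred),           # ACC
            balanced_accuracy_score(y_true, y_pred),  # BA
            roc_auc_score(y_true, y_prob),            # ROC-AUC
            average_precision_score(y_true, y_prob),  # PR-AUC
            matthews_corrcoef(y_true, y_pred),        # MCC
            tp / (tp + fn),                           # SN (sensitivity)
            tn / (tn + fp),                           # SP (specificity)
            precision_score(y_true, y_pred),          # PR
            f1_score(y_true, y_pred),                 # F1
            cohen_kappa_score(y_true, y_pred)]        # CK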
def __build_preprocessor(self, useSelector):
    """Build the preprocessing pipeline.

    :return: a pipeline with the extractor, plus the selector when
        useSelector is true.
    """
    extractor = self.__build_extractor()
    if useSelector:
        selector = self.__build_selector()
        return make_pipeline(extractor, selector)
    else:
        return make_pipeline(extractor)
def print_metrics(model, X, y, scoring='f1', oversample=False):
    if oversample:
        pipeline = make_pipeline(StandardScaler(),
                                 RandomOverSampler(random_state=11),
                                 model)
    else:
        pipeline = make_pipeline(StandardScaler(), model)
    score = cross_val_score(pipeline, X, y, scoring=scoring)
    # Note: the confusion matrix below is computed on the bare model fitted
    # to the full (unscaled, unresampled) data, not on the cross-validated
    # pipeline.
    fitted_model = model.fit(X, y)
    cm = confusion_matrix(y, fitted_model.predict(X))
    print(score)
    print(cm)
def rf_benchmark(isoform_list):
    start = time.time()
    columns_ = ['ACC', 'BA', 'ROC-AUC', 'PR-AUC', 'MCC',
                'SN', 'SP', 'PR', 'F1', 'CK']
    df = pd.DataFrame(columns_)
    isoform_list_ = isoform_list
    for isoform_ in isoform_list_:
        # Load the train/validation/test splits for this isoform
        X_train = np.load("./data/{}/train_data.npy".format(isoform_))
        X_val = np.load("./data/{}/val_data.npy".format(isoform_))
        X_test = np.load("./data/{}/test_data.npy".format(isoform_))
        y_train = np.load("./data/{}/train_label.npy".format(isoform_))
        y_val = np.load("./data/{}/val_label.npy".format(isoform_))
        y_test = np.load("./data/{}/test_label.npy".format(isoform_))

        # Hyperparameter grid
        my_n_estimators = np.arange(25, 201, 25)
        pred_val_list = []
        para_list = []
        for p in my_n_estimators:
            para_list.append(p)
            my_classifier = make_pipeline(
                VarianceThreshold(threshold),
                RandomForestClassifier(random_state=42, n_estimators=p))
            pred_val = my_classifier.fit(
                X_train, y_train).predict_proba(X_val)[:, 1]
            pred_val_list.append(list(pred_val))

        # Pick the n_estimators with the best validation ROC-AUC
        auc_val_list = []
        for pred in pred_val_list:
            auc = roc_auc_score(y_val, pred)
            auc_val_list.append(auc)
        i = np.argmax(auc_val_list)

        best_n_estimators = para_list[i]
        tuned_classifier = make_pipeline(
            VarianceThreshold(threshold),
            RandomForestClassifier(random_state=42,
                                   n_estimators=best_n_estimators))
        pred_test = tuned_classifier.fit(
            X_train, y_train).predict_proba(X_test)[:, 1]

        # Score the tuned model on the held-out test split
        metric = printPerformance(y_test, pred_test)
        df1 = pd.DataFrame(metric)
        df = pd.concat([df, df1], axis=1)

    df.columns = ["Metrics"] + isoform_list_
    df.to_csv("rf_benchmark.csv", index=None)
    end = time.time()
    processing_time = end - start
    print("Processing time: {}".format(processing_time))
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"
    assert pipe.steps[2][0] == "fitparamt"
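# FitParamT is another test mock used by the make_pipeline tests; a hedged
# sketch consistent with the auto-generated step name "fitparamt" (the body
# is an assumption):
class FitParamT(BaseEstimator):
    """Mock classifier whose fit accepts an extra keyword parameter."""

    def __init__(self):
        self.successful = False

    def fit(self, X, y, should_succeed=False):
        self.successful = should_succeed
        return self

    def predict(self, X):
        return self.successful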
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")
    assert_equal(pipe.steps[2][0], "fitparamt")
def test_make_pipeline():
    t1 = TransfT()
    t2 = TransfT()
    pipe = make_pipeline(t1, t2)
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transft-1")
    assert_equal(pipe.steps[1][0], "transft-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transft-1")
    assert_equal(pipe.steps[1][0], "transft-2")
    assert_equal(pipe.steps[2][0], "fitparamt")
def test_pipeline_fit_then_sample_of_three_samplers_with_sampler_last_estimator():
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=50000, random_state=0)

    rus = RandomUnderSampler(random_state=42)
    enn = ENN()
    pipeline = make_pipeline(rus, enn, rus)
    X_fit_sample_resampled, y_fit_sample_resampled = pipeline.fit_sample(X, y)

    pipeline = make_pipeline(rus, enn, rus)
    pipeline.fit(X, y)
    X_fit_then_sample_resampled, y_fit_then_sample_resampled = \
        pipeline.sample(X, y)

    assert_array_equal(X_fit_sample_resampled, X_fit_then_sample_resampled)
    assert_array_equal(y_fit_sample_resampled, y_fit_then_sample_resampled)
def data_sampling(X, Y, k, oversampling=True, undersampling=True,
                  class_weight='balanced'):
    # Over-sample the minority class up to 55% of the majority class
    over = SMOTE(sampling_strategy=0.55, k_neighbors=k)
    # Under-sample the majority class down to a 1:1 class ratio
    under = RandomUnderSampler(sampling_strategy=1.)
    if oversampling and undersampling:
        pipe = make_pipeline(over, under)
        X1, Y1 = pipe.fit_resample(X, Y)
    elif oversampling:
        pipe = make_pipeline(over)
        X1, Y1 = pipe.fit_resample(X, Y)
    elif undersampling:
        pipe = make_pipeline(under)
        X1, Y1 = pipe.fit_resample(X, Y)
    else:
        # No resampling requested (e.g. class_weight='balanced' handles the
        # imbalance instead): return the data unchanged.
        X1, Y1 = X, Y
    return X1, Y1
def sample(ngram_range=(1, 2), n_features=85000, methods=()):
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import f1_score
    from imblearn.pipeline import make_pipeline

    methodsResults = pandas.DataFrame()
    for osm in methods:
        sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2,
                                     random_state=3000)
        result = []
        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer()
            cvec.set_params(max_features=n_features, ngram_range=ngram_range)
            clf = MultinomialNB()
            # osm == 0 is the sentinel for "no over-sampling method"
            if osm == 0:
                pipeline = make_pipeline(cvec, clf)
            else:
                pipeline = make_pipeline(cvec, osm, clf)
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)
            f1 = f1_score(y_test, y_pred, labels=[2.0, 3.0, 4.0],
                          average="micro")
            result.append(f1)
        if osm == 0:
            methodsResults["Base Case"] = result
        else:
            methodsResults[type(osm).__name__] = result
    return methodsResults
def naive_bayse_cross(train_x, train_y, validation, test, test_data):
    print("training data...")
    clf_pipe = make_pipeline(CountVectorizer(ngram_range=(1, 2)),
                             RandomUnderSampler(),
                             MultinomialNB(alpha=0.01))
    scores = cross_val_score(clf_pipe, train_x, train_y, cv=10)
    print("Model is fitted!")
    if validation:
        print("scores: ", scores)
        print("std of score: ", np.std(scores))
        print("Accuracy: %0.2f (+/- %0.2f)"
              % (scores.mean(), scores.std() * 2))
        y_pred = cross_val_predict(clf_pipe, train_x, train_y, cv=5)
        # Evaluation: classification report
        print("classification reports:",
              classification_report(train_y, y_pred))
        # Confusion matrix
        conf_mat = confusion_matrix(train_y, y_pred)
        print(conf_mat)
        plot_conf(conf_mat)
    if test:
        naive_bayes(test_data)
def create_pipeline(model, sampling_strategy, y):
    """Wrap a model in a pipeline that resamples the training data.

    Args:
        model (sklearn Model): The model to wrap.
        sampling_strategy (SamplingStrategy): The sampling strategy for
            the pipeline.
        y (pandas DataFrame): A dataframe containing targets.

    Returns:
        sklearn pipeline: A pipeline wrapping the model.
    """
    balancer = 'passthrough'
    if sampling_strategy == SamplingStrategy.UNDERSAMPLING:
        # Use a random under-sampler when under-sampling is the
        # resampling strategy
        databalancing_stats(y, sampling_strategy)
        balancer = RandomUnderSampler(random_state=SEED)
    elif sampling_strategy == SamplingStrategy.OVERSAMPLING:
        # Use SMOTE, the most common over-sampler, when over-sampling
        # is the resampling strategy
        databalancing_stats(y, sampling_strategy)
        balancer = SMOTE(random_state=SEED, n_jobs=-1)
    return make_pipeline(balancer, model)
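# create_pipeline above assumes a SamplingStrategy enum, a SEED constant and
# a databalancing_stats helper defined elsewhere; minimal illustrative stubs
# (the names come from the function, the bodies are assumptions):
from enum import Enum

SEED = 42  # assumed value


class SamplingStrategy(Enum):
    NONE = 0
    UNDERSAMPLING = 1
    OVERSAMPLING = 2


def databalancing_stats(y, sampling_strategy):
    # Illustrative stub: report the class balance before resampling.
    print(sampling_strategy, y.value_counts().to_dict())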
def svm(X_train, X_test, y_train, y_test):
    svm_kernel = 'poly'
    param_grid = {'svr__C': [10],
                  'svr__gamma': [0.01]}
    regr = make_pipeline(StandardScaler(), SVR(kernel=svm_kernel))
    grid = GridSearchCV(regr, param_grid, n_jobs=-1, return_train_score=True)
    grid = grid.fit(X_train, y_train)

    y_pred_test = grid.predict(X_test)
    y_pred_train = grid.predict(X_train)
    MSE_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
    MSE_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
    print(grid)
    # print('cv_results_: ', grid.cv_results_)
    # print('Best score: ', grid.best_score_)
    # print('Best estimator: ', grid.best_estimator_)
    # print('Best parameters: ', grid.best_params_)
    print('MSE train: %.2f , MSE test: %.2f' % (MSE_train, MSE_test))
    test_scores = grid.cv_results_['mean_test_score']
    train_scores = grid.cv_results_['mean_train_score']
    print('test_scores:', test_scores)
    print('train_scores:', train_scores)
    return test_scores, train_scores, MSE_train, MSE_test
def svm(X_train, X_test, y_train, y_test, param_grid):
    regr = make_pipeline(StandardScaler(), SVR())
    grid = GridSearchCV(regr, param_grid, n_jobs=-1, return_train_score=True)
    grid = grid.fit(X_train, y_train)

    # Re-fit, searching only over the best kernel from the first search
    params = {'svr__kernel': [grid.best_params_['svr__kernel']]}
    grid = GridSearchCV(regr, params, n_jobs=-1, return_train_score=True)
    grid = grid.fit(X_train, y_train)

    y_pred_test = grid.predict(X_test)
    y_pred_train = grid.predict(X_train)
    MSE_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
    MSE_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
    # print(grid)
    # print('cv_results_: ', grid.cv_results_)
    # print('Best score: ', grid.best_score_)
    # print('Best estimator: ', grid.best_estimator_)
    # print('Best parameters: ', grid.best_params_)
    # print('MSE train: %.2f , MSE test: %.2f' % (MSE_train, MSE_test))
    test_scores = grid.cv_results_['mean_test_score']
    train_scores = grid.cv_results_['mean_train_score']
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        y_test, y_pred_test)
    return (test_scores, train_scores, MSE_train, MSE_test,
            r_value, p_value, std_err)
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier(),
    ).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def sm_col_clf_piper(X_train, y_train, X_test, X_label, parameters, clf,
                     scoring='f1'):
    # parameters: dict of parameters to tune
    # clf: classifier
    # scoring: score used for tuning, default is the f1 score
    # X_label: labels of the test set
    n_features = len(X_train.columns)

    # SMOTE the training set to deal with class imbalance
    sm = SMOTE()
    X_train, y_train = sm.fit_sample(X_train, y_train)

    pipe = make_pipeline(
        SFS(clf, "best", forward=False, scoring=scoring, cv=5),
        clf,
    )
    # Tune the model over the parameter grid
    grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5,
                        n_jobs=-1, verbose=50, scoring=scoring)
    grid.fit(X_train, y_train)

    # Get the indices of the selected features
    best_pipe = grid.best_estimator_
    feature_idx = best_pipe.named_steps['sequentialfeatureselector'].transform(
        np.arange(n_features).reshape(1, -1))[0]

    # Use the best parameters to predict the test labels
    pred = grid.predict(X_test)

    # Compute the different scores based on the prediction
    conf = confusion_matrix(X_label, pred)
    test_score = {
        "accuracy": accuracy_score(X_label, pred),
        "precision": precision_score(X_label, pred, average="binary"),
        "recall": recall_score(X_label, pred, average="binary"),
        "f1_score": f1_score(X_label, pred, average="binary"),
        "roc_auc": roc_auc_score(X_label, pred),
    }
    return (grid.cv_results_['mean_test_score'], grid.best_params_,
            conf, test_score, feature_idx)
def train_mnb(X, y, **kwargs):
    """Transform the text corpus with a TfidfVectorizer, balance it with
    SMOTE, and train a Naive Bayes model.

    Parameters
    ----------
    X : array_like
        List of song lyrics as strings.
    y : array_like
        List of labels/artists as strings.
    **kwargs :
        Arbitrary keyword arguments passed as hyperparameters to
        MultinomialNB.

    Returns
    -------
    A pipeline with the text preprocessor and the trained
    sklearn.naive_bayes.MultinomialNB classification model.
    """
    tf = TfidfVectorizer()
    sm = SMOTE(random_state=20)
    m = MultinomialNB(**kwargs)
    pipeline = make_pipeline(tf, sm, m)
    pipeline.fit(X, y)
    print(f"\ntraining accuracy: {round(pipeline.score(X, y), 3)}")
    #print('\nConfusion matrix:')
    #print(f'Classes: {pipeline.classes_}')
    #print(confusion_matrix(y, pipeline.predict(X), labels=pipeline.classes_))
    cross_val = cross_val_score(pipeline, X, y, cv=5)
    print(f'\ncross-validation accuracy: {cross_val.round(3)}')
    return pipeline
def pipeline(estimator):
    '''Model pipeline: scale, over-sample the minority class, then fit.'''
    return make_pipeline(StandardScaler(),
                         RandomOverSampler(random_state=42, ratio='minority'),
                         estimator)
def adjust(dimension=yV, dimName="Valence"):
    FOLDS = 10
    sss = StratifiedShuffleSplit(n_splits=FOLDS, test_size=0.2,
                                 random_state=3000)
    result = []
    x = emoBank.drop(columns=["id", "sentence", dimName],
                     inplace=False).values
    for train_index, test_index in sss.split(x, dimension):
        clf = MultinomialNB()
        pipeline = make_pipeline(clf)
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = dimension[train_index], dimension[test_index]
        sentiment_fit = pipeline.fit(x_train, y_train)
        y_pred = sentiment_fit.predict(x_test)
        f1 = f1_score(y_test, y_pred, labels=[2.0, 3.0, 4.0], average="micro")
        result.append(f1)
    avgScore = sum(result)
    elapsedTime = time.time() - start_time
    print("elapsed time: " + str(elapsedTime))
    print("F1 score for " + str(dimName) + ": " + str(avgScore / FOLDS))
    return sentiment_fit
def run(X, y, learning_curve=False, validation_curve=False):
    RANDOM_STATE = 0
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE)
    pipeline = make_pipeline(
        SelectKBest(score_func=f_classif, k=10),
        QuantileTransformer(),
        RandomUnderSampler(random_state=RANDOM_STATE),
        GradientBoostingClassifier(random_state=RANDOM_STATE),
    )
    if learning_curve:
        ax = plot_learning_curve(pipeline, X_train, y_train, cv=5,
                                 scoring=make_scorer(fbeta_score, beta=2))
        plt.show()
    if validation_curve:
        ax = plot_validation_curve(
            pipeline, X_train, y_train, cv=5,
            scoring=make_scorer(fbeta_score, beta=2),
            param_name="selectkbest__k",
            param_range=[10, 20, 30, 40, 50],
        )
        plt.show()
def decisssionTreeSimpleVal(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #ica = FastICA(n_components=K, whiten=True).fit(X_train, Y)
    #X_red_train = ica.transform(X_train)
    #X_red_val = ica.transform(X_val)

    # Normalization: fit the scaler on the training split only, then reuse
    # it to transform the validation split (refitting on validation data
    # would leak information).
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    # SMOTE over-samples the minority class
    oversample = SMOTE()
    tree = DecisionTreeClassifier(criterion="gini", splitter="best",
                                  max_depth=5, random_state=RANDOM_STATE,
                                  presort=True)
    classifier = make_pipeline(oversample, tree)

    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(Y_val, prediction))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
def create_pipelines(self):
    self.model_pipelines = []
    for estimator in self.estimators:
        for sampler in self.samplers:
            for scaler in self.scalers:
                pipeline = make_pipeline(scaler, sampler, estimator)
                self.model_pipelines.append(pipeline)
def OverSampling_SMOTE(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    train_df_X = train_df.drop('TARGET', axis=1)
    train_df_y = train_df.TARGET

    # SMOTE
    print('Creating Smote Data...')
    smote = SMOTE(k_neighbors=5, n_jobs=-1)
    smote_enn = make_pipeline(SimpleImputer(), SMOTEENN(smote=smote))
    X_res, y_res = smote_enn.fit_resample(train_df_X, train_df_y)
    X_res_df = pd.DataFrame(X_res, columns=train_df_X.columns)
    train_df_new = X_res_df.join(y_res.to_frame())
    df = train_df_new.append(test_df)

    # Save data to csv file
    df.to_csv('data/df_prepared_to_model.csv')
    # Save data to pickle file
    df.to_pickle("data/df_prepared_to_model.pkl")
    return df
def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(
        ValueError,
        match="Pipeline.fit does not accept the sample_weight parameter",
    ):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])
def svm_text_classification(vec_params, svm_params, train_feat, train_label,
                            test_feat, test_label, random_state=42):
    '''
    Classify text data using count vectorization, random under-sampling,
    and a support vector classifier.

    vec_params = parameters for the CountVectorizer
    svm_params = parameters for the SVC
    train_feat / train_label = training features and labels
    test_feat / test_label = testing features and labels
    '''
    pipe = make_pipeline(CountVectorizer(**vec_params),
                         RandomUnderSampler(random_state=random_state),
                         SVC(**svm_params))
    pipe_fit = pipe.fit(train_feat, train_label)
    y_pred = pipe_fit.predict(test_feat)
    cnf_matrix = confusion_matrix(test_label, y_pred)
    return pipe, pipe_fit, y_pred, cnf_matrix
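# Hypothetical usage of svm_text_classification; the data names
# (train_texts, train_labels, ...) and the parameter values below are
# illustrative assumptions only.
vec_params = {'ngram_range': (1, 2), 'max_features': 5000}
svm_params = {'kernel': 'linear', 'C': 1.0}
pipe, pipe_fit, y_pred, cnf_matrix = svm_text_classification(
    vec_params, svm_params, train_texts, train_labels,
    test_texts, test_labels)
print(cnf_matrix)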
def mlpSimpleDiv(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #X_red_train = FastICA(n_components=K, whiten=True).fit_transform(X_train, Y)
    #X_red_val = FastICA(n_components=K, whiten=True).fit_transform(X_val, Y)

    # Normalization: fit the scaler on the training split only, then reuse
    # it to transform the validation split.
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    # MLP creation + training
    mlp = MLPClassifier(activation="relu", verbose=False, solver="adam",
                        max_iter=150, hidden_layer_sizes=(3, 200),
                        early_stopping=True, tol=1e-12,
                        validation_fraction=0.2, alpha=1e-4,
                        learning_rate_init=0.1, beta_1=0.3, warm_start=True,
                        random_state=RANDOM_STATE)
    # SMOTE over-samples the minority class
    oversample = SMOTE()
    classifier = make_pipeline(oversample, mlp)

    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(Y_val, prediction))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
def set_pipe(clf, features, filename='Untitled'):
    piped_clf = make_pipeline(
        ColumnSelector(cols=features),
        SMOTE(),
        clf,
    )
    piped_clf.fit(X_train, y_train)
    y_pred = piped_clf.predict(X_test)
    con_mat = confusion_matrix(y_test, y_pred)
    avg_f1 = model_selection.cross_val_score(piped_clf, X_train, y_train,
                                             cv=5, scoring='f1').mean()
    print("Cross Val acc score: ",
          model_selection.cross_val_score(piped_clf, X_train, y_train,
                                          cv=5).mean())
    print("Cross Val f1 score: ", avg_f1)
    print()
    print("Overall Acc score: ", accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Recall score (Tru Pos Rate): ",
          recall_score(y_true=y_test, y_pred=y_pred))
    print("Precision score: ", precision_score(y_true=y_test, y_pred=y_pred))
    # With sklearn's confusion-matrix layout ([tn, fp; fn, tp]),
    # tn / (fp + tn) is the true-negative rate (specificity) and
    # tn / (fn + tn) is the negative predictive value.
    print("Tru Neg Rate (Specificity): ",
          con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
    print("Neg Predictive Val: ",
          con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
    print("F1 score: ", f1_score(y_true=y_test, y_pred=y_pred))
    print("Auc score: ", roc_auc_score(y_true=y_test, y_score=y_pred))
    print(con_mat)
    print()
    pd.DataFrame(y_pred).to_csv(filename + 'y_pred_filt_avg.csv')
    return piped_clf, avg_f1
def svmSimpleVal(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #ica = FastICA(n_components=K, whiten=True).fit(X_train, Y)
    #X_red_train = ica.transform(X_train)
    #X_red_val = ica.transform(X_val)

    # Normalization: fit the scaler on the training split only, then reuse
    # it to transform the validation split.
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    # SMOTE over-samples the minority class
    oversample = SMOTE()
    svm = SVC(
        verbose=True,
        kernel="poly",
        decision_function_shape="ovr",
        random_state=RANDOM_STATE,
        C=0.03,
        degree=3,
    )  # class_weight="balanced"
    classifier = make_pipeline(oversample, svm)

    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(Y_val, prediction))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
def one_cv_for_one_algo(algorithm, X_train, y_train):
    clf = make_pipeline(SMOTE(random_state=0), algorithm)
    clf.fit(X_train, y_train)
    del X_train, y_train

    # For testing: load the memory-mapped test split for this CV fold
    # X_test
    fn = 'mem_file_X_test_' + str(cv_idx) + '.dat'
    mem_file_name = make_path(fn, directory='')
    X_test = read_memmap(mem_file_name)
    print('X_test loaded')
    # y_test
    fn = 'mem_file_y_test_' + str(cv_idx) + '.dat'
    mem_file_name = make_path(fn, directory='')
    y_test = read_memmap(mem_file_name)
    print('y_test loaded')

    y_pred = clf.predict(X_test)
    del X_test
    s1 = accuracy_score(y_true=y_test, y_pred=y_pred)
    s2 = precision_score(y_true=y_test, y_pred=y_pred)
    s3 = recall_score(y_true=y_test, y_pred=y_pred)
    s4 = f1_score(y_true=y_test, y_pred=y_pred)
    del y_test
    print('accuracy:', s1)
    print('precision:', s2)
    print('recall:', s3)
    print('f1:', s4)
    return [s1, s2, s3, s4]
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    with raises(AttributeError):
        getattr(reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(solver='lbfgs',
                                           multi_class='auto',
                                           random_state=0))
    with raises(AttributeError):
        getattr(clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()),
        max_features=2)
    estimator.fit(X, y).predict(X)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = EasyEnsembleClassifier(
        n_estimators=2,
        base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
def test_easy_ensemble_classifier_single_estimator():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit(
        X_train, y_train)
    clf2 = make_pipeline(RandomUnderSampler(random_state=0),
                         AdaBoostClassifier(random_state=0)).fit(
                             X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_resampler_last_stage_passthrough():
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=50000, random_state=0)

    rus = RandomUnderSampler(random_state=42)
    pipe = make_pipeline(rus, None)
    pipe.fit_resample(X, y)
def test_pipeline_none_sampler_sample():
    # Test pipeline using a None step and a sampler
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipe = make_pipeline(None, rus)
    pipe.fit_resample(X, y)
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier()).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_pipeline_none_classifier():
    # Test pipeline using None as a preprocessing step and a classifier
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)

    clf = LogisticRegression(random_state=0)
    pipe = make_pipeline(None, clf)
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.decision_function(X)
    pipe.score(X, y)
def test_pipeline_none_transformer():
    # Test pipeline using None and a transformer that implements transform
    # and inverse_transform
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)

    pca = PCA(whiten=True)
    pipe = make_pipeline(None, pca)
    pipe.fit(X, y)
    X_trans = pipe.transform(X)
    X_inversed = pipe.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_inversed)
from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng,
                            behaviour='new')
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(solver='lbfgs', multi_class='auto',
                                        random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()
print(__doc__)

RANDOM_STATE = 42

scorer = metrics.make_scorer(metrics.cohen_kappa_score)

# Generate the dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)

param_range = range(1, 11)
train_scores, test_scores = ms.validation_curve(
    pipeline, X, y, param_name="smote__k_neighbors",
    param_range=param_range, cv=3, scoring=scorer, n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
plt.plot(param_range, test_scores_mean, label='SMOTE')
ax.fill_between(param_range, test_scores_mean + test_scores_std,
                test_scores_mean - test_scores_std, alpha=0.2)
def test_X1d_inverse_transform():
    transformer = TransfT()
    pipeline = make_pipeline(transformer)
    X = np.ones(10)
    msg = "1d X will not be reshaped in pipeline.inverse_transform"
    assert_warns_message(FutureWarning, msg, pipeline.inverse_transform, X)
from imblearn.under_sampling import (EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours)

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers to the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
y = data.target[idxs]
y[y == majority_person] = 0
y[y == minority_person] = 1

classifier = ['3NN', neighbors.KNeighborsClassifier(3)]

samplers = [
    ['Standard', DummySampler()],
    ['ADASYN', ADASYN(random_state=RANDOM_STATE)],
    ['ROS', RandomOverSampler(random_state=RANDOM_STATE)],
    ['SMOTE', SMOTE(random_state=RANDOM_STATE)],
]

pipelines = [
    ['{}-{}'.format(sampler[0], classifier[0]),
     make_pipeline(sampler[1], classifier[1])]
    for sampler in samplers
]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

for name, pipeline in pipelines:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
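# The 'Standard' entry above uses a DummySampler that passes the data through
# unchanged, so the bare classifier can be compared against the resampled
# pipelines on an equal footing. A minimal sketch, with the body inferred
# from how it is used:
class DummySampler:
    """Pass-through sampler: returns the data unchanged."""

    def sample(self, X, y):
        return X, y

    def fit(self, X, y):
        return self

    def fit_resample(self, X, y):
        return self.sample(X, y)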
X_test = np.vstack([moons, blobs])
y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                    np.zeros(blobs.shape[0], dtype=np.int8)])
plot_scatter(X_test, y_test, 'Testing dataset')


def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()
# does not have any knowledge regarding the underlying distribution.
# Therefore, some noisy samples can be generated, e.g. when the different
# classes cannot be well separated. Hence, it can be beneficial to apply an
# under-sampling algorithm to clean the noisy samples. Two methods are
# usually used in the literature: (i) Tomek's link and (ii) edited
# nearest-neighbours cleaning methods. Imbalanced-learn provides two
# ready-to-use samplers, ``SMOTETomek`` and ``SMOTEENN``. In general,
# ``SMOTEENN`` cleans more noisy data than ``SMOTETomek``.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        SMOTE(random_state=0),
        SMOTEENN(random_state=0),
        SMOTETomek(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()

plt.show()
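# The comparison fragments above and below rely on create_dataset,
# plot_decision_function and plot_resampling helpers defined earlier in the
# original example scripts. A hedged sketch of create_dataset; the default
# weights and class_sep values are assumptions:
from sklearn.datasets import make_classification


def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,
                   class_sep=0.8, n_clusters=1):
    # Two informative features so the decision function can be plotted in 2D
    return make_classification(n_samples=n_samples, n_features=2,
                               n_informative=2, n_redundant=0, n_repeated=0,
                               n_classes=n_classes,
                               n_clusters_per_class=n_clusters,
                               weights=list(weights),
                               class_sep=class_sep, random_state=0)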
###############################################################################
# Random over-sampling to balance the data set
###############################################################################

###############################################################################
# Random over-sampling can be used to repeat some samples and balance the
# number of samples across the classes. It can be seen that with this trivial
# approach the decision boundary is already less biased toward the majority
# class.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for RandomOverSampler')
fig.tight_layout()

###############################################################################
# More advanced over-sampling using ADASYN and SMOTE
###############################################################################

###############################################################################
# Instead of repeating the same samples when over-sampling, we can use some
# specific heuristic instead. ADASYN and SMOTE can be used in this case.

# Make an identity sampler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset and make it imbalanced
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      ratio={0: 25, 1: 50, 2: 50},
                      random_state=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))