def __build_model(self):
    std_scaler = StandardScaler()
    smt = SMOTE(k_neighbors=3, random_state=42, sampling_strategy='minority')
    if self.model_hyperparameters:
        log_reg_sm = LogisticRegression(**self.model_hyperparameters)
    else:
        log_reg_sm = LogisticRegression()
    pipeline = imbalanced_make_pipeline(
        std_scaler,
        smt,
        log_reg_sm,
    )
    self.model = pipeline
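# The snippet above assumes the following imports; `imbalanced_make_pipeline`
# is the usual alias for imblearn's make_pipeline, which resamples only
# inside fit() and leaves predict()/score() inputs untouched (a minimal
# sketch, not part of the original file):
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression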
def get_models(clf_list, X, y,
               sampling_method=RandomUnderSampler(sampling_strategy='majority')):
    classifiers = {
        "LogisticRegression": LogisticRegression(max_iter=500, n_jobs=-1),
        # add **{'gpu_id': 0, 'tree_method': 'gpu_hist'} to train XGBoost on GPU
        "XGBClassifier": XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10),
        "KNeighborsClassifier": KNeighborsClassifier(3, n_jobs=-1),
        "SVC": SVC(gamma=2, C=1),
        "GaussianProcessClassifier": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=5),
        "RandomForestClassifier": RandomForestClassifier(max_depth=5, n_estimators=100, n_jobs=-1),
        "MLPClassifier": MLPClassifier(max_iter=1000),
        "AdaBoostClassifier": AdaBoostClassifier(),
        "LGBMClassifier": LGBMClassifier(
            boosting_type='gbdt', max_depth=10, objective='binary',
            nthread=5, num_leaves=32, learning_rate=0.05, max_bin=512,
            subsample_for_bin=200, subsample=0.7, subsample_freq=1,
            colsample_bytree=0.8, reg_alpha=20, reg_lambda=20,
            min_split_gain=0.5, min_child_weight=1, min_child_samples=10,
            scale_pos_weight=1, num_class=1, metric='auc'),
    }
    # Wrap each requested classifier in a pipeline so the undersampler is
    # applied to the training data only, inside fit().
    clf = {}
    for clf_name in clf_list:
        clf[clf_name] = imbalanced_make_pipeline(sampling_method, classifiers[clf_name])
    return clf
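# Hypothetical usage sketch for get_models(): build resampled pipelines for a
# couple of classifiers and fit them on a synthetic imbalanced dataset
# (make_classification and the variable names here are illustrative only):
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=2000, n_features=20,
                                     weights=[0.95, 0.05], random_state=42)
models = get_models(["LogisticRegression", "RandomForestClassifier"], X_demo, y_demo)
for name, pipe in models.items():
    pipe.fit(X_demo, y_demo)          # undersampling happens inside fit()
    print(name, pipe.score(X_demo, y_demo))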
def training():
    test = pd.read_csv('./csv/preprocess/missing_test.csv', index_col=0)
    train = pd.read_csv('./csv/preprocess/missing_train.csv', index_col=0)
    train_label = pd.read_csv('./csv/base/train_label.csv', index_col=0)
    # Fit (and cache) the model only once per process.
    if var_env.knn_pickle is None:
        clf = KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                   metric='minkowski', metric_params=None,
                                   n_jobs=None, n_neighbors=5, p=2,
                                   weights='uniform')
        pipeline = imbalanced_make_pipeline(
            SMOTE(sampling_strategy='minority'), clf)
        # ravel() flattens the single-column label frame into the 1-D array
        # scikit-learn expects.
        var_env.knn_pickle = pipeline.fit(train.values, train_label.values.ravel())
    result = pd.DataFrame({'predict_label': var_env.knn_pickle.predict(test.values)})
    result.to_csv('./csv/result.csv', index=False)
    return True
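# Sketch of persisting the pipeline that training() caches in
# var_env.knn_pickle, so later runs can skip refitting (var_env is assumed to
# be a module-level state object; the file path is illustrative):
import pickle

with open('./csv/model/knn_pipeline.pkl', 'wb') as f:
    pickle.dump(var_env.knn_pickle, f)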
def training():
    data = pd.read_csv('./csv/dataset.csv', index_col='id', low_memory=False)
    original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(
        data, data['label'], test_size=0.2, random_state=42)

    num_round = 10
    kfold = StratifiedKFold(n_splits=num_round, random_state=None, shuffle=False)
    # Record the row indices of every fold up front.
    id = 0
    split_data = {}
    for train_id, test_id in kfold.split(original_Xtrain, original_ytrain):
        split_data['train' + str(id)] = train_id
        split_data['valid' + str(id)] = test_id
        id += 1

    # FIELD_7 holds stringified lists; `x != x` is True only for NaN, so
    # missing entries become empty lists before literal_eval, and the feature
    # is replaced by the list length.
    f7_array = original_Xtrain['FIELD_7'].apply(
        lambda x: '[]' if x != x else x).apply(literal_eval)
    original_Xtrain['FIELD_7'] = f7_array.apply(len)
    f7_array = original_Xtest['FIELD_7'].apply(
        lambda x: '[]' if x != x else x).apply(literal_eval)
    original_Xtest['FIELD_7'] = f7_array.apply(len)

    original_Xtrain_, original_Xtrain_label_, original_Xtest = preprocessing(
        original_Xtrain, original_Xtest)

    process_data = {}
    for id in range(num_round):
        process_data['train' + str(id)], process_data[
            'train_label' + str(id)], process_data['valid' + str(id)] = preprocessing(
                original_Xtrain.iloc[split_data['train' + str(id)]],
                original_Xtrain.iloc[split_data['valid' + str(id)]])

    result_valid = {}     # per-classifier confusion matrix on the validation folds
    result_original = {}  # per-classifier confusion matrix on the held-out test set
    classifiers = {
        "KNearest": KNeighborsClassifier(algorithm='ball_tree', leaf_size=30,
                                         metric='minkowski', metric_params=None,
                                         n_jobs=None, n_neighbors=5, p=2,
                                         weights='uniform'),
        "LogisticRegression": LogisticRegression(max_iter=1000000, penalty='l2'),
        "DecisionTreeClassifier": DecisionTreeClassifier()
    }
    for key, clf in classifiers.items():
        average_original_test = 0.0
        average_valid_test = 0.0
        print(key)
        for id in range(0, num_round):
            idx = split_data['valid' + str(id)]
            pipeline = imbalanced_make_pipeline(
                SMOTE(sampling_strategy='minority'), clf)
            model = pipeline.fit(process_data['train' + str(id)],
                                 process_data['train_label' + str(id)])
            score_valid_test = model.predict(process_data['valid' + str(id)])
            score_original_test = model.predict(original_Xtest)
            # Average the hard predictions across folds (assumes equally
            # sized folds; `idx` keeps the last fold's validation indices).
            average_valid_test += score_valid_test / num_round
            average_original_test += score_original_test / num_round
        valid = transform_average_result(average_valid_test)
        original = transform_average_result(average_original_test)
        result_valid[key] = metrics.confusion_matrix(valid, original_ytrain.iloc[idx])
        result_original[key] = metrics.confusion_matrix(original, original_ytest)

    pickle.dump(classifiers['KNearest'], open('./csv/model/KNearest.pkl', 'wb'))
    pickle.dump(classifiers['LogisticRegression'],
                open('./csv/model/LogisticRegression.pkl', 'wb'))
    pickle.dump(classifiers['DecisionTreeClassifier'],
                open('./csv/model/DecisionTreeClassifier.pkl', 'wb'))
    return True
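# transform_average_result() is not defined in this snippet; a plausible
# minimal implementation (an assumption, not the original helper) would
# threshold the fold-averaged 0/1 predictions back into hard labels, i.e. a
# majority vote across folds:
import numpy as np

def transform_average_result(avg_pred):
    return (np.asarray(avg_pred) >= 0.5).astype(int)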
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Implementing SMOTE Technique
# Cross Validating the right way
# Parameters (liblinear supports both the 'l1' and 'l2' penalties in the grid)
log_reg_params = {"penalty": ['l1', 'l2'],
                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()
rand_log_reg = RandomizedSearchCV(LogisticRegression(solver='liblinear'),
                                  log_reg_params, n_iter=4)

for train, test in stratified_fold.split(original_Xtrain, original_ytrain):
    # SMOTE happens during cross-validation, not before: only the training
    # fold is oversampled, so the test fold stays untouched.
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'),
                                        rand_log_reg)
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])
    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
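# A minimal sketch of the same "resample inside CV" idea using
# cross_val_score directly: because imblearn's pipeline applies SMOTE only
# when each training fold is fit, the scores below are computed on untouched
# validation folds (the scoring choice is illustrative):
from sklearn.model_selection import cross_val_score

safe_pipe = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'),
                                     LogisticRegression(solver='liblinear'))
fold_f1 = cross_val_score(safe_pipe, original_Xtrain, original_ytrain,
                          cv=5, scoring='f1')
print("f1 per fold:", fold_f1)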
Y_undersample_train, Y_undersample_test = Y_undersample.iloc[
    train_index], Y_undersample.iloc[test_index]
X_undersample_train = X_undersample_train.values
X_undersample_test = X_undersample_test.values
Y_undersample_train = Y_undersample_train.values
Y_undersample_test = Y_undersample_test.values
# %%
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []
# %%
for train_index, test_index in SKfold.split(X_undersample_train, Y_undersample_train):
    undersample_pipeline = imbalanced_make_pipeline(
        NearMiss(sampling_strategy="majority"), log_reg)
    undersample_model = undersample_pipeline.fit(
        X_undersample_train[train_index], Y_undersample_train[train_index])
    undersample_prediction = undersample_model.predict(
        X_undersample_train[test_index])
    # Score against the same fold the predictions were made on (the original
    # mixed in og_X_train/og_Y_train here, which belong to a different split).
    undersample_accuracy.append(
        undersample_pipeline.score(X_undersample_train[test_index],
                                   Y_undersample_train[test_index]))
    undersample_precision.append(
        precision_score(Y_undersample_train[test_index], undersample_prediction))
    undersample_recall.append(
        recall_score(Y_undersample_train[test_index], undersample_prediction))
    undersample_f1.append(
        f1_score(Y_undersample_train[test_index], undersample_prediction))
    undersample_auc.append(
        roc_auc_score(Y_undersample_train[test_index], undersample_prediction))
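# %%
# Sketch: report the fold-averaged metrics collected above (the print format
# is an assumption, matching the style of the neighbouring snippets):
print("accuracy: {}".format(np.mean(undersample_accuracy)))
print("precision: {}".format(np.mean(undersample_precision)))
print("recall: {}".format(np.mean(undersample_recall)))
print("f1: {}".format(np.mean(undersample_f1)))
print("AUC: {}".format(np.mean(undersample_auc)))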
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique
# Distribution of NearMiss (just to see how it distributes the labels; we
# won't use these variables). fit_resample replaces the old fit_sample API.
X_nearmiss, y_nearmiss = NearMiss().fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss label distribution:', Counter(y_nearmiss))

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    # NearMiss undersampling happens during cross-validation, not before.
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg)
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
    undersample_accuracy.append(undersample_pipeline.score(undersample_Xtrain[test], undersample_ytrain[test]))
    undersample_precision.append(precision_score(undersample_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(undersample_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(undersample_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(undersample_ytrain[test], undersample_prediction))

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 14), sharey=True)
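# Sketch of the call the learning-curve plotting helper defined further below
# presumably makes for each estimator; cv and the parameter values here are
# illustrative assumptions:
cv_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
train_sizes_abs, train_scores, test_scores = learning_curve(
    log_reg, undersample_Xtrain, undersample_ytrain,
    cv=cv_split, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5))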
undersample_ytest = undersample_ytest.values
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique (i.e. undersampling)
# Distribution of NearMiss (just to see how it distributes the labels; we
# won't use these variables). fit_resample replaces the old fit_sample API.
X_nearmiss, y_nearmiss = NearMiss().fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))

# Cross Validating
for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    # NearMiss undersampling happens during cross-validation, not before.
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg)
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
    undersample_accuracy.append(undersample_pipeline.score(undersample_Xtrain[test], undersample_ytrain[test]))
    undersample_precision.append(precision_score(undersample_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(undersample_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(undersample_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(undersample_ytrain[test], undersample_prediction))

# Let's plot the LogisticRegression learning curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator1, estimator2, estimator3, estimator4, X, y,
                        ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
def ClassifierTesting(method_name, method, method_prop, sampling_algorithm):
    acc_lst = []
    prec_lst = []
    rec_lst = []
    f1_lst = []
    auc_lst = []
    random_search = RandomizedSearchCV(method, method_prop, n_iter=20)
    for train, test in strat_cross_val.split(original_train_data, original_train_index):
        # Resampling happens during cross-validation, not before: the sampler
        # only touches the training fold inside fit().
        cross_val_model = imbalanced_make_pipeline(sampling_algorithm, random_search)
        cross_val_model.fit(original_train_data[train], original_train_index[train])
        best_est = random_search.best_estimator_
        prediction = best_est.predict(original_train_data[test])
        acc_lst.append(cross_val_model.score(original_train_data[test],
                                             original_train_index[test]))
        prec_lst.append(precision_score(original_train_index[test], prediction))
        rec_lst.append(recall_score(original_train_index[test], prediction))
        f1_lst.append(f1_score(original_train_index[test], prediction))
        auc_lst.append(roc_auc_score(original_train_index[test], prediction))

    print('_' * 50)
    print(method_name)
    print('_' * 50)
    print('Cross-validation results:')
    print("accuracy:{0:.3f}".format(np.mean(acc_lst) * 100), '%')
    print("precision:{0:.3f}".format(np.mean(prec_lst) * 100), '%')
    print("recall:{0:.3f}".format(np.mean(rec_lst) * 100), '%')
    print("f1:{0:.3f}".format(np.mean(f1_lst) * 100), '%')
    print("Roc Auc:{0:.3f}".format(np.mean(auc_lst) * 100), '%')
    print('_' * 50)
    print('_' * 50)

    # Evaluate the best estimator from the last fold on the held-out test set.
    acc_lst = []
    prec_lst = []
    rec_lst = []
    f1_lst = []
    auc_lst = []
    prediction = best_est.predict(original_test_data)
    acc_lst.append(accuracy_score(original_test_index, prediction))
    prec_lst.append(precision_score(original_test_index, prediction))
    rec_lst.append(recall_score(original_test_index, prediction))
    f1_lst.append(f1_score(original_test_index, prediction))
    auc_lst.append(roc_auc_score(original_test_index, prediction))
    print('Test results:')
    print("accuracy:{0:.3f}".format(np.mean(acc_lst) * 100), '%')
    print("precision:{0:.3f}".format(np.mean(prec_lst) * 100), '%')
    print("recall:{0:.3f}".format(np.mean(rec_lst) * 100), '%')
    print("f1:{0:.3f}".format(np.mean(f1_lst) * 100), '%')
    print("Roc Auc:{0:.3f}".format(np.mean(auc_lst) * 100), '%')
    print('_' * 50)

    cm = confusion_matrix(original_test_index, prediction)
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Purples)
    ax.set_title(method_name, fontsize=18)
    ax.set_xticklabels(['Legitimate', 'Fraudulent'], fontsize=10, rotation=0)
    ax.set_yticklabels(['Legitimate', 'Fraudulent'], fontsize=10, rotation=90)
    ax.set_xlabel('Predicted values', fontsize=12, rotation=0)
    ax.set_ylabel('True values', fontsize=12, rotation=90)
    plt.show()

    FPR, TPR, _ = roc_curve(original_test_index, prediction)
    custom_roc_curve(FPR, TPR, method_name)
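# Hypothetical call for ClassifierTesting(), reusing the logistic regression
# grid seen elsewhere in this file (all argument values here are
# illustrative assumptions, not the original invocation):
ClassifierTesting('LogisticRegression',
                  LogisticRegression(solver='liblinear'),
                  {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]},
                  SMOTE(sampling_strategy='minority'))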
def cv_grid_search(sampling, X, y, model_indices=False):
    log_reg_params = {
        "penalty": ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
    knears_params = {
        "n_neighbors": list(range(2, 5, 1)),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }
    svc_params = {
        'C': [0.5, 0.7, 0.9, 1],
        'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
    }
    tree_params = {
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(2, 4, 1)),
        "min_samples_leaf": list(range(5, 7, 1))
    }
    # liblinear supports both penalties in the logistic regression grid.
    grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
    grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
    grid_svc = GridSearchCV(SVC(), svc_params)
    grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
    models = np.array([grid_log_reg, grid_knears, grid_svc, grid_tree])
    best_ests = []
    if model_indices:
        models = models[model_indices]

    for grid_model in models:
        print("Model: " + str(grid_model))
        # Prepare the initial train and test split (the loop keeps the
        # indices of the last fold).
        splitter = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Turn into arrays
        X_train, X_test = X_train.values, X_test.values
        y_train, y_test = y_train.values, y_test.values

        # Lists to append the scores and then find the average
        accuracy_lst, precision_lst = [], []
        recall_lst, f1_lst = [], []
        auc_lst = []
        for train, test in splitter.split(X_train, y_train):
            # Resample inside the pipeline so only the training fold is touched;
            # don't reassign `grid_model`, or later folds would wrap the
            # already-fitted pipeline again.
            pipeline = imbalanced_make_pipeline(sampling, grid_model)
            pipeline.fit(X_train[train], y_train[train])
            # best_estimator_ lives on the grid-search object, not the pipeline.
            best_est = grid_model.best_estimator_
            prediction = best_est.predict(X_train[test])
            accuracy_lst.append(pipeline.score(X_train[test], y_train[test]))
            precision_lst.append(precision_score(y_train[test], prediction))
            recall_lst.append(recall_score(y_train[test], prediction))
            f1_lst.append(f1_score(y_train[test], prediction))
            auc_lst.append(roc_auc_score(y_train[test], prediction))
        print("accuracy: {}".format(np.mean(accuracy_lst)))
        print("precision: {}".format(np.mean(precision_lst)))
        print("recall: {}".format(np.mean(recall_lst)))
        print("f1: {}".format(np.mean(f1_lst)))
        print("AUC: {}".format(np.mean(auc_lst)))
        best_ests.append(best_est)
    return best_ests
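# Usage sketch for cv_grid_search(): X and y must be pandas objects, since
# the function indexes them with .iloc; model_indices selects grids by
# position. The names below reuse the synthetic data from the get_models()
# sketch above and are illustrative assumptions:
import pandas as pd

X_df = pd.DataFrame(X_demo)
y_ser = pd.Series(y_demo)
best = cv_grid_search(SMOTE(sampling_strategy='minority'), X_df, y_ser,
                      model_indices=[0, 3])  # logistic regression + decision tree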