def main(path, began, is_cent_data=0, iterations=30, temperature=5,
         attractive_force=1, repulsive_force=0.4, speed=0.02, k=0.5):
    # Read the data: load the KEEL file and normalize it into a DataFrame
    my_data = tool.unitilize_data(tool.read_KEEL_data(path, began))
    # Prepare training and test data, process them with the FR model and the
    # SMOTE model to form a new dataset, then build a model on the processed
    # data and predict.
    # Whether to center the data first
    if is_cent_data != 0:
        cent_point = get_cent_point(my_data)
    else:
        cent_point = np.array(my_data)
    fr_data = pd.DataFrame(fr(cent_point, iterations, temperature,
                              attractive_force, repulsive_force, speed, k))
    fr_data_x = fr_data.iloc[:, 0:-1]
    fr_data_y = fr_data.iloc[:, -1]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Randomly split off training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(
        fr_data_x, fr_data_y, test_size=0.3, random_state=42)
    # Generate a balanced dataset from the sampled training data
    my_B_SMOTE = B_SMOTE()
    fr_data_x_smote, fr_data_y_smote = my_B_SMOTE.fit_sample(X_train, Y_train)
    # Processed data
    print("Processed data:")
    train(fr_data_x_smote, fr_data_y_smote, X_test, Y_test)
    # Unprocessed data
    X_train_org, X_test_org, Y_train_org, Y_test_org = train_test_split(
        my_data.iloc[:, 0:-1], my_data.iloc[:, -1],
        test_size=0.3, random_state=42)
    data_x_org, data_y_org = my_B_SMOTE.fit_sample(X_train_org, Y_train_org)
    print("Unprocessed data:")
    train(data_x_org, data_y_org, X_test_org, Y_test_org)
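# Hypothetical invocation sketch for main(); the KEEL file path and the
# `began` header offset below are placeholders for illustration only, not
# values taken from this project.
if __name__ == "__main__":
    main("data/yeast1.dat", began=15, is_cent_data=1)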
def oversample_with_smote(x_train, y_train, iterator=10):
    '''
    Oversample the data using SMOTE.
    :param x_train: data fed to the model
    :param y_train: target the model predicts
    :param iterator: number of sampling repetitions
    :return: oversampled X, Y
    '''
    sm = BorderlineSMOTE()
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_fin = []
    y_train_fin = []
    for i in range(iterator):
        temp_x = []
        temp_y = []
        indexes = list(range(len(y_train_sm)))
        random.shuffle(indexes)
        cnt = 0
        max_cnt = len(y_train_sm) // 10
        for j in indexes:
            x = x_train_sm[j]
            y = y_train_sm[j]
            if y == i % 2:
                temp_x.append(x)
                temp_y.append(y)
            elif cnt < max_cnt:
                temp_x.append(x)
                temp_y.append(y)
                cnt += 1
        x_sm_new, y_sm_new = sm.fit_sample(temp_x, temp_y)
        x_train_fin.extend(x_sm_new)
        y_train_fin.extend(y_sm_new)
    return x_train_fin, y_train_fin
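# A minimal usage sketch for oversample_with_smote, assuming a toy imbalanced
# binary dataset; the make_classification sizes and class weights below are
# illustrative assumptions, not values from the original project.
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, n_features=5,
                                   weights=[0.9, 0.1], random_state=42)
X_fin, y_fin = oversample_with_smote(X_toy, y_toy, iterator=4)
print(len(y_fin))  # a pool several times larger; each pass caps one class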
def test_borderline_smote(kind):
    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
    bsmote_nn = BorderlineSMOTE(kind=kind, random_state=42,
                                k_neighbors=NearestNeighbors(n_neighbors=6),
                                m_neighbors=NearestNeighbors(n_neighbors=11))
    X_res_1, y_res_1 = bsmote.fit_sample(X, Y)
    X_res_2, y_res_2 = bsmote_nn.fit_sample(X, Y)
    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
def del_set_smote_data(self):
    """Apply SMOTE to the training data and update the training data."""
    # If there are too few positive samples, lower the sampling rate instead
    positive_count_train = self.y_train.sum()
    negative_count_train = len(self.y_train) - positive_count_train
    print("check y_train value 0:" + str(negative_count_train) +
          " 1:" + str(positive_count_train))
    if positive_count_train >= 6:
        smote = BorderlineSMOTE()
        self.X_train, self.y_train = smote.fit_sample(
            self.X_train, self.y_train)
    else:
        print("----- RandomOverSampler ----- ")
        ros = RandomOverSampler(
            # ratio={1: self.X_train.shape[0], 0: self.X_train.shape[0] // 3}, random_state=71)
            ratio={
                1: negative_count_train,
                0: negative_count_train
            },
            random_state=71)
        # Reflect the result in the training data
        self.X_train, self.y_train = ros.fit_sample(
            self.X_train, self.y_train)
    print("-- after sampling: " +
          str(np.unique(self.y_train, return_counts=True)))
def Smote_bd(data, label):
    # Borderline-SMOTE targets minority samples at least half of whose nearest
    # neighbors belong to other classes (such samples are called "danger"
    # samples); the random neighbor b used for interpolation may come from a
    # different class than the minority sample a.
    from imblearn.over_sampling import BorderlineSMOTE
    smote = BorderlineSMOTE(random_state=0)
    data_smote_bd, label_smote_bd = smote.fit_sample(data, label)
    return data_smote_bd, label_smote_bd
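# A hedged illustration of calling Smote_bd on a toy imbalanced dataset; the
# 90/10 class split and overlapping Gaussians below are assumptions chosen so
# that "danger" samples are likely to exist.
import numpy as np
from collections import Counter

rng = np.random.RandomState(0)
data_toy = np.vstack([rng.normal(0, 1, (90, 2)), rng.normal(1, 1, (10, 2))])
label_toy = np.array([0] * 90 + [1] * 10)
data_res, label_res = Smote_bd(data_toy, label_toy)
print(Counter(label_res))  # minority synthesized toward balance when borderline samples exist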
def oversample_remainingSet(self, instances, labels, kind='borderline-1'):
    """Oversamples the remaining set (using BorderlineSMOTE) after a drift is detected."""
    if len(np.unique(labels)) >= 2:
        minority_class = collections.Counter(labels.tolist()).most_common()[-1][0]
        if np.sum(labels == minority_class) > self.n_neighbors:
            oversample = BorderlineSMOTE(k_neighbors=self.n_neighbors,
                                         m_neighbors=5, kind=kind,
                                         random_state=self.random_state)
            instances, labels = oversample.fit_sample(instances, labels)
    return instances, labels
def classification(self, X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    # text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
    vectorizer = TfidfVectorizer()
    # vectorizer2 = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    sm = BorderlineSMOTE()
    X_res, Y_res = sm.fit_sample(X_train_tfidf, y_train)
    clf = MultinomialNB()
    clf.fit(X_res, Y_res)
    prediction = clf.predict(X_test_tfidf)
    print(prediction)
    # elapsed time since start_time
    final_time = datetime.datetime.now() - start_time
    print(final_time)
    print(metrics.classification_report(y_test, prediction))
    print(metrics.roc_auc_score(y_test, prediction))
def _SMOTE_Border(self):
    # Oversampling - SMOTE - Synthetic Minority Over-sampling Technique
    print("before SMOTE df", self.x_train.shape)
    smote = BorderlineSMOTE(
        k_neighbors=5, m_neighbors=5, random_state=self.seed
    )  # sampling_strategy=0.8
    self.X_train_smote, self.y_train_smote = smote.fit_sample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])
    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )
    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))
    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels

    # get_feature_correlations(data_frame, plot=True, return_resulst=False)
    # mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    # print("mutual_info: ", mutual_info)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42, shuffle=True)

    ########## Handle Class Imbalance #########
    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    ###########################################################################
    # 4. Scale data                                                           #
    ###########################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()

    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)

    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=[100]*5, random_state=42))

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # models = [SVC(kernel='poly'), RandomForestClassifier(), GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i]  # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    # pipeline = Pipeline(
    #     [
    #         # ('selector', SelectKBest(f_classif)),
    #         ('model', RandomForestClassifier(n_jobs=-1))
    #     ]
    # )
    #
    # # Perform grid search on the classifier using f1 score as the scoring method
    # grid_obj = GridSearchCV(
    #     estimator=GradientBoostingClassifier(),
    #     param_grid={
    #         # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
    #         'n_estimators': [10, 20, 30],
    #         'max_depth': [6, 10, 20, 30],
    #         # 'max_depth': [1, 10, 20, 30],
    #         'min_samples_split': [1, 10, 100]
    #         # 'model__n_estimators': np.arange(10, 200, 10)
    #         # 'C': [1, 10, 100]
    #     },
    #     # n_jobs=-1,
    #     scoring="f1_micro",
    #     cv=5,
    #     verbose=3
    # )
    #
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit = grid_obj.fit(X_resampled, y_resampled)
    #
    # # Get the best estimator
    # best_clf = grid_fit.best_estimator_
    # print(best_clf)

    # Get the final model
    parent_model = SVC(kernel='rbf', C=10)  # KNN(n_neighbors = 7)-0.52
    # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")
    print_evaluation_results(y_test, test_predictions, train=False)

    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # Plot normalized confusion matrix
    # fig = plt.figure()
    # fig.set_size_inches(8, 8, forward=True)
    # # fig.align_labels()
    # plot_confusion_matrix(cnf_matrix, classes=["1", "2", "3", "4"], normalize=False, title='Normalized confusion matrix')

    # probs = parent_model.predict_proba(X_test)
    # print("Prediction probabilities for Region\n", probs)
    # plotConfusionMatrix(X_test, y_test, ['1', '2', '3', '4'])

    joblib.dump(parent_model, filename='../resources/models/parent_classifier.pkl')
def runSMOTEvariationsGen(self, folder):
    """
    Create files with SMOTE preprocessing and without preprocessing.

    :param datasets: datasets.
    :param folder: cross-validation folders.
    :return:
    """
    smote = SMOTE()
    borderline1 = BorderlineSMOTE(kind='borderline-1')
    borderline2 = BorderlineSMOTE(kind='borderline-2')
    smoteSVM = SVMSMOTE()
    geometric_smote = GeometricSMOTE(n_jobs=-1)

    for dataset in datasets:  # binary and multiclass
        for fold in range(5):
            path = os.path.join(folder, dataset, str(fold),
                                ''.join([dataset, "_train.csv"]))
            train = np.genfromtxt(path, delimiter=',')
            X = train[:, 0:train.shape[1] - 1]
            Y = train[:, train.shape[1] - 1]

            # SMOTE
            print("SMOTE..." + dataset)
            X_res, y_res = smote.fit_sample(X, Y)
            y_res = y_res.reshape(len(y_res), 1)
            newdata = np.hstack([X_res, y_res])
            newtrain = pd.DataFrame(newdata)
            newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                         ''.join([dataset, "_SMOTE.csv"])),
                            header=False, index=False)

            # SMOTE BORDERLINE1
            print("Borderline1..." + dataset)
            X_res, y_res = borderline1.fit_sample(X, Y)
            y_res = y_res.reshape(len(y_res), 1)
            newdata = np.hstack([X_res, y_res])
            newtrain = pd.DataFrame(newdata)
            newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                         ''.join([dataset, "_Borderline1.csv"])),
                            header=False, index=False)

            # SMOTE BORDERLINE2
            print("Borderline2..." + dataset)
            X_res, y_res = borderline2.fit_sample(X, Y)
            y_res = y_res.reshape(len(y_res), 1)
            newdata = np.hstack([X_res, y_res])
            newtrain = pd.DataFrame(newdata)
            newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                         ''.join([dataset, "_Borderline2.csv"])),
                            header=False, index=False)

            # SMOTE SVM
            print("SMOTE SVM..." + dataset)
            X_res, y_res = smoteSVM.fit_sample(X, Y)
            y_res = y_res.reshape(len(y_res), 1)
            newdata = np.hstack([X_res, y_res])
            newtrain = pd.DataFrame(newdata)
            newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                         ''.join([dataset, "_smoteSVM.csv"])),
                            header=False, index=False)

            # GEOMETRIC SMOTE
            print("GEOMETRIC SMOTE..." + dataset)
            X_res, y_res = geometric_smote.fit_resample(X, Y)
            y_res = y_res.reshape(len(y_res), 1)
            newdata = np.hstack([X_res, y_res])
            newtrain = pd.DataFrame(newdata)
            newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                         ''.join([dataset, "_Geometric_SMOTE.csv"])),
                            header=False, index=False)
          activation='relu'))
model.add(
    Dense(2, kernel_initializer='random_normal', activation='softmax'))
# compile the keras model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_res, y_train_resv2, epochs=100, batch_size=36)
_, accuracy = model.evaluate(X_test, y_testv2)
acc1.append(accuracy)

smborder = BorderlineSMOTE(sampling_strategy=class_dist)
X_train_res, y_train_res = smborder.fit_sample(X_train, y_train)
X_train_res, y_train_res = shuffle(X_train_res, y_train_res)
y_train_resv2 = ohe.fit_transform(y_train_res).toarray()
y_testv2 = ohe.fit_transform(y_test).toarray()
y_train_resv2 = pd.DataFrame(y_train_resv2)
y_testv2 = pd.DataFrame(y_testv2)
model = Sequential()
model.add(
    Dense(20, kernel_initializer='random_normal',
          input_dim=list(X_train.shape)[1],
          activation='relu'))
model.add(
    Dense(75, kernel_initializer='random_normal',
print("Before OverSampling, the shape of X_train: {}".format(X_train.shape)) # SMOTE 적용 이전 데이터 형태 print("Before OverSampling, the shape of y_train: {}".format(y_train.shape)) # SMOTE 적용 이전 데이터 형태 print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape)) # SMOTE 적용 결과 확인 print('After OverSampling, the shape of y_train: {} \n'.format(y_train_res.shape)) # # SMOTE 적용 결과 확인 lgbm_clf2 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier lgbm_clf2.fit(X_train_res, y_train_res) # 학습 진행 y_pred2 = lgbm_clf2.predict(X_test) # 평가 데이터셋 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred2)) # 혼돈행렬 print('\n') print("Model Evaluation Result: \n", classification_report(y_test, y_pred2)) # 전체적인 성능 평가 # BLSM (Borderline SMOTE) from imblearn.over_sampling import BorderlineSMOTE sm2 = BorderlineSMOTE(random_state = 42) # BLSM 알고리즘 적용 X_train_res2, y_train_res2 = sm2.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용 lgbm_clf3 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier lgbm_clf3.fit(X_train_res2, y_train_res2) # 학습 진행 y_pred3 = lgbm_clf3.predict(X_test) # 평가 데이터셋 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred3)) # 혼돈행렬 print('\n') print("Model Evaluation Result: \n", classification_report(y_test, y_pred3)) # 전체적인 성능 평가 # SVMSMOTE from imblearn.over_sampling import SVMSMOTE sm3 = SVMSMOTE(random_state = 42) # SVMSMOTE 알고리즘 적용 X_train_res3, y_train_res3 = sm3.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용 lgbm_clf4 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier lgbm_clf4.fit(X_train_res3, y_train_res3) # 학습 진행 y_pred4 = lgbm_clf4.predict(X_test) # 평가 데이터셋 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred4)) # 혼돈행렬
print('Before: Class{}'.format(Counter(kelas)))
df = dataset.copy()
del df[21]
x_train, x_test, y_train, y_test = train_test_split(df, kelas,
                                                    test_size=.1,
                                                    random_state=10)

# balancing data
# sm = SMOTETomek()
# sm = SMOTE(random_state=42)
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=42)
df_resm, kelas_res = sm.fit_sample(df, kelas)

from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
df_resm2, kelas_res2 = tl.fit_sample(df_resm, kelas_res)

print('After: Class{}'.format(Counter(kelas_res)))
# df_res_vis = pca.transform(df_resm)

besar = dataset.groupby(kelas).size()
besar = list(besar)
koor_x = ['false', 'true']
koor_y = besar
kelas_res = list(kelas_res)
valp = kelas_res.count(False)
valn = kelas_res.count(True)
new_y = []
    'GB': [[0, 0], [0, 0]]
}
svm_sm_scores = {'LR': 0, 'AB': 0, 'GB': 0}
svm_sm_con_mat = {
    'LR': [[0, 0], [0, 0]],
    'AB': [[0, 0], [0, 0]],
    'GB': [[0, 0], [0, 0]]
}

for train_index, test_index in skf.split(features, target):
    # Borderline Smote
    bl_smote = BorderlineSMOTE(random_state=0, kind='borderline-1')
    X_train, y_train = bl_smote.fit_sample(features[train_index],
                                           target[train_index])

    # Logistic Regression
    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)
    res = logistic.predict(features[test_index])
    bl_smote_scores['LR'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['LR'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # Ada Boost Classifier
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    bl_smote_scores['AB'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels

    # ['age', 'degree-of-diffe', 'sex_2', 'histologic-type_2', 'bone_2',
    #  'neck_2', 'mediastinum_2', 'abdominal_2']
    # data_frame.drop(['lung_2', 'pleura_2', 'peritoneum_2', 'liver_2', 'brain_2', 'skin_2', 'supraclavicular_2',
    #                  'axillar_2', 'bone-marrow_2'], axis=1, inplace=True)

    # get_feature_correlations(data_frame, plot=True, return_resulst=False)

    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)

    # 0.3 test size = 0.56 f1
    # 0.2 test size = 0.61 f1
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

    # reject_sampler = FunctionSampler(func=outlier_rejection)
    # X_train, y_train = reject_sampler.fit_resample(X_train, y_train)

    # Baseline
    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    ########## Handle Class Imbalance #########
    sm = BorderlineSMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    sf = SelectKBest(f_classif, k='all')
    sf_fit = sf.fit(X_resampled, y_resampled)
    # print feature scores
    for i in range(len(sf_fit.scores_)):
        print(' %s: %f' % (X_resampled.columns[i], sf_fit.scores_[i]))

    # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # plt.show()

    # models = [SVC(kernel='poly'), RandomForestClassifier(), GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i]  # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    model = RandomForestClassifier(random_state=RANDOM_STATE)

    ########################################### Hyper-parameter Tuning ##########################################
    best_clf_rf = tune_random_forest(model, X_resampled, y_resampled)

    # g = GridSearchCV(
    #     estimator=GradientBoostingClassifier(),
    #     param_grid={
    #         "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    #         "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    #         "min_child_weight": [1, 3, 5, 7],
    #         "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    #         "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
    #     },
    #     n_jobs=-1,
    #     scoring="f1_micro",
    #     cv=5,
    #     verbose=1
    # )
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit = grid_obj.fit(X_resampled, y_resampled)
    #
    # # Get the best estimator
    # best_clf_gb = grid_fit.best_estimator_
    # print(best_clf_gb)
    ########################################### Final Model ###########################################
    parent_model = best_clf_rf
    # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")
    print_evaluation_results(y_test, test_predictions, train=False)

    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
    sel = SelectFromModel(best_clf_rf)
    sel.fit(X_resampled, y_resampled)
    print(sel.get_support())
    selected_feat = X_resampled.columns[(sel.get_support())]
    print(len(selected_feat))
    print(selected_feat)
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from imblearn.over_sampling import BorderlineSMOTE
import matplotlib as mpl
import matplotlib
# from collections import Counter

path1 = r'/Users/ada/Desktop/xgboost/no.2/newlasso0720.csv'
data1 = np.loadtxt(path1, delimiter=',')
label_1 = np.ones((int(178), 1))  # Value can be changed
label_2 = np.zeros((int(226), 1))
label = np.append(label_1, label_2)

smo = BorderlineSMOTE(kind='borderline-1',
                      sampling_strategy={0: 246, 1: 246})  # kind='borderline-2'
X_smo, y_smo = smo.fit_sample(data1, label)
X = X_smo
y = y_smo
sepscores = []

cv_clf = XGBClassifier(
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0.4,
    # learning_rate=0.1,
    # max_delta_step=0,
    max_depth=4,
    # min_child_weight=1,
    missing=None,
    n_estimators=100,  #
def resample_train_data(x_train, y_train, over=True):
    """
    Currently testing methods for re-sampling an imbalanced dataset.

    :param x_train: Training explanatory features to be re-sampled
    :param y_train: Training explained features to be re-sampled
    :param over: kwarg to oversample data
    :return: x_train_res, y_train_res (re-sampled training dataset)
    """
    if over:
        rs = BorderlineSMOTE(
            sampling_strategy="auto",
            random_state=69,
            k_neighbors=5,
            n_jobs=8,
            m_neighbors=10,
            kind="borderline-1",
        )
    else:
        rs = NeighbourhoodCleaningRule(
            sampling_strategy="auto",
            return_indices=False,
            random_state=69,
            n_neighbors=3,
            kind_sel="all",
            threshold_cleaning=0.1,
            n_jobs=8,
            ratio=None,
        )
        # rs = NearMiss(
        #     sampling_strategy="auto",
        #     return_indices=False,
        #     random_state=69,
        #     version=1,
        #     n_neighbors=3,
        #     n_neighbors_ver3=3,
        #     n_jobs=8,
        #     ratio=None,
        # )

    print("Before reSampling, the shape of train_X: {}".format(x_train.shape))
    print("Before reSampling, the shape of train_y: {} \n".format(y_train.shape))
    print("Before reSampling, counts of label '1': {}".format(sum(y_train == 1)))
    print("Before reSampling, counts of label '0': {}".format(sum(y_train == 0)))

    x_train_res, y_train_res = rs.fit_sample(x_train, y_train)

    print("After reSampling, the shape of train_X: {}".format(x_train_res.shape))
    print("After reSampling, the shape of train_y: {} \n".format(y_train_res.shape))
    print("After reSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
    print("After reSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

    return x_train_res, y_train_res
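# Minimal usage sketch for resample_train_data (the oversampling branch); the
# synthetic imbalanced dataset below is an assumption for illustration only.
from sklearn.datasets import make_classification

x_toy, y_toy = make_classification(n_samples=500, weights=[0.9, 0.1],
                                   random_state=69)
x_res, y_res = resample_train_data(x_toy, y_toy, over=True)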
])
target = 'class'
data = ('fault', 'road', 'river', 'lithology', 'elevation', 'slope', 'NDVI',
        'profile', 'plan', 'aspect', 'geological', 'rain', 'SPI', 'TWI',
        'TRI', 'STI', 'LUCC')
x_columns = [x for x in df.columns if x not in [target]]
x = df[x_columns]
y = df['class']
groupby_data_original = df.groupby('class').count()  # classified summary of "class"
print(groupby_data_original)  # print the class distribution of the original sample set

# Use BorderlineSMOTE to oversample
model_bsmote = BorderlineSMOTE()  # build BorderlineSMOTE object
x_bsmote_resampled, y_bsmote_resampled = model_bsmote.fit_sample(x, y)  # input data to oversample
x_bsmote_resampled = pd.DataFrame(
    x_bsmote_resampled,
    columns=['fault', 'road', 'river', 'lithology', 'elevation', 'slope',
             'NDVI', 'profile', 'plan', 'aspect', 'geological', 'rain',
             'SPI', 'TWI', 'TRI', 'STI', 'LUCC'])
y_bsmote_resampled = pd.DataFrame(y_bsmote_resampled, columns=['class'])
bsmote_resampled = pd.concat([x_bsmote_resampled, y_bsmote_resampled], axis=1)
groupby_data_bsmote = bsmote_resampled.groupby('class').count()  # classified summary of "class"
print(groupby_data_bsmote)  # print the class distribution of the dataset processed by BorderlineSMOTE
exp = DataFrame(bsmote_resampled)
def borderline_smote(x, y):
    print("----Borderline SMOTE----")
    sampler = BorderlineSMOTE(random_state=42)
    X, y = sampler.fit_sample(x, y)
    return X, y
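# Usage sketch for the borderline_smote wrapper on a toy imbalanced dataset;
# the make_classification parameters are illustrative assumptions.
from sklearn.datasets import make_classification

x_imb, y_imb = make_classification(n_samples=300, weights=[0.9, 0.1],
                                   random_state=42)
X_bal, y_bal = borderline_smote(x_imb, y_imb)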
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from prepare import readbunchobj

data = readbunchobj('dataset_woe.data')
X_train = pd.DataFrame(data.X_train)
X_test = data.X_test
y_train = data.y_train
y_test = data.y_test

# osp = RandomUnderSampler(random_state=10)
osp = BorderlineSMOTE()
X_train, y_train = osp.fit_sample(X_train, y_train)  # SMOTE

# fsel = SelectFromModel(GradientBoostingClassifier())
# X_train = fsel.fit_transform(X_train, y_train)
# X_test = fsel.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'.format(
    c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1: %.4f" % metrics.f1_score(y_test, y_pred))
def test_borderline_smote_wrong_kind():
    bsmote = BorderlineSMOTE(kind='rand')
    with pytest.raises(ValueError, match='The possible "kind" of algorithm'):
        bsmote.fit_sample(X, Y)
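# For contrast, a sketch with the values imblearn's BorderlineSMOTE does
# accept for `kind`: 'borderline-1' (the default) and 'borderline-2'.
bsmote_ok = BorderlineSMOTE(kind='borderline-2', random_state=42)
X_ok, y_ok = bsmote_ok.fit_sample(X, Y)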
NNperformance(init_mode, act, opt, n_top_features, epochs, batch_size, labels,
              X_train_sfs_scaled, y_train, X_test_sfs_scaled, y_test)

from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler

smt = SMOTE()
nr = NearMiss()
bsmt = BorderlineSMOTE(random_state=42)
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)

X_train_bal, y_train_bal = bsmt.fit_sample(X_train_sfs_scaled, y_train)
print(np.bincount(y_train_bal))

NNperformance(init_mode, act, opt, n_top_features, epochs, batch_size, labels,
              X_train_bal, y_train_bal, X_test_sfs_scaled, y_test)

# Plot decision region
def plot_classification(model, X_t, y_t):
    clf = model
    pca = PCA(n_components=2)
    X_t2 = pca.fit_transform(X_t)
    clf.fit(X_t2, np.array(y_t))
    plot_dr(X_t2, np.array(y_t), clf=clf, legend=2)

model_bal = NNmodel(init_mode, act, opt, n_top_features=2)
plot_classification(model_bal, X_test_sfs_scaled, y_test)
    var = selector.variances_
    plt.bar(select_features, var)
    # plt.show()
'''

# Correlation coefficient method
'''
kbest = SelectKBest(chi2, k=10)
kbest.fit_transform(abs(train_x), train_y)
a = kbest.scores_
plt.bar(select_features, a)
plt.show()
'''

"""
======================== Oversampling =============================
"""
over_samples = BorderlineSMOTE(random_state=2020)
over_samples_x, over_samples_y = over_samples.fit_sample(train_x, train_y)
over_samples_x = pd.DataFrame(over_samples_x)
over_samples_x.columns = select_features
# print(pd.Series(over_samples_y).value_counts()/len(over_samples_y))

"""
======================== Decomposition =============================
"""
'''
pca = PCA(n_components='mle')
# pca = SparsePCA(n_components = 15)
pca.fit(over_samples_x)
train_x_new = pca.transform(over_samples_x)
test_x_new = pca.transform(test_x)
'''
'''
pca = PCA(n_components='mle')
# pca = SparsePCA(n_components = 15)
pca.fit(train_x)
# print('Mean KS statistic: ' + str(np.mean(kss)))
# print(gv.best_estimator_, gv.best_score_, gv.best_params_)
# y_pred = gv.predict(X)
# y_predprob = gbm2.predict_proba(X)[:, 1]
# print("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

imf = pd.DataFrame()
for train_index, test_index in sfolder.split(xdata, ydata):
    train_data = xdata.iloc[train_index, :]
    train_label = ydata[train_index]
    test_data = xdata.iloc[test_index, :]
    test_label = ydata[test_index]
    x_smo, y_smo = smo.fit_sample(train_data, train_label)
    # gbdt.fit(pca.fit_transform(x_smo), y_smo)
    gbdt.fit(x_smo, y_smo)
    # gbdt.fit(train_data, train_label)
    # score.append(gbdt.score(pca.transform(test_data), test_label))
    score.append(gbdt.score(test_data, test_label))
    ypre = pd.Series(gbdt.predict(test_data), name='ypre')
    # ypre = pd.Series(gbdt.predict(pca.transform(test_data)), name='ypre')
    prob = pd.DataFrame(gbdt.predict_proba(test_data), index=test_data.index)
    test_label = test_label.reset_index(drop=True)
    comp = pd.DataFrame([test_label, ypre]).T
    comp.index = test_data.index
    comp = pd.merge(comp, prob, on='身份证号码')  # join on the ID-number column
    hcomp = hcomp.append(comp)
    imf = pd.concat([imf, pd.DataFrame(gbdt.feature_importances_).T])
    fpr, tpr, threshold = roc_curve(test_label, ypre)
def classify_by_region(data_frame):
    get_details(data_frame)
    print("Before Oversampling By Region\n", data_frame.groupby('region').size())

    # sns.countplot(data_frame['region'], label="Count")
    # plt.show()
    # sns.heatmap(data_frame.drop('region', axis=1), cmap='cool', annot=True)
    # plt.show()
    # get_feature_correlations(data_frame, plot=True, return_resulst=False)

    X = data_frame.drop(['region', 'class'], axis=1)  # Features - drop class from features - 'age', 'sex',
    y = data_frame['region']  # Labels

    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=True)
    # X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)

    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    # X_resampled.to_csv('resources/data/X_resampled.csv', index=False)
    # y_resampled.to_csv('resources/data/y_resampled.csv', header=['region'], index=False)

    ###########################################################################
    # 4. Scale data                                                           #
    ###########################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()

    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # models = [SVC(kernel='poly'), RandomForestClassifier(), GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i]  # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=[100]*5, random_state=42))

    pipeline = Pipeline(
        [
            # ('selector', SelectKBest(f_classif)),
            ('model', RandomForestClassifier(n_jobs=-1))
        ]
    )

    # Perform grid search on the classifier using f1 score as the scoring method
    grid_obj = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid={
            # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
            'n_estimators': [10, 20, 30],
            'max_depth': [6, 10, 20, 30],
            # 'max_depth': [1, 10, 20, 30],
            'min_samples_split': [1, 10, 100]
            # 'model__n_estimators': np.arange(10, 200, 10)
            # 'C': [1, 10, 100]
        },
        n_jobs=-1,
        scoring="f1_micro",
        cv=5,
        verbose=3
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_resampled, y_resampled)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)

    # Get the final model
    parent_model = best_clf
    # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70, GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")
    print_evaluation_results(y_test, test_predictions, train=False)

    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)
data = pd.read_excel('data.xls')
# Check that every value is numeric
print(data.applymap(np.isreal).all(axis=0))
data = data.values
y = data[:, -1]
x = data[:, 0:-1]
x_feature = ()  # indices of features to select; empty means no selection
if_split_train_test = 1  # whether to split into train/test sets; if not, both are the whole dataset
sampling = 0  # sampling technique: 0 = none, 1 = undersampling, 2 = oversampling

if len(x_feature) != 0:
    x = x[:, x_feature]
if if_split_train_test:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, random_state=42, test_size=0.3)
else:
    x_train = x_test = x
    y_train = y_test = y
if sampling == 1:
    nm = NearMiss(version=1)
    x_train, y_train = nm.fit_sample(x_train, y_train)
elif sampling == 2:
    sm = BorderlineSMOTE(random_state=42, kind="borderline-1")
    x_train, y_train = sm.fit_sample(x_train, y_train)

np.save('x_test.npy', x_test)
np.save('y_test.npy', y_test)
np.save('x_train.npy', x_train)
np.save('y_train.npy', y_train)
_, accuracy = model.evaluate(X_test, y_testv2)
acc1.append(accuracy)

if threshold == 0:  # (len(yapay_sample)/2):
    X_embedded = TSNE(n_components=2).fit_transform(X_train_res)
    for label, _ in counter.items():
        row_ix = where(y_train == int(label))[0]
        pyplot.scatter(X_embedded[row_ix, 0], X_embedded[row_ix, 1],
                       label=str(int(label)))
    pyplot.title("Synthetic Data with Smote - Dataset:" + dataset_name)
    pyplot.legend()
    pyplot.show()

smborder = BorderlineSMOTE(sampling_strategy=class_dist)
X_train_res, y_train_res = smborder.fit_sample(X_train, y_train)
X_train_res, y_train_res = shuffle(X_train_res, y_train_res)
y_train_resv2 = ohe.fit_transform(y_train_res).toarray()
y_testv2 = ohe.fit_transform(y_test).toarray()
y_train_resv2 = pd.DataFrame(y_train_resv2)
y_testv2 = pd.DataFrame(y_testv2)

model = Sequential()
model.add(Dense(20, kernel_initializer='random_normal',
                input_dim=inp[dataset_name],
                activation='relu'))
model.add(Dense(75, kernel_initializer='random_normal',
                activation='relu'))
model.add(Dense(outp[dataset_name], kernel_initializer='random_normal',
                activation='softmax'))
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_res, y_train_resv2, epochs=100, batch_size=36)
_, accuracy = model.evaluate(X_test, y_testv2)
# pageblock data
# data = pd.read_csv('C:\\Users\hasee\Desktop\毕业论文\data\pageblock\pageblock.csv', header=None)
# data[10] = data[10].replace([1, 2, 3, 4, 5], ['0', '0', '1', '1', '1'])
# data = data[data[10].isin(['0', '1'])]
# data[10] = data[10].astype('int')
# featurelist = data.columns.values.tolist()[:-1]
# X = np.array(data.loc[:, featurelist])
# Y = data.iloc[:, -1].map(lambda x: -1 if x == 0 else 1)

sm = SMOTE(random_state=3)
ada = ADASYN(random_state=3)
bsm = BorderlineSMOTE(random_state=3)
X_resampled_smote, y_resampled_smote = sm.fit_sample(X, Y)
X_resampled_adasyn, y_resampled_adasyn = ada.fit_sample(X, Y)
X_resampled_bsmote, y_resampled_bsmote = bsm.fit_sample(X, Y)

inX = list(X)
inY = list(Y)
X_g, Y_g = methods.MWMOTE(inX, inY, 2700)
z = np.array(X_g)
w = pd.Series(Y_g)
YY = list(Y)
YY.extend(Y_g)
fin_y = pd.Series(YY)
fin_X = np.vstack((X, z))

RF_test_auc_list = []
SVM_test_auc_list = []
SMOTE_RF_test_auc_list = []
imp = SimpleImputer(strategy='mean')  # univariate mean imputation
X_train = imp.fit_transform(X_train)  # impute the training set
X_test = imp.transform(X_test)  # impute the test set

prep = StandardScaler()
X_train = prep.fit_transform(X_train)
X_test = prep.transform(X_test)

ops_ada = ADASYN(random_state=10)
ops_bsmote = BorderlineSMOTE(random_state=10)
ops_ksmote = KMeansSMOTE(random_state=10)
ops_rs = RandomOverSampler(random_state=10)
ops_s = SMOTE(random_state=10)

X_train_ada, y_train_ada = ops_ada.fit_sample(X_train, y_train)
X_train_bsmote, y_train_bsmote = ops_bsmote.fit_sample(X_train, y_train)
X_train_ksmote, y_train_ksmote = ops_ksmote.fit_sample(X_train, y_train)
X_train_rs, y_train_rs = ops_rs.fit_sample(X_train, y_train)
X_train_s, y_train_s = ops_s.fit_sample(X_train, y_train)

dic_ = {
    'ADASYN': [X_train_ada, y_train_ada],
    'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
    'RandomOverSampler': [X_train_rs, y_train_rs],
    'SMOTE': [X_train_s, y_train_s]
}

for t in dic_.keys():
    print('over sampler: %s \n' % t)
    X_ = dic_[t][0]
    y_ = dic_[t][1]
lgb_dtrain4 = lgb.Dataset(data=pd.DataFrame(X_train_res3),
                          label=pd.DataFrame(y_train_res3))  # convert the training data to LightGBM format
lgb_param4 = {'max_depth': 10,  # tree depth
              'learning_rate': 0.01,  # step size
              'n_estimators': 50,  # number of trees
              'objective': 'multiclass',  # objective function
              'num_class': len(set(pd.DataFrame(y_train_res3))) + 1}  # extra parameter; labels must be in [0, num_class), so num_class must be 1 larger
lgb_model4 = lgb.train(params=lgb_param4, train_set=lgb_dtrain4)  # train
lgb_model4_predict = np.argmax(lgb_model4.predict(X_test), axis=1)  # predict: take the label with the largest softmax output
model_evaluation(y_test, lgb_model4_predict)  # classification evaluation

## A 30% ratio is adequate; now compare against BLSM.
# BLSM (Borderline SMOTE)
from imblearn.over_sampling import BorderlineSMOTE
sm4 = BorderlineSMOTE(random_state=42, sampling_strategy=0.6)  # apply the BLSM algorithm
X_train_res4, y_train_res4 = sm4.fit_sample(X_train, y_train.ravel())  # apply oversampling

lgb_dtrain5 = lgb.Dataset(data=pd.DataFrame(X_train_res4),
                          label=pd.DataFrame(y_train_res4))  # convert the training data to LightGBM format
lgb_param5 = {'max_depth': 10,  # tree depth
              'learning_rate': 0.01,  # step size
              'n_estimators': 50,  # number of trees
              'objective': 'multiclass',  # objective function
              'num_class': len(set(pd.DataFrame(y_train_res4))) + 1}  # extra parameter; labels must be in [0, num_class), so num_class must be 1 larger
lgb_model5 = lgb.train(params=lgb_param5, train_set=lgb_dtrain5)  # train
lgb_model5_predict = np.argmax(lgb_model5.predict(X_test), axis=1)  # predict: take the label with the largest softmax output
model_evaluation(y_test, lgb_model5_predict)  # classification evaluation

# Plain SMOTE outperforms BLSM here; based on that, apply it to several models
# next - logistic regression, Random Forest, CatBoost.
# Training dataset oversampled with BLSM: X_train_res2, y_train_res2