Example #1
def main(path, began, is_cent_data=0, iterations=30, temperature=5, attractive_force=1, repulsive_force=0.4, speed=0.02, k=0.5):
    # Read the data
    # Read the normalized data as a DataFrame
    my_data = tool.unitilize_data(tool.read_KEEL_data(path, began))
    # Prepare the training and test data
    # Process the data with the FR model and SMOTE to form a new dataset
    # Build a model on the processed data and make predictions
    # Whether to center the data first
    if is_cent_data != 0:
        cent_point = get_cent_point(my_data)
    else:
        cent_point = np.array(my_data)
    fr_data = pd.DataFrame(fr(cent_point, iterations, temperature, attractive_force, repulsive_force, speed, k))
    fr_data_x = fr_data.iloc[:, 0:-1]
    fr_data_y = fr_data.iloc[:, -1]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Randomly split the data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(fr_data_x, fr_data_y, test_size=0.3, random_state=42)
    # Generate a balanced dataset from the sampled training data
    my_B_SMOTE = B_SMOTE()
    fr_data_x_smote, fr_data_y_smote = my_B_SMOTE.fit_sample(X_train, Y_train)

    # Processed data
    print("Processed data:")
    train(fr_data_x_smote, fr_data_y_smote, X_test, Y_test)

    # Unprocessed data
    X_train_org, X_test_org, Y_train_org, Y_test_org = train_test_split(my_data.iloc[:, 0:-1], my_data.iloc[:, -1], test_size=0.3, random_state=42)

    data_x_org, data_y_org = my_B_SMOTE.fit_sample(X_train_org, Y_train_org)
    print("Unprocessed data:")
    train(data_x_org, data_y_org, X_test_org, Y_test_org)
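Note: every snippet on this page uses the legacy `fit_sample` method. imbalanced-learn renamed it `fit_resample` in 0.4 and dropped the old name in later releases, so on a current install the calls need the new spelling. A minimal self-contained sketch (synthetic data, not taken from any project above):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

# A 9:1 imbalanced binary problem.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
print("before:", Counter(y))

# fit_resample is the current API; fit_sample is the pre-0.4 spelling.
X_res, y_res = BorderlineSMOTE(random_state=42).fit_resample(X, y)
print("after:", Counter(y_res))  # classes come out balanced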
Example #2
def oversample_with_smote(x_train, y_train, iterator=10):
    '''
    Oversample the data using SMOTE.
    :param x_train: input data for the model
    :param y_train: targets the model should predict
    :param iterator: number of sampling repetitions
    :return: oversampled X, Y
    '''
    sm = BorderlineSMOTE()
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_fin = []
    y_train_fin = []
    for i in range(iterator):
        temp_x = []
        temp_y = []
        indexes = list(range(len(y_train_sm)))
        random.shuffle(indexes)
        cnt = 0
        max_cnt = len(y_train_sm) // 10
        for j in indexes:
            x = x_train_sm[j]
            y = y_train_sm[j]
            if y == i % 2:
                temp_x.append(x)
                temp_y.append(y)
            elif cnt < max_cnt:
                temp_x.append(x)
                temp_y.append(y)
                cnt += 1
        x_sm_new, y_sm_new = sm.fit_sample(temp_x, temp_y)
        x_train_fin.extend(x_sm_new)
        y_train_fin.extend(y_sm_new)
    return x_train_fin, y_train_fin
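A hypothetical usage sketch (the dataset below is an assumption for illustration, and it presumes an imbalanced-learn version that still provides the legacy `fit_sample` the function calls internally):

from sklearn.datasets import make_classification

# 15% minority class; oversample_with_smote builds `iterator` reshuffled,
# re-balanced passes over the data and concatenates the results.
X, y = make_classification(n_samples=400, weights=[0.85, 0.15], random_state=0)
X_big, y_big = oversample_with_smote(X, y, iterator=4)
print(len(y_big))  # roughly four balanced passes' worth of samples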
Example #3
def test_borderline_smote(kind):
    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
    bsmote_nn = BorderlineSMOTE(kind=kind,
                                random_state=42,
                                k_neighbors=NearestNeighbors(n_neighbors=6),
                                m_neighbors=NearestNeighbors(n_neighbors=11))

    X_res_1, y_res_1 = bsmote.fit_sample(X, Y)
    X_res_2, y_res_2 = bsmote_nn.fit_sample(X, Y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
Example #4
File: sk_proc.py Project: ikem55/NRAsystem
 def del_set_smote_data(self):
     """ 学習データのSMOTE処理を行い学習データを更新する  """
     # 対象数が少ない場合はサンプリングレートを下げる
     positive_count_train = self.y_train.sum()
     negative_count_train = len(self.y_train) - positive_count_train
     print("check y_train value 0:" + str(negative_count_train) + " 1:" +
           str(positive_count_train))
     if positive_count_train >= 6:
         smote = BorderlineSMOTE()
         self.X_train, self.y_train = smote.fit_sample(
             self.X_train, self.y_train)
     else:
         print("----- RandomOverSampler ----- ")
         ros = RandomOverSampler(
             # ratio={1: self.X_train.shape[0], 0: self.X_train.shape[0] // 3}, random_state=71)
             ratio={
                 1: negative_count_train,
                 0: negative_count_train
             },
             random_state=71)
         # Write the result back to the training data
         self.X_train, self.y_train = ros.fit_sample(
             self.X_train, self.y_train)
     print("-- after sampling: " +
           str(np.unique(self.y_train, return_counts=True)))
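Side note: the `ratio` argument above is the pre-0.4 spelling; current imbalanced-learn calls it `sampling_strategy`, and a dict maps each class label to the desired sample count after resampling. A sketch under that assumption (the counts are illustrative):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(n_samples=600, weights=[0.9, 0.1], random_state=71)
# Dict form: class label -> desired sample count after resampling.
ros = RandomOverSampler(sampling_strategy={0: 600, 1: 600}, random_state=71)
X_res, y_res = ros.fit_resample(X, y)
print(Counter(y_res))  # {0: 600, 1: 600}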
Example #5
        def Smote_bd(
                data,
                label):  # at least half of a sample's nearest neighbors belong to other classes (such a sample is called a "danger" sample); the random nearest neighbor b and the minority-class sample a come from different classes
            from imblearn.over_sampling import BorderlineSMOTE

            smote = BorderlineSMOTE(random_state=0)
            data_smote_bd, label_smote_bd = smote.fit_sample(data, label)
            return data_smote_bd, label_smote_bd
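For reference, the two borderline variants differ in how synthetic points are placed around these "danger" samples; a short sketch on synthetic data (the dataset is an assumption for illustration):

from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
# borderline-1: interpolates only between a danger sample and its
# minority-class neighbors.
X1, y1 = BorderlineSMOTE(kind='borderline-1', random_state=0).fit_resample(X, y)
# borderline-2: may also interpolate toward majority-class neighbors,
# taking shorter steps so new points stay near the minority sample.
X2, y2 = BorderlineSMOTE(kind='borderline-2', random_state=0).fit_resample(X, y)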
Example #6
File: dam3.py Project: amir-abolfazli/DAM3
    def oversample_remainingSet(self, instances, labels, kind='borderline-1'):
        """oversamples remaining set (using BorderlineSMOTE) after a drift is detected."""
        if len(np.unique(labels)) >= 2:
            minority_class = collections.Counter(labels.tolist()).most_common()[-1][0]

            if np.sum(labels == minority_class) > self.n_neighbors:
                oversample = BorderlineSMOTE(k_neighbors=self.n_neighbors, m_neighbors=5, kind=kind, random_state=self.random_state)
                instances, labels = oversample.fit_sample(instances, labels)

        return instances, labels
Example #7
 def classification(self,X,Y):
     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
     #text_clf = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB())])
     vectorizer = TfidfVectorizer()
     # vectorizer2 = TfidfVectorizer()
     X_train_tfidf = vectorizer.fit_transform(X_train)
     X_test_tfidf = vectorizer.transform(X_test)
     sm = BorderlineSMOTE()
     X_res, Y_res = sm.fit_sample(X_train_tfidf, y_train)
     clf = MultinomialNB()
     clf.fit(X_res, Y_res)
     prediction = clf.predict(X_test_tfidf)
     print(prediction)
     final_time = datetime.datetime.now() - start_time
     print(final_time)
     print(metrics.classification_report(y_test,prediction))
     print(metrics.roc_auc_score(y_test, prediction))
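The Pipeline commented out at the top of this method can be revived with imblearn's own Pipeline, which supports samplers and applies them during fit only, never at predict time. A sketch assuming the same vectorizer and classifier choices:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', BorderlineSMOTE()),  # runs only while fitting
    ('clf', MultinomialNB()),
])
# text_clf.fit(X_train, y_train)
# prediction = text_clf.predict(X_test)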
Example #8
def _SMOTE_Border(self):
    # Oversampling - SMOTE - Synthetic Minority Over-sampling Technique

    print("before SMOTE df", self.x_train.shape)
    smote = BorderlineSMOTE(
        k_neighbors=5, m_neighbors=5, random_state=self.seed
    )  # sampling_strategy=0.8
    self.X_train_smote, self.y_train_smote = smote.fit_sample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )

    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))

    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
Example #9
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels


    # get_feature_correlations(data_frame, plot=True, return_resulst=False)
    # mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    # print("mutual_info: ", mutual_info)

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42, shuffle=True)


    ##########   Handle Class Imbalance  #########
    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n", (pd.DataFrame(y_resampled)).groupby('region').size())

    ###############################################################################
    #                               4. Scale data                                 #
    ###############################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()
    #
    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)

    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5, random_state=42))

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    # pipeline = Pipeline(
    #         [
    #             # ('selector', SelectKBest(f_classif)),
    #             ('model',  RandomForestClassifier(n_jobs = -1) )
    #         ]
    # )
    #
    # # Perform grid search on the classifier using f1 score as the scoring method
    # grid_obj = GridSearchCV(
    #         estimator= GradientBoostingClassifier(),
    #         param_grid={
    #             # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
    #             'n_estimators': [10, 20, 30],
    #             'max_depth': [6, 10, 20, 30],
    #             # 'max_depth': [1, 10, 20, 30],
    #             'min_samples_split': [1, 10, 100]
    #             # 'model__n_estimators': np.arange(10, 200, 10)
    #             # 'C': [1, 10, 100]
    #         },
    #
    #         n_jobs=-1,
    #         scoring="f1_micro",
    #         cv=5,
    #         verbose=3
    # )
    #
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit =  grid_obj.fit(X_resampled, y_resampled)

    # # Get the best estimator
    # best_clf = grid_fit.best_estimator_
    # print(best_clf)


    # Get the final model
    parent_model = SVC(kernel='rbf', C=10)  # KNN(n_neighbors=7) - 0.52; LR(multiclass-ovr) - 0.66, 0.67, 0.67, 0.69, 0.69, 0.68; MLP with fs - 0.65, 0.69, 0.70; GB - 0.67, without fs 0.62, 0.61; DT - 0.58; RF - 0.67; multi_LR with fs - 0.64; voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # Plot normalized confusion matrix
    # fig = plt.figure()
    # fig.set_size_inches(8, 8, forward=True)
    # # fig.align_labels()
    # plot_confusion_matrix(cnf_matrix, classes=["1", "2", "3", "4"], normalize=False, title='Normalized confusion matrix')


    # probs = parent_model.predict_proba(X_test)
    # print("Prediction probabilities for Region\n", probs)
    # plotConfusionMatrix(X_test, y_test, ['1', '2', '3', '4'])

    joblib.dump(parent_model, filename='../resources/models/parent_classifier.pkl')
Example #10
    def runSMOTEvariationsGen(self, folder):
        """
        Create files with SMOTE preprocessing and without preprocessing.
        :param datasets: datasets.
        :param folder:   cross-validation folders.
        :return:
        """
        smote = SMOTE()
        borderline1 = BorderlineSMOTE(kind='borderline-1')
        borderline2 = BorderlineSMOTE(kind='borderline-2')
        smoteSVM = SVMSMOTE()
        geometric_smote = GeometricSMOTE(n_jobs=-1)

        for dataset in datasets:  # biclass and multiclass
            for fold in range(5):
                path = os.path.join(folder, dataset, str(fold),
                                    ''.join([dataset, "_train.csv"]))
                train = np.genfromtxt(path, delimiter=',')
                X = train[:, 0:train.shape[1] - 1]
                Y = train[:, train.shape[1] - 1]

                # SMOTE
                print("SMOTE..." + dataset)
                X_res, y_res = smote.fit_sample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                             ''.join([dataset, "_SMOTE.csv"])),
                                header=False,
                                index=False)
                # SMOTE BORDERLINE1
                print("Borderline1..." + dataset)
                X_res, y_res = borderline1.fit_sample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Borderline1.csv"])),
                                header=False,
                                index=False)
                # SMOTE BORDERLINE2
                print("Borderline2..." + dataset)
                X_res, y_res = borderline2.fit_sample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Borderline2.csv"])),
                                header=False,
                                index=False)
                # SMOTE SVM
                print("SMOTE SVM..." + dataset)
                X_res, y_res = smoteSVM.fit_sample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_smoteSVM.csv"])),
                                header=False,
                                index=False)

                # GEOMETRIC SMOTE
                print("GEOMETRIC SMOTE..." + dataset)
                X_res, y_res = geometric_smote.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Geometric_SMOTE.csv"])),
                                header=False,
                                index=False)
Example #11
            model.add(
                Dense(75,
                      kernel_initializer='random_normal',
                      activation='relu'))
            model.add(
                Dense(2,
                      kernel_initializer='random_normal',
                      activation='softmax'))
            # compile the keras model
            model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            model.fit(X_train_res, y_train_resv2, epochs=100, batch_size=36)

            _, accuracy = model.evaluate(X_test, y_testv2)
            acc1.append(accuracy)

            smborder = BorderlineSMOTE(sampling_strategy=class_dist)
            X_train_res, y_train_res = smborder.fit_sample(X_train, y_train)
            X_train_res, y_train_res = shuffle(X_train_res, y_train_res)

            y_train_resv2 = ohe.fit_transform(y_train_res).toarray()
            y_testv2 = ohe.fit_transform(y_test).toarray()
            y_train_resv2 = pd.DataFrame(y_train_resv2)
            y_testv2 = pd.DataFrame(y_testv2)
            model = Sequential()
            model.add(
                Dense(20,
                      kernel_initializer='random_normal',
                      input_dim=list(X_train.shape)[1],
                      activation='relu'))
            model.add(
                Dense(75,
                      kernel_initializer='random_normal',
                      activation='relu'))
Example #12
print("Before OverSampling, the shape of X_train: {}".format(X_train.shape)) # SMOTE 적용 이전 데이터 형태
print("Before OverSampling, the shape of y_train: {}".format(y_train.shape)) # SMOTE 적용 이전 데이터 형태
print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape)) # SMOTE 적용 결과 확인
print('After OverSampling, the shape of y_train: {} \n'.format(y_train_res.shape)) # # SMOTE 적용 결과 확인

lgbm_clf2 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier
lgbm_clf2.fit(X_train_res, y_train_res) # 학습 진행
y_pred2 = lgbm_clf2.predict(X_test) # 평가 데이터셋 예측
print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred2)) # 혼돈행렬
print('\n')
print("Model Evaluation Result: \n", classification_report(y_test, y_pred2)) # 전체적인 성능 평가

# BLSM (Borderline SMOTE)
from imblearn.over_sampling import BorderlineSMOTE
sm2 = BorderlineSMOTE(random_state = 42) # BLSM 알고리즘 적용
X_train_res2, y_train_res2 = sm2.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용
lgbm_clf3 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier
lgbm_clf3.fit(X_train_res2, y_train_res2) # 학습 진행
y_pred3 = lgbm_clf3.predict(X_test) # 평가 데이터셋 예측
print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred3)) # 혼돈행렬
print('\n')
print("Model Evaluation Result: \n", classification_report(y_test, y_pred3)) # 전체적인 성능 평가

# SVMSMOTE
from imblearn.over_sampling import SVMSMOTE
sm3 = SVMSMOTE(random_state = 42) # SVMSMOTE 알고리즘 적용
X_train_res3, y_train_res3 = sm3.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용
lgbm_clf4 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier
lgbm_clf4.fit(X_train_res3, y_train_res3) # 학습 진행
y_pred4 = lgbm_clf4.predict(X_test) # 평가 데이터셋 예측
print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred4)) # 혼돈행렬
Example #13
print('Before: Class{}'.format(Counter(kelas)))
df = dataset.copy()
del df[21]

x_train, x_test, y_train, y_test = train_test_split(df,
                                                    kelas,
                                                    test_size=.1,
                                                    random_state=10)

#balancing data
#sm = SMOTETomek()
#sm = SMOTE(random_state=42)
from imblearn.over_sampling import BorderlineSMOTE

sm = BorderlineSMOTE(random_state=42)
df_resm, kelas_res = sm.fit_sample(df, kelas)
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
df_resm2, kelas_res2 = tl.fit_sample(df_resm, kelas_res)
print('After: Class{}'.format(Counter(kelas_res)))
#df_res_vis = pca.transform(df_resm)

besar = dataset.groupby(kelas).size()
besar = list(besar)
koor_x = ['false', 'true']
koor_y = besar
kelas_res = list(kelas_res)
valp = kelas_res.count(False)
valn = kelas_res.count(True)
new_y = []
Example #14
    'GB': [[0, 0], [0, 0]]
}

svm_sm_scores = {'LR': 0, 'AB': 0, 'GB': 0}

svm_sm_con_mat = {
    'LR': [[0, 0], [0, 0]],
    'AB': [[0, 0], [0, 0]],
    'GB': [[0, 0], [0, 0]]
}

for train_index, test_index in skf.split(features, target):

    # Borderline Smote
    bl_smote = BorderlineSMOTE(random_state=0, kind='borderline-1')
    X_train, y_train = bl_smote.fit_sample(features[train_index],
                                           target[train_index])

    # Logistic Regression
    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)
    res = logistic.predict(features[test_index])
    bl_smote_scores['LR'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['LR'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # Ada Boost Classifier
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    bl_smote_scores['AB'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
Example #15
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET],
                        axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels

    # ['age', 'degree-of-diffe', 'sex_2', 'histologic-type_2', 'bone_2',
    #  'neck_2', 'mediastinum_2', 'abdominal_2']

    # data_frame.drop(['lung_2', 'pleura_2', 'peritoneum_2', 'liver_2', 'brain_2', 'skin_2', 'supraclavicular_2',
    #                  'axillar_2', 'bone-marrow_2'], axis=1, inplace=True)
    # get_feature_correlations(data_frame, plot=True, return_resulst=False)
    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)

    # 0.3 test size = 0.56 f1
    # 0.2 test size = 0.61 f1
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        shuffle=True)

    # reject_sampler = FunctionSampler(func=outlier_rejection)
    # X_train, y_train = reject_sampler.fit_resample(X_train, y_train)

    # Baseline

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    ##########   Handle Class Imbalance  #########
    sm = BorderlineSMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    sf = SelectKBest(f_classif, k='all')
    sf_fit = sf.fit(X_resampled, y_resampled)
    # print feature scores
    for i in range(len(sf_fit.scores_)):
        print(' %s: %f' % (X_resampled.columns[i], sf_fit.scores_[i]))

    # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # plt.show()

    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    model = RandomForestClassifier(random_state=RANDOM_STATE)

    ########################################### Hyper-parameter Tuning ##########################################
    best_clf_rf = tune_random_forest(model, X_resampled, y_resampled)

    # g = GridSearchCV(
    #     estimator=GradientBoostingClassifier(),
    #     param_grid={
    #         "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
    #          "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
    #          "min_child_weight" : [ 1, 3, 5, 7 ],
    #          "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    #          "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    #     },
    #     n_jobs=-1,
    #     scoring="f1_micro",
    #     cv=5,
    #     verbose=1
    # )
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit = grid_obj.fit(X_resampled, y_resampled)
    #
    # # Get the best estimator
    # best_clf_gb= grid_fit.best_estimator_
    # print(best_clf_gb)

    ########################################### Final Model ###########################################
    parent_model = best_clf_rf  # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
    sel = SelectFromModel(best_clf_rf)
    sel.fit(X_resampled, y_resampled)
    print(sel.get_support())
    selected_feat = X_resampled.columns[(sel.get_support())]
    print(len(selected_feat))
    print(selected_feat)
Example #16
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from imblearn.over_sampling import BorderlineSMOTE
import matplotlib as mpl
import matplotlib
#from collections import Counter
path1 = r'/Users/ada/Desktop/xgboost/no.2/newlasso0720.csv'
data1 = np.loadtxt(path1, delimiter=',')
label_1 = np.ones((int(178), 1))  #Value can be changed
label_2 = np.zeros((int(226), 1))
label = np.append(label_1, label_2)
smo = BorderlineSMOTE(kind='borderline-1', sampling_strategy={
    0: 246,
    1: 246
})  #kind='borderline-2'
X_smo, y_smo = smo.fit_sample(data1, label)
X = X_smo
y = y_smo

sepscores = []
cv_clf = XGBClassifier(
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0.4,  #
    learning_rate=0.1,  #
    max_delta_step=0,
    max_depth=4,  #
    min_child_weight=1,
    missing=None,
    n_estimators=100,  #
Example #17
    def resample_train_data(x_train, y_train, over=True):
        """
        Currently testing methods of re-sampling an imbalanced dataset.
        :param x_train: Training explanatory features to be re-sampled
        :param y_train: Training explained features to be re-sampled
        :param over: kwarg to oversample data
        :return: x_train_res, y_train_res (re-sampled training dataset)
        """
        if over:
            rs = BorderlineSMOTE(
                sampling_strategy="auto",
                random_state=69,
                k_neighbors=5,
                n_jobs=8,
                m_neighbors=10,
                kind="borderline-1",
            )
        else:
            rs = NeighbourhoodCleaningRule(
                sampling_strategy="auto",
                return_indices=False,
                random_state=69,
                n_neighbors=3,
                kind_sel="all",
                threshold_cleaning=0.1,
                n_jobs=8,
                ratio=None,
            )
            # rs = NearMiss(
            #     sampling_strategy="auto",
            #     return_indices=False,
            #     random_state=69,
            #     version=1,
            #     n_neighbors=3,
            #     n_neighbors_ver3=3,
            #     n_jobs=8,
            #     ratio=None,
            # )

        print("Before reSampling, the shape of train_X: {}".format(
            x_train.shape))
        print("Before reSampling, the shape of train_y: {} \n".format(
            y_train.shape))

        print("Before reSampling, counts of label '1': {}".format(
            sum(y_train == 1)))
        print("Before reSampling, counts of label '0': {}".format(
            sum(y_train == 0)))

        x_train_res, y_train_res = rs.fit_sample(x_train, y_train)

        print("After reSampling, the shape of train_X: {}".format(
            x_train_res.shape))
        print("After reSampling, the shape of train_y: {} \n".format(
            y_train_res.shape))

        print("After reSampling, counts of label '1': {}".format(
            sum(y_train_res == 1)))
        print("After reSampling, counts of label '0': {}".format(
            sum(y_train_res == 0)))

        return x_train_res, y_train_res
Example #18
                 ])

target = 'class'
data = 'fault', 'road', 'river', 'lithology', 'elevation', 'slope', 'NDVI', 'profile', 'plan', 'aspect', 'geological', 'rain', 'SPI', 'TWI', 'TRI', 'STI', 'LUCC'

x_columns = [x for x in df.columns if x not in [target]]
x = df[x_columns]
y = df['class']
groupby_data_orgianl = df.groupby(
    'class').count()  # Classified summary of "class"
print(groupby_data_orgianl)  # print the class distribution of the original sample set

# Use BorderlineSMOTE to oversample
model_bsmote = BorderlineSMOTE()  #  build BorderlineSMOTE object
x_bsmote_resampled, y_bsmote_resampled = model_bsmote.fit_sample(
    x, y)  # input data to oversample
x_bsmote_resampled = pd.DataFrame(x_bsmote_resampled,
                                  columns=[
                                      'fault', 'road', 'river', 'lithology',
                                      'elevation', 'slope', 'NDVI', 'profile',
                                      'plan', 'aspect', 'geological', 'rain',
                                      'SPI', 'TWI', 'TRI', 'STI', 'LUCC'
                                  ])
y_bsmote_resampled = pd.DataFrame(y_bsmote_resampled, columns=['class'])
bsmote_resampled = pd.concat([x_bsmote_resampled, y_bsmote_resampled], axis=1)
groupby_data_bsmote = bsmote_resampled.groupby(
    'class').count()  #Classified summary of "class"
print(groupby_data_bsmote)  # print the class distribution of the dataset produced by BorderlineSMOTE
exp = DataFrame(bsmote_resampled)
Example #19
def borderline_smote(x, y):
    print("----Borderline SMOTE----")
    sampler = BorderlineSMOTE(random_state=42)
    X, y = sampler.fit_sample(x, y)
    return X, y
Example #20
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

from prepare import readbunchobj
data = readbunchobj('dataset_woe.data')
X_train = pd.DataFrame(data.X_train)
X_test = data.X_test
y_train = data.y_train
y_test = data.y_test

# osp = RandomUnderSampler(random_state=10)

osp = BorderlineSMOTE()
X_train, y_train = osp.fit_sample(X_train, y_train)  # SMOTE

# fsel = SelectFromModel(GradientBoostingClassifier())
# X_train = fsel.fit_transform(X_train, y_train)
# X_test = fsel.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'.format(
    c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1: %.4f" % metrics.f1_score(y_test, y_pred))
Example #21
def test_borderline_smote_wrong_kind():
    bsmote = BorderlineSMOTE(kind='rand')
    with pytest.raises(ValueError, match='The possible "kind" of algorithm'):
        bsmote.fit_sample(X, Y)
Example #22
NNperformance(init_mode,act,opt,n_top_features,epochs,batch_size,labels,X_train_sfs_scaled, y_train,X_test_sfs_scaled, y_test)

from imblearn.over_sampling import SMOTE,RandomOverSampler,BorderlineSMOTE
from imblearn.under_sampling import NearMiss,RandomUnderSampler
smt = SMOTE()
nr = NearMiss()
bsmt=BorderlineSMOTE(random_state=42)
ros=RandomOverSampler(random_state=42)
rus=RandomUnderSampler(random_state=42)
X_train_bal, y_train_bal = bsmt.fit_sample(X_train_sfs_scaled, y_train)
print(np.bincount(y_train_bal))
  
NNperformance(init_mode,act,opt,n_top_features,epochs,batch_size,labels,X_train_bal, y_train_bal,X_test_sfs_scaled, y_test)


#Plot decision region
def plot_classification(model,X_t,y_t):
    clf=model
    pca = PCA(n_components = 2)
    X_t2 = pca.fit_transform(X_t)
    clf.fit(X_t2,np.array(y_t))
    plot_dr(X_t2, np.array(y_t), clf=clf, legend=2)

model_bal=NNmodel(init_mode,act,opt,n_top_features=2)
plot_classification(model_bal,X_test_sfs_scaled, y_test)
Example #23
var = selector.variances_
plt.bar(select_features, var)
#plt.show()
'''
# Correlation coefficient method
'''
kbest = SelectKBest(chi2, k = 10)
kbest.fit_transform(abs(train_x), train_y)
a = kbest.scores_
plt.bar(select_features, a)
plt.show()
'''
""" ========================  Oversampling ============================= """

over_samples = BorderlineSMOTE(random_state=2020)
over_samples_x, over_samples_y = over_samples.fit_sample(train_x, train_y)
over_samples_x = pd.DataFrame(over_samples_x)
over_samples_x.columns = select_features
#print(pd.Series(over_samples_y).value_counts()/len(over_samples_y))
""" ========================  Decomposition  ============================= """
'''
pca = PCA(n_components = 'mle')
#pca = SparsePCA(n_components = 15)
pca.fit(over_samples_x)
train_x_new = pca.transform(over_samples_x)
test_x_new = pca.transform(test_x)
'''
'''
pca = PCA(n_components = 'mle')
#pca = SparsePCA(n_components = 15)
pca.fit(train_x)
Example #24
# print('Mean KS metric: ' + str(np.mean(kss)))

# print(gv.best_estimator_,gv.best_score_,gv.best_params_)
# y_pred = gv.predict(X)
# y_predprob = gbm2.predict_proba(X)[:,1]
# print("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

imf = pd.DataFrame()

for train_index, test_index in sfolder.split(xdata, ydata):
    train_data = xdata.iloc[train_index, :]
    train_label = ydata[train_index]
    test_data = xdata.iloc[test_index, :]
    test_label = ydata[test_index]
    x_smo, y_smo = smo.fit_sample(train_data, train_label)
    # gbdt.fit(pca.fit_transform(x_smo),y_smo)
    gbdt.fit(x_smo, y_smo)
    # gbdt.fit(train_data,train_label)
    # score.append(gbdt.score(pca.transform(test_data),test_label))
    score.append(gbdt.score(test_data, test_label))
    ypre = pd.Series(gbdt.predict(test_data), name='ypre')
    # ypre = pd.Series(gbdt.predict(pca.transform(test_data)), name= 'ypre')
    prob = pd.DataFrame(gbdt.predict_proba(test_data), index=test_data.index)
    test_label = test_label.reset_index(drop=True)
    comp = pd.DataFrame([test_label, ypre]).T
    comp.index = test_data.index
    comp = pd.merge(comp, prob, on='身份证号码')
    hcomp = hcomp.append(comp)
    imf = pd.concat([imf, pd.DataFrame(gbdt.feature_importances_).T])
    fpr, tpr, threshold = roc_curve(test_label, ypre)
Example #25
def classify_by_region(data_frame):
    get_details(data_frame)
    print("Before Oversampling By Region\n", data_frame.groupby('region').size())
    # sns.countplot(data_frame['region'], label="Count")
    # plt.show()

    # sns.heatmap(data_frame.drop('region', axis=1), cmap='cool', annot=True)
    # plt.show()

    # get_feature_correlations(data_frame, plot=True, return_resulst=False)


    X = data_frame.drop(['region', 'class'], axis=1)  # Features - drop class from features - 'age', 'sex',
    y = data_frame['region']  # Labels


    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    # X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)


    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    print("After Oversampling By Region\n", (pd.DataFrame(y_resampled)).groupby('region').size())
    # X_resampled.to_csv('resources/data/X_resampled.csv', index=False)
    # y_resampled.to_csv('resources/data/y_resampled.csv', header=['region'], index=False)

    ###############################################################################
    #                               4. Scale data                                 #
    ###############################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()
    #
    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5, random_state=42))

    pipeline = Pipeline(
            [
                # ('selector', SelectKBest(f_classif)),
                ('model',  RandomForestClassifier(n_jobs = -1) )
            ]
    )

    # Perform grid search on the classifier using f1 score as the scoring method
    grid_obj = GridSearchCV(
            estimator= GradientBoostingClassifier(),
            param_grid={
                # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                'n_estimators': [10, 20, 30],
                'max_depth': [6, 10, 20, 30],
                # 'max_depth': [1, 10, 20, 30],
                'min_samples_split': [2, 10, 100]  # integer values must be >= 2
                # 'model__n_estimators': np.arange(10, 200, 10)
                # 'C': [1, 10, 100]
            },

            n_jobs=-1,
            scoring="f1_micro",
            cv=5,
            verbose=3
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit =  grid_obj.fit(X_resampled, y_resampled)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)

    # Get the final model
    parent_model = best_clf # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)
Example #26
File: data.py Project: love-xj/project
data = pd.read_excel('data.xls')
# check that every value is numeric
print(data.applymap(np.isreal).all(axis=0))
data = data.values
y = data[:, -1]
x = data[:, 0:-1]
x_feature = ()  # indices of the selected features; empty means no feature selection
if_split_train_test = 1  # whether to split into train and test sets; if not, both are the whole dataset
sampling = 0  # sampling technique: 0 = none, 1 = undersampling, 2 = oversampling
if len(x_feature) != 0:
    x = x[:, x_feature]
if if_split_train_test:
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        stratify=y,
                                                        random_state=42,
                                                        test_size=0.3)
else:
    x_train = x_test = x
    y_train = y_test = y
if sampling == 1:
    nm = NearMiss(version=1)
    x_train, y_train = nm.fit_sample(x_train, y_train)
elif sampling == 2:
    sm = BorderlineSMOTE(random_state=42, kind="borderline-1")
    x_train, y_train = sm.fit_sample(x_train, y_train)
np.save('x_test.npy', x_test)
np.save('y_test.npy', y_test)
np.save('x_train.npy', x_train)
np.save('y_train.npy', y_train)
Example #27
     _,accuracy = model.evaluate(X_test, y_testv2)
     acc1.append(accuracy)   
 
     
     if threshold==0:#(len(yapay_sample)/2):
         X_embedded = TSNE(n_components=2).fit_transform(X_train_res)
         for label, _ in counter.items():
             row_ix = where(y_train == int(label))[0]
             pyplot.scatter(X_embedded[row_ix, 0], X_embedded[row_ix, 1], label=str(int(label)))
         pyplot.title("Sythentitic Data with Smote - Dataset:"+dataset_name)
         pyplot.legend()
         pyplot.show()
         
     
     smborder=BorderlineSMOTE(sampling_strategy=class_dist)
     X_train_res, y_train_res = smborder.fit_sample(X_train, y_train)
     X_train_res, y_train_res=shuffle(X_train_res, y_train_res)
     
     y_train_resv2 = ohe.fit_transform(y_train_res).toarray()
     y_testv2 = ohe.fit_transform(y_test).toarray()
     y_train_resv2 = pd.DataFrame(y_train_resv2)
     y_testv2 = pd.DataFrame(y_testv2)
     model = Sequential()
     model.add(Dense(20,kernel_initializer='random_normal', input_dim=inp[''+dataset_name+''], activation='relu'))
     model.add(Dense(75,kernel_initializer='random_normal', activation='relu'))
     model.add(Dense(outp[''+dataset_name+''],kernel_initializer='random_normal', activation='softmax'))
     # compile the keras model
     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     model.fit(X_train_res, y_train_resv2, epochs=100, batch_size=36)
 
     _,accuracy = model.evaluate(X_test, y_testv2)
Example #28
    # pageblock data
    # data = pd.read_csv('C:\\Users\hasee\Desktop\毕业论文\data\pageblock\pageblock.csv', header=None)
    # data[10] = data[10].replace([1, 2, 3, 4, 5], ['0', '0', '1', '1', '1'])
    # data = data[data[10].isin(['0', '1'])]
    # data[10] = data[10].astype('int')
    # featurelist = data.columns.values.tolist()[:-1]
    # X = np.array(data.loc[:, featurelist])
    # Y = data.iloc[:, -1].map(lambda x: -1 if x == 0 else 1)

    sm = SMOTE(random_state=3)
    ada = ADASYN(random_state=3)
    bsm = BorderlineSMOTE(random_state=3)
    X_resampled_smote, y_resampled_smote = sm.fit_sample(X, Y)
    X_resampled_adasyn, y_resampled_adasyn = ada.fit_sample(X, Y)
    X_resampled_bsmote, y_resampled_bsmote = bsm.fit_sample(X, Y)

    inX = list(X)
    inY = list(Y)
    X_g, Y_g = methods.MWMOTE(inX, inY, 2700)
    z = np.array(X_g)
    w = pd.Series(Y_g)

    YY = list(Y)
    YY.extend(Y_g)
    fin_y = pd.Series(YY)
    fin_X = np.vstack((X, z))

RF_test_auc_list = []
SVM_test_auc_list = []
SMOTE_RF_test_auc_list = []
Example #29
    imp = SimpleImputer(strategy='mean')  # mean univariate imputation
    X_train = imp.fit_transform(X_train)  # impute the training set
    X_test = imp.transform(X_test)  # impute the test set

    prep = StandardScaler()
    X_train = prep.fit_transform(X_train)
    X_test = prep.transform(X_test)

    ops_ada = ADASYN(random_state=10)
    ops_bsmote = BorderlineSMOTE(random_state=10)
    ops_ksmote = KMeansSMOTE(random_state=10)
    ops_rs = RandomOverSampler(random_state=10)
    ops_s = SMOTE(random_state=10)

    X_train_ada, y_train_ada = ops_ada.fit_sample(X_train, y_train)
    X_train_bsmote, y_train_bsmote = ops_bsmote.fit_sample(X_train, y_train)
    X_train_ksmote, y_train_ksmote = ops_ksmote.fit_sample(X_train, y_train)
    X_train_rs, y_train_rs = ops_rs.fit_sample(X_train, y_train)
    X_train_s, y_train_s = ops_s.fit_sample(X_train, y_train)

    dic_ = {
        'ADASYN': [X_train_ada, y_train_ada],
        'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
        'RandomOverSampler': [X_train_rs, y_train_rs],
        'SMOTE': [X_train_s, y_train_s]
    }

    for t in dic_.keys():
        print('over sampler: %s \n' % t)
        X_ = dic_[t][0]
        y_ = dic_[t][1]
Example #30
lgb_dtrain4 = lgb.Dataset(data = pd.DataFrame(X_train_res3), label = pd.DataFrame(y_train_res3)) # convert the training data for the LightGBM model
lgb_param4 = {'max_depth': 10, # tree depth
            'learning_rate': 0.01, # step size
            'n_estimators': 50, # number of trees
            'objective': 'multiclass', # objective function
            'num_class': len(set(pd.DataFrame(y_train_res3))) + 1} # extra parameter; labels must be in [0, num_class), so num_class must be one larger than the max label
lgb_model4 = lgb.train(params = lgb_param4, train_set = lgb_dtrain4) # train the model
lgb_model4_predict = np.argmax(lgb_model4.predict(X_test), axis = 1) # predict on the evaluation data: take the label with the largest softmax output
model_evaluation(y_test, lgb_model4_predict) # model classification evaluation

## A ratio of 30% works well. Now compare it with BLSM!

# BLSM (Borderline SMOTE)
from imblearn.over_sampling import BorderlineSMOTE
sm4 = BorderlineSMOTE(random_state = 42, sampling_strategy = 0.6) # apply the BLSM algorithm
X_train_res4, y_train_res4 = sm4.fit_sample(X_train, y_train.ravel()) # apply oversampling

lgb_dtrain5 = lgb.Dataset(data = pd.DataFrame(X_train_res4), label = pd.DataFrame(y_train_res4)) # convert the training data for the LightGBM model
lgb_param5 = {'max_depth': 10, # tree depth
            'learning_rate': 0.01, # step size
            'n_estimators': 50, # number of trees
            'objective': 'multiclass', # objective function
            'num_class': len(set(pd.DataFrame(y_train_res4))) + 1} # extra parameter; labels must be in [0, num_class), so num_class must be one larger than the max label
lgb_model5 = lgb.train(params = lgb_param5, train_set = lgb_dtrain5) # train the model
lgb_model5_predict = np.argmax(lgb_model5.predict(X_test), axis = 1) # predict on the evaluation data: take the label with the largest softmax output
model_evaluation(y_test, lgb_model5_predict) # model classification evaluation

# Plain SMOTE outperforms BLSM here; apply it to various models:
# - linear (logistic) regression, Random Forest, CatBoost

# Training set oversampled with BLSM: X_train_res2, y_train_res2
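For reference, the float `sampling_strategy` passed to sm4 above sets the desired minority-to-majority ratio after resampling (binary problems only). A quick sketch on synthetic data:

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
X_res, y_res = BorderlineSMOTE(sampling_strategy=0.6,
                               random_state=42).fit_resample(X, y)
print(Counter(y_res))  # minority count is about 0.6 * majority count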