Example #1
from imblearn.over_sampling import SVMSMOTE
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC


def test_svm_smote():
    # Passing explicit estimators with the default hyperparameters (k=5 and
    # m=10 map to n_neighbors=6 and 11 internally) should resample identically.
    svm_smote = SVMSMOTE(random_state=42)
    svm_smote_nn = SVMSMOTE(random_state=42,
                            k_neighbors=NearestNeighbors(n_neighbors=6),
                            m_neighbors=NearestNeighbors(n_neighbors=11),
                            svm_estimator=SVC(random_state=42))

    X_res_1, y_res_1 = svm_smote.fit_resample(X, Y)
    X_res_2, y_res_2 = svm_smote_nn.fit_resample(X, Y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
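The test assumes module-level fixtures X and Y defined elsewhere in the test module; a minimal, illustrative stand-in (the make_classification parameters are assumptions, not the original fixture):

from sklearn.datasets import make_classification

# Small imbalanced two-class dataset standing in for the real fixtures.
X, Y = make_classification(n_samples=200, n_features=4, n_informative=3,
                           n_redundant=1, weights=[0.85, 0.15],
                           random_state=42)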
Example #2
    def create_model_from_training_data(self):
        training_comments = []
        training_ratings = []
        print("Training classifier model..")
        for sentidata in self.training_data:
            comments = preprocess_text(sentidata.text)
            training_comments.append(comments)
            training_ratings.append(sentidata.rating)

        # discard stopwords, apply stemming, and discard words present in fewer than 3 comments
        self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem,
                                          sublinear_tf=True,
                                          max_df=0.5,
                                          stop_words=mystop_words,
                                          min_df=3)
        X_train = self.vectorizer.fit_transform(training_comments).toarray()
        Y_train = np.array(training_ratings)

        # Apply SVM-SMOTE to improve the ratio of the minority class
        smote_model = SVMSMOTE(sampling_strategy=0.5,
                               random_state=None,
                               k_neighbors=15,
                               m_neighbors=15,
                               out_step=0.0001,
                               svm_estimator=None,
                               n_jobs=1)

        X_resampled, Y_resampled = smote_model.fit_resample(X_train, Y_train)

        model = self.get_classifier()
        model.fit(X_resampled, Y_resampled)

        return model
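The method relies on helpers defined elsewhere in the project (preprocess_text, tokenize_and_stem, mystop_words, and self.get_classifier); a minimal sketch of compatible stand-ins for the first three, purely illustrative:

import re

mystop_words = ["the", "a", "an", "and", "or", "is", "it"]  # assumed list

def preprocess_text(text):
    # Lowercase and strip non-alphanumeric characters.
    return re.sub(r"[^a-z0-9\s]", " ", text.lower())

def tokenize_and_stem(text):
    # Whitespace tokenization with a crude suffix-stripping "stemmer".
    return [w.rstrip("s") for w in text.split()]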
Example #3
def getData(splitData=True, useImbalancer=False, useStratify=False):
    global standard_scaler
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    y = data.values[:, 0].reshape(-1, 1)
    if useStratify:
        stratify = y
    else:
        stratify = None
    if splitData:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=101,
                                                            shuffle=True,
                                                            stratify=stratify)
    else:
        X_train = X
        y_train = y
    if useImbalancer and splitData:
        tl = TomekLinks(sampling_strategy='majority')
        X_train, y_train = tl.fit_resample(X=X_train, y=y_train)
        # unique, counts = np.unique(y_train, return_counts=True)
        # print("y_train\n", np.asarray((unique, counts)).T)
    if splitData:
        unique, counts = np.unique(y_test, return_counts=True)
    # print("y_test\n", np.asarray((unique, counts)).T)
    if splitData:
        return X_train, X_test, y_train.ravel(), y_test.ravel()
    else:
        return X_train, y_train.ravel()
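A quick usage sketch (assumes DataSource/binary.csv is present; flag values are illustrative):

X_train, X_test, y_train, y_test = getData(splitData=True,
                                           useImbalancer=True,
                                           useStratify=True)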
Example #4
def roc_curves(df, number_of_matches):
    number_of_matches = int(number_of_matches)
    df_played_matches = df.iloc[0:number_of_matches-1]
    classifier = LogisticRegression(max_iter=300, multi_class='multinomial',
                                    solver='saga', penalty='elasticnet',
                                    l1_ratio=.95)
    classifier = OneVsRestClassifier(classifier)
    count = 0
    Data = df_played_matches[['home_pos', 'visitor_pos', 'spi1', 'spi2', 'draw%', 'home_form', 'visitor_form', 'importance1', 'importance2', 'xG1', 'xG2']]
    Target = df_played_matches['home_result']
    y = np.asarray(Target)
    enc = LabelEncoder()
    label_encoder = enc.fit(y)
    y = label_encoder.transform(y)
    X = np.asarray(Data)
    n_classes = 3
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
    from imblearn.over_sampling import SVMSMOTE
    svm_smote = SVMSMOTE()  # avoid shadowing the class name
    columns = Data.columns
    up_sampled_X, up_sampled_y = svm_smote.fit_resample(X_train, y_train)
    up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
    up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])

    scaler = RobustScaler()
    scaler.fit(up_sampled_X)
    X_train = scaler.transform(up_sampled_X)
    X_test = scaler.transform(X_test)

    y_train = label_binarize(np.asarray(up_sampled_y), classes=[0, 1, 2])
    y_test = label_binarize(np.asarray(y_test), classes=[0, 1, 2])
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    data = [fpr[2], tpr[2]]
    dataset = pd.DataFrame({'FPR': data[0], 'TPR': data[1]})
    dataset.to_csv("reticulate1.csv")
Example #5
def oversampling(X_i, X_p1, X_p2, y_i, strategy='auto', ratio=None, sampling_type='svm'):
    '''
    Jointly oversample three feature blocks so the synthetic rows stay aligned.

    ratio = # majority class instances / # minority class instances

    Parameters
    ----------
    X_i, X_p1, X_p2 : arrays with the same number of rows, concatenated
        column-wise before resampling.
    y_i : class labels.
    strategy : sampling strategy used when no ratio is given.
    ratio : desired majority/minority ratio, converted to a
        sampling_strategy of 1/ratio.
    sampling_type : 'svm' for SVMSMOTE, anything else for RandomOverSampler.

    Returns
    ----------
    The three resampled feature blocks, the resampled labels, and the full
    resampled matrix.
    '''
    # Initialize and join 3 matrices
    _,ei = X_i.shape
    _,e1 = X_p1.shape
    _,e2 = X_p2.shape
    X_total = np.concatenate((X_i, X_p1, X_p2), axis = 1)
    
    # Over sampling
    if sampling_type == 'svm':
        from imblearn.over_sampling import SVMSMOTE

        if ratio is None:
            os = SVMSMOTE(sampling_strategy=strategy, random_state=0)
        else:
            os = SVMSMOTE(sampling_strategy=1/ratio, random_state=0)
    else:
        from imblearn.over_sampling import RandomOverSampler

        if ratio is None:
            os = RandomOverSampler(sampling_strategy=strategy, random_state=0)
        else:
            os = RandomOverSampler(sampling_strategy=1/ratio, random_state=0)
    
    X_res, y_res = os.fit_resample(X_total, y_i)
    
    # Separate in 3 matrices
    X_i_res = X_res[:,0:ei]
    X_p1_res = X_res[:,ei:ei+e1]
    X_p2_res = X_res[:,ei+e1:]
    
    return X_i_res, X_p1_res, X_p2_res, y_res, X_res
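A usage sketch for oversampling with toy data (shapes and values are illustrative):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=120, n_features=9, weights=[0.75, 0.25],
                           random_state=0)
X_i, X_p1, X_p2 = X[:, :3], X[:, 3:7], X[:, 7:]
X_i_res, X_p1_res, X_p2_res, y_res, _ = oversampling(X_i, X_p1, X_p2, y,
                                                     ratio=2)
# Each feature block keeps its original width after resampling.
assert X_i_res.shape[1] == 3 and X_p1_res.shape[1] == 4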
Example #6
def _SMOTE_SVM(self):
    # Oversampling - SMOTE - Synthetic Minority Over-sampling Technique
    # print('before SMOTE df', self.x_train)
    print("before SMOTE df", self.x_train.shape)
    smote = SVMSMOTE(
        k_neighbors=5, m_neighbors=5, random_state=self.seed
    )  # sampling_strategy=0.8
    self.X_train_smote, self.y_train_smote = smote.fit_resample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )

    # print('len smote: \n', len(self.X_train_smote))
    print("len new x_train after smote: \n", len(self.x_train))

    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
Example #7
def train_models(pos="F", adj=1, clusters=0, classifier=2, imb=1):
    '''
    Train models.
    :param pos: position (F/D)
    :param adj: 1 to apply league/era adjustments
    :param clusters: level of clustering to include (0-3)
    :param classifier: which classifier target to predict
    :param imb: 1 to apply oversampling, 2 to apply undersampling, 0 for neither
    :return:
    '''
    import warnings
    warnings.simplefilter(action='ignore', category=Warning)

    conn = sqlite3.connect('nhl_draft.db')

    X_train_original = pd.read_sql_query(
        '''SELECT * FROM X_TRAIN{}{}'''.format(str(classifier), pos), conn)
    X_test_original = pd.read_sql_query(
        '''SELECT * FROM X_TEST{}{}'''.format(str(classifier), pos), conn)
    y_train = pd.read_sql_query(
        '''SELECT * FROM Y_TRAIN{}{}'''.format(str(classifier), pos), conn)
    y_test = pd.read_sql_query(
        '''SELECT * FROM Y_TEST{}{}'''.format(str(classifier), pos), conn)

    if adj == 1:
        X_train = X_train_original[[
            i for i in X_train_original.columns
            if i not in ['g_gp17', 'a_gp17', 'g_gp18', 'a_gp18']
        ]]
        X_test = X_test_original[[
            i for i in X_test_original.columns
            if i not in ['g_gp17', 'a_gp17', 'g_gp18', 'a_gp18']
        ]]
    else:
        X_train = X_train_original[[
            i for i in X_train_original.columns if i not in [
                'adj_p_a17', 'adj_p_a18', 'adj_g_a17', 'adj_g_a18',
                'adj_a_a17', 'adj_a_a18'
            ]
        ]]
        X_test = X_test_original[[
            i for i in X_test_original.columns if i not in [
                'adj_p_a17', 'adj_p_a18', 'adj_g_a17', 'adj_g_a18',
                'adj_a_a17', 'adj_a_a18'
            ]
        ]]

    if clusters == 1:
        X_train = X_train[[
            i for i in X_train.columns
            if i not in ['clusters100', 'clusters200']
        ]]
        X_test = X_test[[
            i for i in X_test.columns
            if i not in ['clusters100', 'clusters200']
        ]]
    elif clusters == 2:
        X_train = X_train[[
            i for i in X_train.columns
            if i not in ['clusters50', 'clusters200']
        ]]
        X_test = X_test[[
            i for i in X_test.columns
            if i not in ['clusters50', 'clusters200']
        ]]
    elif clusters == 3:
        X_train = X_train[[
            i for i in X_train.columns
            if i not in ['clusters50', 'clusters100']
        ]]
        X_test = X_test[[
            i for i in X_test.columns
            if i not in ['clusters50', 'clusters100']
        ]]
    else:
        X_train = X_train[[
            i for i in X_train.columns
            if i not in ['clusters50', 'clusters100', 'clusters200']
        ]]
        X_test = X_test[[
            i for i in X_test.columns
            if i not in ['clusters50', 'clusters100', 'clusters200']
        ]]

    X_train, X_test = one_hot_encoding(X_train, X_test, clusters)
    X_train = X_train[[
        i for i in X_train.columns if i not in
        ['player_id', 'adj_g_a17', 'adj_g_a18', 'adj_a_a17', 'adj_a_a18']
    ]]
    X_test = X_test[[
        i for i in X_test.columns if i not in
        ['player_id', 'adj_g_a17', 'adj_g_a18', 'adj_a_a17', 'adj_a_a18']
    ]]

    X_train, X_test, selected_feat = select_features(
        X_train, y_train, X_test, GradientBoostingClassifier(), threshold=0.01)

    # models = [KNeighborsClassifier()]#, MLPClassifier(), SVC(), RandomForestClassifier(), GaussianNB(), KNeighborsClassifier()]
    models = [
        VotingClassifier(estimators=[('gbc', GradientBoostingClassifier()),
                                     ('gnb', GaussianNB())],
                         voting='soft')
    ]
    for model in models:
        print(type(model).__name__)
        if imb == 1:
            # SMOTE, SMOTENC, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE, RandomOverSampler
            smt = SVMSMOTE()
        elif imb == 2:
            smt = NearMiss()
        cv = StratifiedKFold(n_splits=3)
        for train_idx, test_idx in cv.split(X_train, y_train):
            X_train1, X_test1 = X_train[train_idx], X_train[test_idx]
            try:
                y_train1, y_test1 = y_train.loc[train_idx], y_train.loc[
                    test_idx]
            except (AttributeError, KeyError):
                # fall back to positional indexing for plain arrays
                y_train1, y_test1 = y_train[train_idx], y_train[test_idx]
            if imb != 0:
                X_train1, y_train1 = smt.fit_resample(X_train1, y_train1)
            clf = model
            clf.fit(X_train1, y_train1)
            y_train_pred = clf.predict(X_test1)
            print(confusion_matrix(y_test1, y_train_pred))
            print("Train f1: {}".format(f1_score(y_test1, y_train_pred)))
            print("Train Precision:", precision_score(y_test1, y_train_pred))
            print("Train Recall:", recall_score(y_test1, y_train_pred))
            print("Train Accuracy:", accuracy_score(y_test1, y_train_pred))
        if imb != 0:
            X_train, y_train = smt.fit_resample(X_train, y_train)
        clf = model
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
        print(confusion_matrix(y_test, y_pred))
        print("Test f1: {}".format(f1_score(y_test, y_pred)))
        print("Test Precision:", precision_score(y_test, y_pred))
        print("Test Recall:", recall_score(y_test, y_pred))
        print("Test Accuracy:", accuracy_score(y_test, y_pred))
        print()

        print(selected_feat)
        try:
            print(clf.feature_importances_)
        except AttributeError:
            # e.g. VotingClassifier exposes no feature_importances_
            pass
        clf.feature_names = selected_feat
        # save_model(clf, 'models/clf{}_{}2.sav'.format(classifier, pos.lower()))

    return
Example #8
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    km_scores['AB'] += metrics.f1_score(res, target[test_index])
    km_con_mat['AB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

    # Gradient Boost Classifier
    gradBoost = GradientBoostingClassifier(random_state=0)
    gradBoost.fit(X_train, y_train)
    res = gradBoost.predict(features[test_index])
    km_scores['GB'] += metrics.f1_score(res, target[test_index])
    km_con_mat['GB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

    # SVM Smote
    svm_smote = SVMSMOTE(random_state=0)
    X_train, y_train = svm_smote.fit_resample(features[train_index],
                                              target[train_index])

    # Logistic Regression
    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)
    res = logistic.predict(features[test_index])
    svm_sm_scores['LR'] += metrics.f1_score(res, target[test_index])
    svm_sm_con_mat['LR'] += confusion_matrix(y_true=target[test_index],
                                             y_pred=res)
    #
    # Ada Boost Classifier
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    svm_sm_scores['AB'] += metrics.f1_score(res, target[test_index])
    svm_sm_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
                                             y_pred=res)
Example #9
    def runSMOTEvariationsGen(self, folder):
        """
        Create files with SMOTE preprocessing and without preprocessing.
        :param datasets: datasets.
        :param folder:   cross-validation folders.
        :return:
        """
        smote = SMOTE()
        borderline1 = BorderlineSMOTE(kind='borderline-1')
        borderline2 = BorderlineSMOTE(kind='borderline-2')
        smoteSVM = SVMSMOTE()
        geometric_smote = GeometricSMOTE(n_jobs=-1)

        for dataset in datasets:  # binary and multiclass
            for fold in range(5):
                path = os.path.join(folder, dataset, str(fold),
                                    ''.join([dataset, "_train.csv"]))
                train = np.genfromtxt(path, delimiter=',')
                X = train[:, 0:train.shape[1] - 1]
                Y = train[:, train.shape[1] - 1]

                # SMOTE
                print("SMOTE..." + dataset)
                X_res, y_res = smote.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                             ''.join([dataset, "_SMOTE.csv"])),
                                header=False,
                                index=False)
                # SMOTE BORDERLINE1
                print("Borderline1..." + dataset)
                X_res, y_res = borderline1.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Borderline1.csv"])),
                                header=False,
                                index=False)
                # SMOTE BORDERLINE2
                print("Borderline2..." + dataset)
                X_res, y_res = borderline2.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Borderline2.csv"])),
                                header=False,
                                index=False)
                # SMOTE SVM
                print("SMOTE SVM..." + dataset)
                X_res, y_res = smoteSVM.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_smoteSVM.csv"])),
                                header=False,
                                index=False)

                # GEOMETRIC SMOTE
                print("GEOMETRIC SMOTE..." + dataset)
                X_res, y_res = geometric_smote.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newdata = np.hstack([X_res, y_res])
                newtrain = pd.DataFrame(newdata)
                newtrain.to_csv(os.path.join(
                    folder, dataset, str(fold),
                    ''.join([dataset, "_Geometric_SMOTE.csv"])),
                                header=False,
                                index=False)
Example #10
# Unscaled Features
X = data_with_targets.drop([target_variable], axis=1)

# Target Variable
y = data_with_targets[target_variable]

#####################################################
### SMOTE Sampling to deal with imbalance classes ###
#####################################################

# Setting Seed Value
seed = 81

smote = SVMSMOTE(random_state=seed)

# NB: resampling before the train/test split lets synthetic points derived
# from test-set neighbours leak into training.
resampled_X, resampled_y = smote.fit_resample(X, y)

##########################################
### Splitting into Train and Test sets ###
##########################################

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    resampled_X,
    resampled_y,
    test_size=0.3,
    stratify=resampled_y,
    random_state=seed)

###################################
### Scaling Train and Test sets ###
###################################
Example #11
from imblearn.over_sampling import SVMSMOTE


def upsampleSvmSmote(params, X, y):
    # params is forwarded verbatim to the SVMSMOTE constructor.
    svmsmote = SVMSMOTE(**params)
    X_rs, y_rs = svmsmote.fit_resample(X, y)
    return X_rs, y_rs
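A usage sketch with toy data (the params shown are illustrative, not required):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=150, weights=[0.8, 0.2], random_state=42)
params = {"random_state": 42, "k_neighbors": 5}
X_balanced, y_balanced = upsampleSvmSmote(params, X, y)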
Example #12
# BLSM (Borderline SMOTE)
from imblearn.over_sampling import BorderlineSMOTE
sm2 = BorderlineSMOTE(random_state=42)  # apply the BLSM algorithm
X_train_res2, y_train_res2 = sm2.fit_resample(X_train, y_train.ravel())  # oversample
lgbm_clf3 = lgbm.LGBMClassifier(n_estimators=50, random_state=42)  # LGBM classifier
lgbm_clf3.fit(X_train_res2, y_train_res2)  # train
y_pred3 = lgbm_clf3.predict(X_test)  # predict on the evaluation set
print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred3))  # confusion matrix
print('\n')
print("Model Evaluation Result: \n", classification_report(y_test, y_pred3))  # overall evaluation

# SVMSMOTE
from imblearn.over_sampling import SVMSMOTE
sm3 = SVMSMOTE(random_state=42)  # apply the SVMSMOTE algorithm
X_train_res3, y_train_res3 = sm3.fit_resample(X_train, y_train.ravel())  # oversample
lgbm_clf4 = lgbm.LGBMClassifier(n_estimators=50, random_state=42)  # LGBM classifier
lgbm_clf4.fit(X_train_res3, y_train_res3)  # train
y_pred4 = lgbm_clf4.predict(X_test)  # predict on the evaluation set
print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred4))  # confusion matrix
print('\n')
print("Model Evaluation Result: \n", classification_report(y_test, y_pred4))  # overall evaluation

# Training set oversampled with BLSM (Borderline SMOTE): X_train_res2, y_train_res2
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C=1e10)
# sklearn's LogisticRegression includes Ridge regularization by default,
# so we set C large to suppress the regularization term
# (C is the inverse of regularization strength).
lr_model.fit(X_train_res2, y_train_res2)  # fit the logistic regression model
lr_predict = lr_model.predict(X_test)  # predict the validation data with the fitted model
print("Confusion_Matrix: \n", confusion_matrix(y_test, lr_predict))  # confusion matrix
Example #13
# - On these **97 examples** we'll apply **SVMSMOTE** and **train a statistical model.** That model will then **predict** on the **validation set.**
#
#
# - imblearn is a **dedicated module** for working with **imbalanced datasets**; its **SVMSMOTE** class makes the technique easy to apply.
#
#
# - We use **"not majority"** as the **sampling strategy** parameter: it oversamples every **minority class** but **not the majority**, which is what we need for **multi-class imbalanced data.**

# In[68]:

#Applying SVMSMOTE

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE

smote = SVMSMOTE(sampling_strategy='not majority')
x_s_res, y_s_res = smote.fit_resample(x_train, y_train)

print(y_train.value_counts(), '\n')
np.bincount(y_s_res)

# - We can see from the output above that :<b>
#     - Class 1 : 35 Samples
#     - Class 2 : 22 Samples
#     - Class 3 : 22 Samples
#     - Class 0 : 08 Samples
#     - Class 4 : 06 Samples
#
#
# - And after applying **SVMSMOTE** we get **26 samples of one class & 36 samples of each of the remaining classes.**

# ### Building a Statistical model using SVM Classifier
Example #14
X_train = df_train[[
    'home_pos', 'visitor_pos', 'spi1', 'spi2', 'win%', 'loss%', 'importance1',
    'importance2', 'xG1', 'xG2'
]]
y_train = df_train['home_result']
X_test = df_test[[
    'home_pos', 'visitor_pos', 'spi1', 'spi2', 'win%', 'loss%', 'importance1',
    'importance2', 'xG1', 'xG2'
]]

from imblearn.over_sampling import SVMSMOTE

svm_smote = SVMSMOTE()  # avoid shadowing the class name
columns = X_train.columns
up_sampled_X, up_sampled_y = svm_smote.fit_resample(X_train, y_train)
up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])

from sklearn import preprocessing

scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(up_sampled_X)
X_train = scaler.transform(up_sampled_X)
X_test = scaler.transform(X_test)

classifier = LogisticRegression(max_iter=10000,
                                multi_class='multinomial',
                                solver='saga',
                                penalty='elasticnet',
                                l1_ratio=.5)
Example #15
from imblearn.over_sampling import SVMSMOTE


def balance_dataset(df, dfLabel, strategy="all"):
    # Resample every class ("all") unless another strategy is given.
    sm = SVMSMOTE(random_state=42, sampling_strategy=strategy)
    trainOver, labelOver = sm.fit_resample(df, dfLabel)
    return trainOver, labelOver
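A usage sketch with toy data (illustrative only):

from sklearn.datasets import make_classification

features, labels = make_classification(n_samples=150, weights=[0.8, 0.2],
                                       random_state=42)
balanced_X, balanced_y = balance_dataset(features, labels)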