Example #1
def run_model_with_smote(dataset,target,model):
    try:        
        if isinstance(model, (LogisticRegression, KNeighborsClassifier)):
            X_new = build_dataset(dataset,1)
        else:
            X_new = build_dataset(dataset) 
        sm = SMOTE(random_state =42)        
        X_smote,y_smote = sm.fit_resample(X_new,target)
        X_train,X_test,y_train,y_test = train_test_split(X_smote,y_smote,random_state = 42, test_size =0.2,stratify = y_smote)
        print('Class ratio after applying SMOTE : \n',check_imbalance(y_smote))
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        print('===='*20)
        print(type(model))
        print('===='*20)
        print('Classification Report : \n',metrics.classification_report(y_test,y_pred))
        print('Confusion Matrix : \n',metrics.confusion_matrix(y_test,y_pred))
        print('Accuracy score: \n',metrics.accuracy_score(y_test, y_pred))
        fpr,tpr,threshold = metrics.roc_curve(y_test,y_pred)
        plt.plot(fpr, tpr)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title('ROC curve')
        plt.show()
        print('AUC : ',metrics.roc_auc_score(y_test, y_pred))
        return model,X_train.columns.tolist()
    except Exception as e :
        print('run_model_with_smote failed : \n', str(e))
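Note that run_model_with_smote oversamples before splitting, so synthetic points interpolated from (future) test-set neighbors leak into training and inflate the test scores. A minimal sketch of the leakage-free order on toy data (split first, then resample only the training fold; make_classification stands in for build_dataset output):

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Toy imbalanced data standing in for build_dataset() output.
X_new, target = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                    random_state=42)

# Split first so the test fold contains only original observations...
X_train, X_test, y_train, y_test = train_test_split(
    X_new, target, random_state=42, test_size=0.2, stratify=target)

# ...then resample the training fold only.
X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)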
Example #2
def SMOTE_sampling(self, ds):
    self.report.append('SMOTE_sampling')
    Y = ds["Response"]
    X = ds.drop(columns=["Response"])
    sm = SMOTE(random_state=self.seed)
    X_res, Y_res = sm.fit_resample(X, Y)
    # Rebuild a frame with the original feature names; "Response" goes back last.
    # (Reassigning ds.columns here would mislabel columns whenever "Response"
    # is not the last column of ds.)
    sampled_ds = pd.DataFrame(X_res, columns=X.columns)
    sampled_ds['Response'] = Y_res
    # sampled_ds.index=ds.index
    return round(sampled_ds, 2)
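As an aside, recent imbalanced-learn versions return pandas objects when given pandas input, so the array-to-frame bookkeeping above can be skipped; a standalone sketch under that assumption (the function name and defaults are illustrative):

import pandas as pd
from imblearn.over_sampling import SMOTE

def smote_sampling_df(ds, seed=42, target="Response"):
    # fit_resample keeps DataFrame/Series types and column names intact.
    X = ds.drop(columns=[target])
    y = ds[target]
    X_res, y_res = SMOTE(random_state=seed).fit_resample(X, y)
    return pd.concat([X_res, y_res], axis=1)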
Example #3
def SMOTE_NC(self):
    self.report.append('SMOTE_NC_sampling')
    Y = self.training["Response"]
    X = self.training.drop(columns=["Response"])
    x_cols = X.columns
    # Select categorical columns from X itself; masking X with
    # self.training.dtypes would misalign once "Response" is dropped.
    cat_cols = X.select_dtypes('category').columns
    if len(cat_cols) > 0:
        # SMOTENC needs positional indices of the categorical columns within X,
        # so look them up in x_cols, not in the categorical subset.
        sm = SMOTENC(random_state=self.seed,
                     categorical_features=[x_cols.get_loc(col) for col in cat_cols])
    else:
        sm = SMOTE(random_state=self.seed)
    X_res, Y_res = sm.fit_resample(X.values, Y.values)
    sampled_ds = pd.DataFrame(X_res, columns=x_cols)
    sampled_ds['Response'] = Y_res
    # sampled_ds.index=ds.index
    self.training = sampled_ds
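Because SMOTENC takes positions of the categorical columns within X, it is easy to pass indices computed against the wrong frame; a small self-contained check (the toy data and column names are illustrative, not from the original):

import pandas as pd
from imblearn.over_sampling import SMOTENC

# Hypothetical toy frame: one numeric and one categorical feature.
df = pd.DataFrame({
    "age":  [25, 32, 47, 51, 62, 23, 44, 36, 29],
    "plan": pd.Categorical(["a", "b", "a", "a", "b", "a", "b", "a", "b"]),
    "Response": [0, 0, 0, 0, 0, 0, 1, 1, 1],
})
X, y = df.drop(columns=["Response"]), df["Response"]

# Positions must index into X's columns, not into the categorical subset.
cat_idx = [X.columns.get_loc(c) for c in X.select_dtypes("category").columns]
X_res, y_res = SMOTENC(categorical_features=cat_idx, random_state=0,
                       k_neighbors=2).fit_resample(X, y)
print(y_res.value_counts())  # classes are now balanced 6:6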
Example #4
def boosting_with_smote(dataset,target):
    '''
    Use SMOTE to oversample minority class and find accuracy using XGBoost

    Parameters
    ----------
    dataset : DataFrame
        Feature data; passed to build_dataset for encoding before resampling.
    target : Series
        Class labels to balance with SMOTE.

    Returns
    -------
    None.

    '''
    try:
        xgb = XGBClassifier(random_state = 42)
        X_new = build_dataset(dataset)
        sm = SMOTE(sampling_strategy = 'minority' ,random_state = 10)
        X_smote,y_smote = sm.fit_resample(X_new, target)
        print('Shape after SMOTE : ',X_smote.shape)
        X_train,X_test,y_train,y_test = train_test_split(X_smote,y_smote,random_state = 42, test_size =0.2,stratify = y_smote)
        xgb.fit(X_train,y_train)
        y_pred = xgb.predict(X_test)
        print('===='*20)
        print(type(xgb))
        print('===='*20)
        print('Classification Report : \n',metrics.classification_report(y_test,y_pred))
        print('Confusion Matrix : \n',metrics.confusion_matrix(y_test,y_pred))
        print('Accuracy score: \n',metrics.accuracy_score(y_test, y_pred))
        fpr,tpr,threshold = metrics.roc_curve(y_test,y_pred)
        plt.plot(fpr, tpr)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title('ROC curve')
        plt.show()
        print('AUC : ',metrics.roc_auc_score(y_test, y_pred))
    except Exception as e:
        print('boosting_with_smote failed : \n', str(e))
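For reference, sampling_strategy controls how far the minority class is grown; a short sketch on synthetic data (the make_classification toy set is illustrative, not from the original):

from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                   random_state=10)
print(Counter(y_toy))  # roughly 900 majority / 100 minority

# 'minority' (as used above): grow the minority class up to the majority count.
X1, y1 = SMOTE(sampling_strategy='minority', random_state=10).fit_resample(X_toy, y_toy)
print(Counter(y1))  # balanced

# A float sets the desired minority/majority ratio instead.
X2, y2 = SMOTE(sampling_strategy=0.5, random_state=10).fit_resample(X_toy, y_toy)
print(Counter(y2))  # about one minority sample per two majority samples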

Example #5

# # Over-Sampled Model

# ## Using SMOTE, we over-sample the minority class (MENTHLTH2 = 1), taking care to do the train/test split before proceeding with re-sampling.

# In[100]:


from imblearn.over_sampling import SMOTENC

# setting up testing and training sets
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.3, random_state=0)

sm = SMOTENC(categorical_features=[1,2,3,4,5,6,7,8,9,10,11,12,13], sampling_strategy='minority', random_state=0, k_neighbors=5)
X_train_over, y_train_over = sm.fit_resample(X_train3, y_train3)

# describes info about train and test set 
print("Number of rows/columns in X_test3 dataset: ", X_test3.shape) 
print("Number of rows/columns in y_test3 dataset: ", y_test3.shape) 
print("Number of rows/columns in X_train_over dataset: ", X_train_over.shape) 
print("Number of rows/columns in y_train_over dataset: ", y_train_over.shape) 


# In[101]:


unique, counts = np.unique(y_train3, return_counts=True)
dict(zip(unique, counts))

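# For comparison, the same frequency check on the resampled labels (reusing np and y_train_over from above) should show both MENTHLTH2 classes at equal counts:

unique_over, counts_over = np.unique(y_train_over, return_counts=True)
dict(zip(unique_over, counts_over))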
Example #6
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import _tree

X = users[features].values
y = users[['sortie_client']].values.flatten()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001)
df = pd.DataFrame(X_train,
                  columns=[
                      'mariee', 'retraite', 'a_charge', 'facture_mensuelle',
                      'telephone', 'plusieurs_numeros', 'internet',
                      'total_factures', 'contrat', 'facture_par_mail',
                      'client_depuis_mois'
                  ])
columns = df.columns

sm = SMOTE(random_state=42, sampling_strategy=1)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

nn = 0


def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    # print ("def tree({}):" .format(", " .join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            # Internal node: print the split and recurse into both children.
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Leaf node: print the class counts stored at the leaf.
            print("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)
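A hedged usage sketch for tree_to_code: fit a small tree on the resampled data above and print its rules (the DecisionTreeClassifier and its max_depth are illustrative choices, not from the original):

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X_train_sm, y_train_sm)
tree_to_code(clf, list(columns))  # prints nested if/else rules over the feature names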
Example #7
# Cast the one-hot loan-purpose columns to float in one pass.
float_cols = ['educational', 'home_improvement', 'house', 'major_purchase',
              'medical', 'moving', 'other', 'renewable_energy',
              'small_business', 'vacation', 'wedding', 'w']
X_train[float_cols] = X_train[float_cols].astype(float)

# In[119]:

# `sm` is assumed to be a SMOTE instance created in an earlier cell,
# e.g. sm = SMOTE(random_state=42).
X_train_smote, y_train_smote = sm.fit_resample(X_train.astype('float'), y_train)

# In[120]:

from collections import Counter
print("Before SMOTE :", Counter(y_train))
print("After SMOTE :", Counter(y_train_smote))

# # Logistic Regression after Balancing

# In[121]:

logreg = LogisticRegression()
logreg.fit(X_train_smote, y_train_smote)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(X_test, y_test)))

Example #8
X = telcom.iloc[:, 1:32]  # drop the customer ID column; it won't be used to predict churn.
X = X.drop(['Churn'], axis=1)  # keep only the independent variables.
X = X.astype(float)  # same dataframe, but with every column cast to float.
Z = X.values  # save X as an array under the name Z.
y = telcom['Churn'].values  # dependent variable, as an array.
features = [i for i in telcom.columns if i not in Id_col + target_col]  # names of all predictor variables.


# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 0.2, random_state=44)



# Implement SMOTE-NC; `categorical_features` (positions of the categorical
# columns in Z) is assumed to be defined earlier in the script.
sm = SMOTENC(random_state=40, categorical_features=categorical_features)
X_smote, y_smote = sm.fit_resample(X_train, y_train)

# Train the model on the resampled training dataset. No more unbalanced classes. 
X_train = X_smote
y_train = y_smote





####### MODEL IMPLEMENTATION


# Fitting tuned XGBoost to the Training set
classifier = XGBClassifier(n_estimators=150, learning_rate=0.15, max_depth=3, min_child_weight=0.6,
                           colsample_bytree=1, subsample=1, reg_alpha=0, gamma=0)
classifier.fit(X_train, y_train)
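The snippet stops after fitting; a hedged continuation that scores the tuned model on the untouched test fold (the metric choices are illustrative):

from sklearn import metrics

y_pred = classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print('ROC AUC:', metrics.roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1]))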
Example #9

from collections import Counter
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

kfold, scores = KFold(n_splits=5, shuffle=True, random_state=0), list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y[train], Y[test]

    num_class1, num_class2, num_class3 = Counter(y_train)[1], Counter(
        y_train)[2], Counter(y_train)[3]
    sm = SMOTE(random_state=27,
               sampling_strategy={
                   1: int(2.0 * num_class1),
                   2: int(1.6 * num_class2),
                   3: int(1.6 * num_class3)
               })
    x_train, y_train = sm.fit_resample(x_train, y_train)

    model = LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
    model.fit(x_train, y_train, categorical_feature=[1, 2, 4, 5, 11])
    preds = model.predict(x_test)
    score = f1_score(y_test, preds, average="weighted")
    scores.append(score)
    print(score)
print("Average: ", sum(scores) / len(scores))

## Make the final prediction using LightGBM

# We apply SMOTE to all classes, increasing the total sample size of each class;
# this generalizes the decision boundary.
num_class1, num_class2, num_class3 = Counter(Y)[1], Counter(Y)[2], Counter(Y)[3]
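The snippet ends after recomputing the full-data class counts; a hedged sketch of the final-prediction step the comments set up (refit on all resampled training data; X_final_test is an assumed name for the held-out features):

sm = SMOTE(random_state=27,
           sampling_strategy={
               1: int(2.0 * num_class1),
               2: int(1.6 * num_class2),
               3: int(1.6 * num_class3)
           })
X_res, Y_res = sm.fit_resample(X_train, Y)

final_model = LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
final_model.fit(X_res, Y_res, categorical_feature=[1, 2, 4, 5, 11])
final_preds = final_model.predict(X_final_test)  # X_final_test: assumed held-out set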