Exemplo n.º 1
0
def runBestClassificationKFold(dataSets=None, Classifiers=None, names=None):
    """Select, per data set, the classifier with the highest mean ROC AUC.

    Each data set is loaded through ``dataEncoding``, its targets are
    label-encoded, and every classifier is scored with ``cvs`` over 10
    splits produced by ``sss`` using ``scoring='roc_auc'``.

    :param dataSets: keys accepted by ``dataEncoding`` and ``filesBinClass``.
    :param Classifiers: estimator objects to compare.
    :param names: keys into ``ClassifiersNames``; ``names[i]`` labels
        ``Classifiers[i]``.
    :return: dict mapping ``filesBinClass[ds]`` to
        ``{1: best classifier name, 2: its per-split scores, 3: its mean score}``.
        A data set for which no classifier produced a score greater than
        ``-inf`` (no classifiers given, or only NaN scores) is left out.
    """
    dataSets = [] if dataSets is None else dataSets
    Classifiers = [] if Classifiers is None else Classifiers
    names = [] if names is None else names

    myResults = {}
    le = pre.LabelEncoder()

    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesBinClass')
        le.fit(myVal)
        myVal = le.transform(myVal)
        # Algebraically this is 0.20; the expression is kept as written so the
        # resulting split sizes stay bit-identical to earlier runs.
        splits = sss(n_splits=10,
                     test_size=((len(myData) * .20) / len(myData)),
                     random_state=42)

        # Reset the winner for every data set so a result can never leak over
        # from the previous one.
        bestAUC = float("-inf")
        best = None
        for position, clf in enumerate(Classifiers):
            # NOTE(review): this fit is not needed for the scoring below if
            # ``cvs`` is sklearn's cross_val_score (it clones the estimator);
            # kept because callers may rely on the estimators being fitted.
            clf.fit(myTrain, myVal)
            cvsScores = cvs(clf, myTrain, myVal, cv=splits, scoring='roc_auc')
            meanAUC = cvsScores.mean()
            label = ClassifiersNames[names[position]]
            print(label, meanAUC)
            if meanAUC > bestAUC:
                bestAUC = meanAUC
                best = (label, cvsScores, meanAUC)

        if best is None:
            print(filesBinClass[ds], 'no classifier produced a comparable score')
            print('\n')
            continue

        print(filesBinClass[ds], best[0], bestAUC)
        myResults[filesBinClass[ds]] = {1: best[0], 2: best[1], 3: best[2]}
        print('\n')
    return myResults
Exemplo n.º 2
0
def cv_clf(x,
           y,
           test_size=0.2,
           n_splits=5,
           random_state=None,
           doesUpsample=True):
    """Yield (train indices, validation indices) pairs for cross-validation.

    The pairs come from ``sss(...).split(x, y)``. When ``doesUpsample`` is
    true, the training indices of every pair are first passed through
    ``upsample_indices_clf`` together with the matching targets; otherwise
    the pairs are yielded unchanged.

    :param x: feature data handed to the splitter.
    :param y: targets; must support indexing with an index array.
    :param test_size: forwarded to the splitter.
    :param n_splits: forwarded to the splitter.
    :param random_state: forwarded to the splitter.
    :param doesUpsample: whether to upsample the training indices.
    :return: generator of ``(train_inds, valid_inds)`` tuples.
    """
    # Keyword arguments: recent scikit-learn releases accept ``test_size``
    # only by keyword.
    sss_obj = sss(n_splits=n_splits,
                  test_size=test_size,
                  random_state=random_state).split(x, y)
    for train_inds, valid_inds in sss_obj:
        if doesUpsample:
            train_inds = upsample_indices_clf(train_inds, y[train_inds])
        yield (train_inds, valid_inds)
Exemplo n.º 3
0
def cv_clf(x,
           y,
           test_size=0.2,
           n_splits=5,
           random_state=None,
           doesUpsample=True):
    """Generate cross-validation index pairs, with optional upsampling.

    Splits are produced by ``sss(...).split(x, y)``. With ``doesUpsample``
    set, each pair's training indices are replaced by the result of
    ``upsample_indices_clf(train_index, y[train_index])``; without it the
    splitter's pairs are passed through untouched.

    :param x: feature data handed to the splitter.
    :param y: targets; must support indexing with an index array.
    :param test_size: forwarded to the splitter.
    :param n_splits: forwarded to the splitter.
    :param random_state: forwarded to the splitter.
    :param doesUpsample: whether to upsample the training indices.
    :return: generator of ``(train_index, test_index)`` tuples.
    """
    splitter = sss(n_splits=n_splits,
                   test_size=test_size,
                   random_state=random_state).split(x, y)

    if not doesUpsample:
        # Hand out the individual pairs, not the generator object itself,
        # and stop so the upsampling loop below is not entered.
        for train_index, test_index in splitter:
            yield (train_index, test_index)
        return

    for train_index, test_index in splitter:
        yield (upsample_indices_clf(train_index, y[train_index]), test_index)
Exemplo n.º 4
0
def cv_clf(x, y,
           test_size = 0.2, n_splits = 5, random_state=None,
           doesUpsample = True):
    """
    an iterator of cross-validation groups with optional upsampling
    :param x: feature data handed to the splitter
    :param y: targets; must support indexing with an index array
    :param test_size: forwarded to the splitter
    :param n_splits: forwarded to the splitter
    :param random_state: forwarded to the splitter
    :param doesUpsample: when true, the training indices of each group are
        passed through ``upsample_indices_clf``; when false the splitter's
        groups are yielded unchanged
    :return: generator of (train_inds, valid_inds) tuples
    """

    # keyword arguments: recent scikit-learn releases accept ``test_size``
    # only by keyword
    sss_obj = sss(n_splits=n_splits,
                  test_size=test_size,
                  random_state=random_state).split(x, y)

    # no upsampling needed: because this function is a generator, a plain
    # ``return sss_obj`` would produce an empty iterator, so delegate instead
    if not doesUpsample:
        yield from sss_obj
        return

    # with upsampling
    for train_inds, valid_inds in sss_obj:
        yield (upsample_indices_clf(train_inds, y[train_inds]), valid_inds)
Exemplo n.º 5
0
    def __init__(self,
                 Cs=500,
                 cv=10,
                 sampler='skf',
                 solver='liblinear',
                 **kwargs):
        """Store the model settings and build the cross-validation splitter.

        :param Cs: stored unchanged as ``self.Cs``.
        :param cv: number of splits; stored as ``self.cv_folds`` and passed
            as ``n_splits`` to the chosen splitter.
        :param sampler: one of ``'skf'``, ``'sss'``, ``'kf'`` or ``'ss'``;
            selects which splitter is stored in ``self.cv``.
        :param solver: stored unchanged as ``self.solver``.
        :param kwargs: every item is set as an attribute on the instance
            after the defaults, so it can override any of them.
        :raises ValueError: if ``sampler`` is not one of the accepted keys.
        """
        # NOTE(review): super(self.__class__, self) recurses forever as soon
        # as this class is subclassed. Replace with super().__init__() on
        # Python 3, or super(<ThisClass>, self) — the class name is not
        # visible from this block.
        super(self.__class__, self).__init__()

        self.penalty = 'l1'
        self.solver = solver
        self.Cs = Cs
        self.sampler = sampler
        self.cv_folds = cv

        if self.sampler == 'skf':
            self.cv = skf(n_splits=self.cv_folds)

        elif self.sampler == 'sss':
            self.cv = sss(n_splits=self.cv_folds)

        elif self.sampler == 'kf':
            self.cv = kf(n_splits=self.cv_folds)

        elif self.sampler == 'ss':
            self.cv = ss(n_splits=self.cv_folds)

        else:
            # ValueError is a subclass of Exception, so existing
            # ``except Exception`` handlers keep working.
            raise ValueError(
                'Selected sampler is not valid. Please choose '
                '"skf" for stratified K-fold or "sss" for '
                'stratified shuffle split. Also "kf" and "ss" for '
                'the respective non-stratified methods.')

        # Applied last on purpose: explicit keyword arguments win over the
        # defaults assigned above.
        for k, v in kwargs.items():
            setattr(self, k, v)

        self.x = None
        self.y = None
Exemplo n.º 6
0
# Correlation heatmap of the numeric/boolean columns only: 'Geography' and
# 'Gender' are still categorical here, and pandas >= 2.0 raises if corr() is
# given non-numeric columns.
sns.heatmap(bank.select_dtypes(include=['number', 'bool']).corr())

# Dummy (one-hot) encoding. drop_first=True drops one level per feature to
# avoid the dummy-variable trap; the dropped level is implied by the others.
dummy = pd.get_dummies(bank.loc[:, ['Geography', 'Gender']], drop_first=True)
bank.drop(['Geography', 'Gender'], axis=1, inplace=True)
new_bank = pd.concat([bank, dummy], axis=1)
new_bank.head()
new_bank.dtypes

# Data completely overlapped, so apply the KNN, RandomForest and XGBoost models.

# Training and testing data: stratified sampling.
# Because y is categorical, it makes sense to use stratified sampling.
from sklearn.model_selection import StratifiedShuffleSplit as sss
split = sss(n_splits=5, test_size=0.2, random_state=42)
# Only the train/test frames of the last of the 5 splits survive this loop.
for train_index, test_index in split.split(new_bank, new_bank['Exited']):
    # split() returns positional indices, so index by position (.iloc);
    # label-based .loc is only correct for a default RangeIndex.
    bank_train = new_bank.iloc[train_index]
    bank_test = new_bank.iloc[test_index]

y = bank_train['Exited']
X = bank_train.drop(['Exited'], axis=1)

y_t = bank_test['Exited']
X_t = bank_test.drop(['Exited'], axis=1)

#----------------------------KNN----------------------------------------------
# KNN (k-nearest neighbours): a new data point is classified by finding its k nearest
# training points and assigning it the majority class among them; when the vote is ambiguous, the choice is based on distance.

from sklearn.neighbors import KNeighborsClassifier as KNC
def cross_validation(training_data, kfolds, model, model_name, verbose=False):
    """Score ``model`` over repeated splits and summarise each metric.

    The ``response`` column of ``training_data`` is the target and all other
    columns are the features. For each of the ``kfolds`` splits produced by
    ``sss`` the model is fitted on the training rows, used to predict the
    held-out rows, and scored with ``metric_scores``.

    :param training_data: DataFrame containing a ``response`` column.
    :param kfolds: number of splits requested from ``sss``.
    :param model: estimator exposing ``fit`` (returning the fitted model)
        and ``predict``.
    :param model_name: value placed in the ``Model name`` column.
    :param verbose: accepted but not used.
    :return: single-row DataFrame holding ``'<mean>+/-<std>'`` strings
        (both rounded to 4 decimals) for every metric.
    """
    features = training_data.drop(['response'], axis=1)
    target = training_data.response

    splitter = sss(n_splits=kfolds)

    # Metric keys in the order they are read from the score table and in
    # which they appear as columns of the result.
    metric_keys = ('accuracy', 'balanced_accuracy', 'precision',
                   'precision_0', 'recall', 'specificity', 'F1',
                   'F1_weighted', 'G_mean')
    per_split = {key: [] for key in metric_keys}

    for fit_rows, holdout_rows in splitter.split(features, target):
        X_fit = features.iloc[fit_rows]
        X_holdout = features.iloc[holdout_rows]
        y_fit = target.iloc[fit_rows]
        y_holdout = target.iloc[holdout_rows]

        fitted = model.fit(X_fit, y_fit)
        predictions = fitted.predict(X_holdout)

        split_scores = metric_scores(y_holdout, predictions)
        for key in metric_keys:
            per_split[key].append(split_scores[key])

    def summarise(values):
        # '<mean>+/-<std>', each rounded to 4 decimals.
        centre = np.round(np.mean(values), 4).astype(str)
        spread = np.round(np.std(values), 4).astype(str)
        return centre + '+/-' + spread

    report = {'Model name': model_name}
    for key in metric_keys:
        report[key] = summarise(per_split[key])

    return pd.DataFrame(report, index=[0])
Exemplo n.º 8
0
# In[49]:

pd.plotting.scatter_matrix(pokemon, figsize=(20, 20))

# In[50]:

from sklearn.model_selection import StratifiedShuffleSplit as sss

# In[51]:

len(scaled_pokemon)

# In[52]:

k = sss(n_splits=1, test_size=0.2, train_size=0.8)

# Draw the split exactly once. No random_state is set, so every call to
# k.split() produces a different shuffle; taking the train part from one call
# and the test part from another can put the same rows in both sets.
train_idx, test_idx = (list(part)
                       for part in next(k.split(scaled_pokemon, label)))

# In[53]:

pokemon_tensor = torch.utils.data.TensorDataset(
    torch.tensor(np.array(scaled_pokemon)), torch.tensor(np.array(label)))

# In[54]:


def split_data(datasets,
               train_idx,
               test_idxsamplers=torch.utils.data.SubsetRandomSampler,