def runBestClassificationKFold(dataSets=None, Classifiers=None, names=None):
    """Select, per data set, the classifier with the best mean cross-validated ROC AUC.

    Each data set is loaded with ``dataEncoding(ds, taskID='filesBinClass')``,
    its target is label-encoded, and every classifier is scored with ``cvs``
    (``scoring='roc_auc'``) on 10 ``sss`` splits using a 20% test share and
    ``random_state=42``. Progress is printed as a side effect.

    :param dataSets: keys accepted by ``dataEncoding`` and ``filesBinClass``.
    :param Classifiers: estimators to compare. Each one is also fitted on the
        full data set as a side effect.
    :param names: ``names[i]`` is the ``ClassifiersNames`` key belonging to
        ``Classifiers[i]``.
    :return: dict keyed by ``filesBinClass[ds]``. Every value is
        ``{1: best classifier name, 2: its per-split scores, 3: its mean AUC}``.
        When no classifier produced a comparable score (empty ``Classifiers``
        or only NaN means) the entry is ``{1: None, 2: None, 3: -inf}``.
    """
    # None sentinels instead of shared mutable default lists.
    dataSets = [] if dataSets is None else dataSets
    Classifiers = [] if Classifiers is None else Classifiers
    names = [] if names is None else names

    myResults = {}
    for ds in dataSets:
        _, myTrain, myVal = dataEncoding(ds, taskID='filesBinClass')
        # A fresh encoder per data set keeps every label mapping independent.
        myVal = pre.LabelEncoder().fit_transform(myVal)
        # Optional univariate feature selection (disabled):
        # myTrain = skb(f_regression, k=6).fit_transform(myTrain, myVal)
        # myTrain = skb(chi2, k=5).fit_transform(myTrain, myVal)
        splits = sss(n_splits=10, test_size=0.2, random_state=42)
        # Alternative splitter (disabled):
        # splits = kf(n_splits=10, shuffle=True, random_state=42)

        # The winner is tracked as one unit and reset for every data set, so
        # the reported name, scores and mean always belong together.
        bestName, bestScores, bestAUC = None, None, float("-inf")
        for count, clf in enumerate(Classifiers):
            # Kept from the original: callers may rely on the estimators
            # being fitted on the full data after this function returns.
            clf.fit(myTrain, myVal)
            cvsScores = cvs(clf, myTrain, myVal, cv=splits, scoring='roc_auc')
            meanAUC = cvsScores.mean()
            clfName = ClassifiersNames[names[count]]
            print(clfName, meanAUC)
            if meanAUC > bestAUC:
                bestName, bestScores, bestAUC = clfName, cvsScores, meanAUC

        print(filesBinClass[ds], bestName, bestAUC)
        myResults[filesBinClass[ds]] = {1: bestName, 2: bestScores, 3: bestAUC}
        print('\n')
    return myResults
def cv_clf(x, y, test_size=0.2, n_splits=5, random_state=None, doesUpsample=True):
    """Generate cross-validation index pairs, optionally upsampling the training part.

    :param x: samples handed to the ``sss`` splitter.
    :param y: labels handed to the splitter; must support ``y[train_inds]``.
    :param test_size: test share forwarded to ``sss``.
    :param n_splits: number of splits forwarded to ``sss``.
    :param random_state: seed forwarded to ``sss``.
    :param doesUpsample: when false, the splitter's pairs are yielded
        unchanged; when true, the training indices of every pair are replaced
        by ``upsample_indices_clf(train_inds, y[train_inds])``.
    :return: generator of ``(train_indices, valid_indices)`` pairs.
    """
    # Keyword arguments: test_size is keyword-only in current scikit-learn.
    sss_obj = sss(n_splits=n_splits, test_size=test_size,
                  random_state=random_state).split(x, y)

    if not doesUpsample:
        # Delegate to the plain splits and stop; a bare ``yield sss_obj``
        # would emit the generator object itself and then fall through into
        # the upsampling loop below.
        yield from sss_obj
        return

    for train_inds, valid_inds in sss_obj:
        yield (upsample_indices_clf(train_inds, y[train_inds]), valid_inds)
def cv_clf(x, y, test_size=0.2, n_splits=5, random_state=None, doesUpsample=True):
    """Generate cross-validation index pairs, optionally upsampling the training part.

    :param x: samples handed to the ``sss`` splitter.
    :param y: labels handed to the splitter; must support ``y[train_index]``.
    :param test_size: test share forwarded to ``sss``.
    :param n_splits: number of splits forwarded to ``sss``.
    :param random_state: seed forwarded to ``sss``.
    :param doesUpsample: when false, the splitter's pairs are yielded
        unchanged; when true, the training indices of every pair are replaced
        by ``upsample_indices_clf(train_index, y[train_index])``.
    :return: generator of ``(train_indices, test_indices)`` pairs.
    """
    # Alternative splitter (disabled):
    # splitter = TimeSeriesSplit(n_splits=n_splits, max_train_size=None).split(x)
    splitter = sss(n_splits=n_splits, test_size=test_size,
                   random_state=random_state).split(x, y)

    if not doesUpsample:
        # Hand out the plain splits one by one and stop. Yielding ``splitter``
        # itself would produce a single generator item and then continue
        # with the upsampled folds.
        yield from splitter
        return

    for train_index, test_index in splitter:
        yield (upsample_indices_clf(train_index, y[train_index]), test_index)
def cv_clf(x, y, test_size=0.2, n_splits=5, random_state=None, doesUpsample=True):
    """
    an iterator of cross-validation groups with optional upsampling

    :param x: samples handed to the ``sss`` splitter
    :param y: labels handed to the splitter; must support ``y[train_inds]``
    :param test_size: test share forwarded to ``sss``
    :param n_splits: number of splits forwarded to ``sss``
    :param random_state: seed forwarded to ``sss``
    :param doesUpsample: when false the splitter's pairs are yielded
        unchanged; when true the training indices of every pair are replaced
        by ``upsample_indices_clf(train_inds, y[train_inds])``
    :return: generator of ``(train_indices, valid_indices)`` pairs
    """
    # Keyword arguments: test_size is keyword-only in current scikit-learn.
    sss_obj = sss(n_splits=n_splits, test_size=test_size,
                  random_state=random_state).split(x, y)

    # no upsampling needed
    if not doesUpsample:
        # This function is a generator, so ``return sss_obj`` would end the
        # iteration without producing a single fold; delegate instead.
        yield from sss_obj
        return

    # with upsampling
    for train_inds, valid_inds in sss_obj:
        yield (upsample_indices_clf(train_inds, y[train_inds]), valid_inds)
def __init__(self, Cs=500, cv=10, sampler='skf', solver='liblinear', **kwargs):
    """Store the model settings and build the cross-validation splitter.

    :param Cs: stored unchanged on ``self.Cs``.
    :param cv: number of splits; stored on ``self.cv_folds`` and passed as
        ``n_splits`` to the chosen splitter, which ends up on ``self.cv``.
    :param sampler: splitter key: ``'skf'`` (stratified K-fold), ``'sss'``
        (stratified shuffle split), ``'kf'`` or ``'ss'`` (the respective
        non-stratified variants).
    :param solver: stored unchanged on ``self.solver``.
    :param kwargs: every entry is set as an attribute after the defaults, so
        it can override any of them (including ``penalty`` and ``cv``).
    :raises ValueError: if ``sampler`` is not one of the four keys above.
    """
    # Zero-argument super(): super(self.__class__, self) recurses forever as
    # soon as this class is subclassed.
    super().__init__()
    self.penalty = 'l1'
    self.solver = solver
    self.Cs = Cs
    self.sampler = sampler
    self.cv_folds = cv

    if self.sampler == 'skf':
        self.cv = skf(n_splits=self.cv_folds)
    elif self.sampler == 'sss':
        self.cv = sss(n_splits=self.cv_folds)
    elif self.sampler == 'kf':
        self.cv = kf(n_splits=self.cv_folds)
    elif self.sampler == 'ss':
        self.cv = ss(n_splits=self.cv_folds)
    else:
        # ValueError is still caught by callers handling ``Exception``.
        raise ValueError(
            'Selected sampler is not valid. Please choose '
            '"skf" for stratified K-fold or "sss" for '
            'stratified shuffle split. Also "kf" and "ss" for '
            'the respective non-stratified methods.')

    for k, v in kwargs.items():
        setattr(self, k, v)

    # Data placeholders; nothing is loaded at construction time.
    self.x = None
    self.y = None
# Correlation heatmap. Only numeric/bool columns are correlated: the string
# columns (Geography, Gender) are still present here and make a plain
# bank.corr() raise on current pandas.
sns.heatmap(bank.select_dtypes(include=['number', 'bool']).corr())

# Dummy-encode the categorical columns. drop_first=True drops one level per
# column to avoid the dummy-variable trap.
dummy = pd.get_dummies(bank.loc[:, ['Geography', 'Gender']], drop_first=True)
bank.drop(['Geography', 'Gender'], axis=1, inplace=True)
new_bank = pd.concat([bank, dummy], axis=1)
new_bank.head()
new_bank.dtypes

# The classes overlap completely, so KNN, RandomForest and xgboost are applied.
# Training and testing data: stratified sampling, because the target y is
# categorical.
from sklearn.model_selection import StratifiedShuffleSplit as sss
split = sss(n_splits=5, test_size=0.2, random_state=42)
# NOTE: every iteration overwrites the previous one, so only the last of the
# five splits is used below.
for train_index, test_index in split.split(new_bank, new_bank['Exited']):
    # The splitter yields positional indices, hence .iloc rather than the
    # label-based .loc (which is only equivalent for a default RangeIndex).
    bank_train = new_bank.iloc[train_index]
    bank_test = new_bank.iloc[test_index]

y = bank_train['Exited']
X = bank_train.drop(['Exited'], axis=1)
y_t = bank_test['Exited']
X_t = bank_test.drop(['Exited'], axis=1)

#----------------------------KNN----------------------------------------------
# KNN (k nearest neighbours): a new data point is classified by finding its
# k nearest neighbours and assigning it the majority class among them;
# ambiguous cases are decided by distance.
from sklearn.neighbors import KNeighborsClassifier as KNC
def cross_validation(training_data, kfolds, model, model_name, verbose=False):
    """Cross-validate ``model`` and report every metric as ``mean+/-std``.

    The ``response`` column of ``training_data`` is the target and all other
    columns are the features. For each of the ``kfolds`` splits produced by
    ``sss`` the model is fitted on the training part, predicts the validation
    part, and the predictions are scored with ``metric_scores``.

    :param training_data: DataFrame containing a ``response`` column.
    :param kfolds: number of splits passed to ``sss`` as ``n_splits``.
    :param model: estimator whose ``fit`` returns an object with ``predict``.
    :param model_name: value written to the ``Model name`` column.
    :param verbose: accepted for interface compatibility; not used.
    :return: one-row DataFrame (index ``[0]``) with ``Model name`` and one
        ``mean+/-std`` string (both rounded to 4 decimals) per metric.
    """
    # Keys read from the metric_scores() result; also the output column order.
    metric_names = (
        'accuracy',
        'balanced_accuracy',
        'precision',
        'precision_0',
        'recall',
        'specificity',
        'F1',
        'F1_weighted',
        'G_mean',
    )

    def summarize(values):
        # Render the per-fold values of one metric as "mean+/-std".
        average = np.round(np.mean(values), 4).astype(str)
        spread = np.round(np.std(values), 4).astype(str)
        return average + '+/-' + spread

    features = training_data.drop(['response'], axis=1)
    target = training_data.response
    splitter = sss(n_splits=kfolds)

    fold_scores = {metric: [] for metric in metric_names}
    for fit_idx, val_idx in splitter.split(features, target):
        fit_features, val_features = features.iloc[fit_idx], features.iloc[val_idx]
        fit_target, val_target = target.iloc[fit_idx], target.iloc[val_idx]

        fitted = model.fit(fit_features, fit_target)
        predictions = fitted.predict(val_features)

        score_table = metric_scores(val_target, predictions)
        for metric in metric_names:
            fold_scores[metric].append(score_table[metric])

    summary = {'Model name': model_name}
    for metric in metric_names:
        summary[metric] = summarize(fold_scores[metric])
    return pd.DataFrame(summary, index=[0])
# In[49]: pd.plotting.scatter_matrix(pokemon, figsize=(20, 20)) # In[50]: from sklearn.model_selection import StratifiedShuffleSplit as sss # In[51]: scaled_pokemon.__len__() # In[52]: k = sss(n_splits=1, test_size=0.2, train_size=0.8) train_idx, test_idx = list(tuple(k.split(scaled_pokemon, label))[0][0]), list( tuple(k.split(scaled_pokemon, label))[0][1]) # In[53]: pokemon_tensor = torch.utils.data.TensorDataset( torch.tensor(np.array(scaled_pokemon)), torch.tensor(np.array(label))) # In[54]: def split_data(datasets, train_idx, test_idxsamplers=torch.utils.data.SubsetRandomSampler,