def _trn_val_split(self, split_type, split_num, cell_type):
        trn_df = pd.read_csv('./mnt/inputs/origin/train.csv.zip')

        if split_type == 'gkf':
            fold = gkf(split_num).split(trn_df['id_code'], trn_df['sirna'],
                                        trn_df['well'])
        elif split_type == 'skf':
            fold = skf(split_num, shuffle=True, random_state=71)\
                .split(trn_df['id_code'], trn_df['sirna'])
        elif split_type == 'cskf':
            fold = cskf(trn_df,
                        trn_df['sirna'],
                        split_num,
                        shuffle=True,
                        random_state=71)
        else:
            raise Exception(f'invalid split type: {split_type}')
        if cell_type not in ['ALL', 'HEPG2', 'U2OS', 'HUVEC', 'RPE']:
            raise Exception(f'invalid cell type: {cell_type}')
        # the fold generator is consumed only for its first split
        for trn_idx, val_idx in fold:
            if cell_type != 'ALL':
                _trn_df = trn_df.iloc[trn_idx]
                trn_ids = _trn_df[_trn_df.experiment.str.contains(
                    cell_type)].id_code
                _val_df = trn_df.iloc[val_idx]
                val_ids = _val_df[_val_df.experiment.str.contains(
                    cell_type)].id_code
            else:
                trn_ids = trn_df.iloc[trn_idx].id_code
                val_ids = trn_df.iloc[val_idx].id_code
            break
        return trn_ids, val_ids
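
# The short aliases used above are not imported in this snippet. A minimal
# sketch of the assumed bindings: skf is imported as StratifiedKFold elsewhere
# in these examples, gkf is presumably GroupKFold given the (X, y, groups)
# call signature, and cskf is presumably the CellwiseStratifiedKFold helper
# shown in Example #4 below.
import pandas as pd
from sklearn.model_selection import GroupKFold as gkf
from sklearn.model_selection import StratifiedKFold as skf
# cskf would be bound to CellwiseStratifiedKFold from wherever it lives in the
# original project, e.g. (hypothetical path):
# from folds import CellwiseStratifiedKFold as cskf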
Example #2
def perform_grid_search(train_csv_path, headers, num_heroes):
    df = pd.read_csv(train_csv_path, names=headers, nrows=10000)
    print('Number of observations in the training data:', len(df))

    enhanced_features = enhance_features(headers, df, None, num_heroes)
    combined_features = enhanced_features + headers[1:4]

    tuned_parameters = {
        'n_estimators': [50, 100],
        'max_depth': [6, 8],
        'subsample': [0.5],
        'learning_rate': [0.01, 0.05]
    }
    splitter = skf(5, shuffle=True, random_state=0)
    clf = gscv(gdc(), tuned_parameters, cv=splitter, n_jobs=-1)
    clf.fit(df[combined_features], df['score'])

    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # sort by mean score (the dicts in cv_results_['params'] are not orderable,
    # so an explicit key avoids comparing them on ties)
    for mean, std, params in sorted(zip(means, stds,
                                        clf.cv_results_['params']),
                                    key=lambda t: (t[0], t[1])):
        print("%0.4f (+/-%0.4f) for %r" % (mean, std * 2, params))

    print()
    print('Best score: ' + '\x1b[1;33;40m', clf.best_score_, '\x1b[0m')
    print('Best parameters set found on development set:')
    print()
    print(clf.best_params_)
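
# As in the other snippets, the short aliases are not imported here. A minimal
# sketch of the assumed bindings: gscv matches GridSearchCV's call signature,
# and gdc is presumably GradientBoostingClassifier, which accepts the
# n_estimators / max_depth / subsample / learning_rate grid used above.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier as gdc
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.model_selection import StratifiedKFold as skf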
Example #3
def split(self, x, y, group=None):
    if self.split_type == 'skf':
        if group is not None:
            self.logger.warning('the group is set for skf, '
                                'which is not used.')
        fold = skf(self.split_num,
                   shuffle=self.shuffle,
                   random_state=self.random_state).split(x, y)
    elif self.split_type == 'gkf':
        fold = gkf(self.split_num).split(x, y, group)
    elif self.split_type == 'abhishek5':
        fold = []
        fold_df = pd.read_csv(self.abhishek5)
        for i in range(5):
            fold.append((
                fold_df.query(f'kfold != {i}').index.tolist(),
                fold_df.query(f'kfold == {i}').index.tolist(),
            ))
    elif self.split_type == 'abhishek8':
        fold = []
        fold_df = pd.read_csv(self.abhishek8)
        for i in range(8):
            fold.append((
                fold_df.query(f'kfold != {i}').index.tolist(),
                fold_df.query(f'kfold == {i}').index.tolist(),
            ))
    else:
        raise NotImplementedError(f'split_type: {self.split_type}')
    return fold
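
# A minimal sketch of the precomputed-fold branch above: the 'abhishek5' and
# 'abhishek8' attributes are assumed to be paths to CSVs with a 'kfold' column
# assigning each row to a fold index (the assignment below is made up).
import pandas as pd

fold_df = pd.DataFrame({'kfold': [0, 1, 2, 3, 4] * 4})
fold = [(fold_df.query(f'kfold != {i}').index.tolist(),
         fold_df.query(f'kfold == {i}').index.tolist())
        for i in range(5)]
print([len(val) for _, val in fold])  # 4 validation rows per fold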
Example #4
def CellwiseStratifiedKFold(X_df,
                            y,
                            n_splits=5,
                            shuffle=False,
                            random_state=71):
    # group rows by cell line, the prefix of the experiment id ('HEPG2-01' -> 'HEPG2')
    cells = X_df.experiment.apply(lambda x: x.split('-')[0])
    cell_folds = []
    whole_index = np.arange(len(X_df))
    cell_whole_indexes = []
    for cell in np.unique(cells):
        cell_df = X_df[cells == cell]
        cell_whole_index = whole_index[cells == cell]
        cell_whole_indexes.append(cell_whole_index)
        cell_y = y[cells == cell]
        # if the rarest class in this cell line has fewer members than n_splits,
        # shrink the fold count so StratifiedKFold does not raise
        if cell_y.value_counts().min() < n_splits:
            cell_fold = skf(n_splits=int(cell_y.value_counts().min()),
                            shuffle=shuffle,
                            random_state=random_state).split(cell_df, cell_y)
        else:
            cell_fold = skf(n_splits=n_splits,
                            shuffle=shuffle,
                            random_state=random_state).split(cell_df, cell_y)
        cell_folds.append(cell_fold)

    fold = [[[], []] for i in range(n_splits)]
    for cell_whole_index, cell_fold in zip(cell_whole_indexes, cell_folds):
        for i, (trn_idx, val_idx) in enumerate(cell_fold):
            # only the first three folds contributed by each cell line are kept
            if i > 2:
                break
            fold[i][0].append(cell_whole_index[trn_idx])
            fold[i][1].append(cell_whole_index[val_idx])
    for i, _ in enumerate(fold):
        if len(fold[i][0]) > 0:
            fold[i][0] = np.concatenate(fold[i][0])
            fold[i][1] = np.concatenate(fold[i][1])

    return fold
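
# A quick, self-contained smoke test of CellwiseStratifiedKFold, assuming skf
# is sklearn's StratifiedKFold and that X_df carries an 'experiment' column of
# the form '<CELLTYPE>-<nn>' (the toy data below is made up for illustration).
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold as skf

X_df = pd.DataFrame({
    'experiment': ['HEPG2-01'] * 8 + ['U2OS-01'] * 8,
    'id_code': [f'id_{i}' for i in range(16)],
})
y = pd.Series([0, 0, 1, 1, 2, 2, 3, 3] * 2)  # every class seen twice per cell line

folds = CellwiseStratifiedKFold(X_df, y, n_splits=2, shuffle=True, random_state=71)
for k, (trn_idx, val_idx) in enumerate(folds):
    # each fold's indices are stratified within each cell line
    print(k, len(trn_idx), len(val_idx))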
Example #5
def test(emb, label_mat, emb_IDmap, label_IDmap, n_splits, random_state,
         shuffle):
    """Test embedding performance

     Perform node classification using L2 regularized Logistic Regression 
     with 5-Fold Cross Validation

    """
    n_classes = label_mat.shape[1]
    label_IDs = list(label_IDmap)
    emb_idx = [emb_IDmap[ID] for ID in label_IDs]
    x = emb[emb_idx]

    splitter = skf(n_splits=n_splits,
                   random_state=random_state,
                   shuffle=shuffle)
    mdl = LogReg(penalty='l2', solver='lbfgs', warm_start=False, max_iter=1000)

    y_true_all = []
    y_pred_all = []

    for i in range(n_classes):
        y = label_mat[:, i]
        label = i + 1

        y_true = np.array([], dtype=bool)
        y_pred = np.array([])

        # only the split indices are needed, so y stands in for X here
        for j, (train, test) in enumerate(splitter.split(y, y)):
            print("Testing class #{:>4d},\tfold {:>2d} / {:<2d}".format(
                label, j + 1, n_splits),
                  flush=True,
                  end='\r')
            mdl.fit(x[train], y[train])

            y_true = np.append(y_true, y[test])
            y_pred = np.append(y_pred, mdl.decision_function(x[test]))

        y_true_all.append(y_true)
        y_pred_all.append(y_pred)

    print('')

    return y_true_all, y_pred_all
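
# Imports this snippet relies on, under the assumption that skf is sklearn's
# StratifiedKFold and LogReg is sklearn's LogisticRegression (both match the
# constructor arguments used above).
import numpy as np
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.model_selection import StratifiedKFold as skf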
Example #6
def _trn_val_split(self, split_type, split_num):
    trn_df = pd.read_csv('./mnt/inputs/origin/train.csv.zip')
    if split_type == 'gkf':
        fold = gkf(split_num).split(trn_df['id_code'], trn_df['sirna'],
                                    trn_df['well'])
    elif split_type == 'skf':
        fold = skf(split_num, shuffle=True, random_state=71)\
            .split(trn_df['id_code'], trn_df['sirna'])
    elif split_type == 'cskf':
        fold = cskf(trn_df,
                    trn_df['sirna'],
                    split_num,
                    shuffle=True,
                    random_state=71)
    else:
        raise Exception(f'invalid split type: {split_type}')
    for trn_idx, val_idx in fold:
        trn_ids = trn_df.iloc[trn_idx].id_code
        val_ids = trn_df.iloc[val_idx].id_code
        break
    return trn_ids, val_ids
Example #7
    def __init__(self,
                 Cs=500,
                 cv=10,
                 sampler='skf',
                 solver='liblinear',
                 **kwargs):

        super().__init__()

        self.penalty = 'l1'
        self.solver = solver
        self.Cs = Cs
        self.sampler = sampler
        self.cv_folds = cv

        if self.sampler == 'skf':
            self.cv = skf(n_splits=self.cv_folds)

        elif self.sampler == 'sss':
            self.cv = sss(n_splits=self.cv_folds)

        elif self.sampler == 'kf':
            self.cv = kf(n_splits=self.cv_folds)

        elif self.sampler == 'ss':
            self.cv = ss(n_splits=self.cv_folds)

        else:
            raise Exception(
                'Selected sampler is not valid. Please choose '
                '"skf" for stratified K-fold or "sss" for '
                'stratified shuffle split, or "kf" and "ss" for '
                'the respective non-stratified methods.')

        for k, v in kwargs.items():
            setattr(self, k, v)

        self.x = None
        self.y = None
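
# The four sampler aliases are not imported in this excerpt. Based on the
# error message above, they are assumed to map onto sklearn's splitters:
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import StratifiedShuffleSplit as sss
from sklearn.model_selection import KFold as kf
from sklearn.model_selection import ShuffleSplit as ss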
Example #8
                                     criterion='entropy',
                                     max_depth=20,
                                     max_leaf_nodes=50,
                                     n_jobs=-1,
                                     random_state=1)
final_model.fit(x_train, y_train)
y_pred = final_model.predict(x_test)

# stratified k-fold cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold as skf
score = cross_val_score(estimator=final_model,
                        scoring='f1',
                        X=x_train,
                        y=y_train,
                        cv=skf(n_splits=10))
f1 = score.mean()
#############################################################################
# grid search
from sklearn.model_selection import GridSearchCV

parameters = [{
    'n_estimators': [50, 100, 150, 250],
    'max_depth': [10, 20, 40],
    'max_features': (5, 10, 20, 30),
    'max_leaf_nodes': [20, 40, 60]
}]

grid_search = GridSearchCV(iid=False,
                           estimator=final_model,
                           param_grid=parameters,
Example #9
def use_splitted_data_set(train, combined_features):
    splitter = skf(5, shuffle=True, random_state=0)
    sp_iter = splitter.split(train[combined_features], train['score'])
    chosen_index, _ = next(sp_iter)
    return train.iloc[chosen_index]
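
# A minimal usage sketch, assuming skf is sklearn's StratifiedKFold and that
# `train` is a DataFrame with the feature columns plus a 'score' target column
# (the column names below are only illustrative).
import pandas as pd
from sklearn.model_selection import StratifiedKFold as skf

train = pd.DataFrame({'feat_a': range(10), 'feat_b': range(10),
                      'score': [0, 1] * 5})
subset = use_splitted_data_set(train, ['feat_a', 'feat_b'])
# with 5 splits, the first fold's training part keeps roughly 80% of the rows
print(len(subset), 'of', len(train))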
Example #10
                               max_leaf_nodes=50,
                               n_jobs=-1,
                               random_state=1)
model.fit(X1, y1)
y1_pred = model.predict(X1_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# confusion matrix on the training split; accuracy and F1 on the held-out split
cm = confusion_matrix(y1, model.predict(X1))
accuracy_0 = accuracy_score(y1_test, y1_pred)
f1_0 = f1_score(y1_test, y1_pred)

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold as skf

score_base = cross_val_score(estimator=model, X=X1, y=y1, cv=skf(n_splits=20))
accuracy_base = score_base.mean()

pred_0 = model.predict(x_pred)
output_0 = pd.DataFrame({
    'PassengerId': da_test.PassengerId,
    'Survived': pred_0
})
output_0.to_csv('my_submission_base.csv', index=False)

##### (SVC) #####
from sklearn.svm import SVC

classifier_svc = SVC(kernel='rbf', gamma='auto')
classifier_svc.fit(x_train, y_train)
y_pred_svc = classifier_svc.predict(x_test)