Example #1
0
class DKULeavePGroupsOut(object):
    """Wrapper around sklearn's LeavePGroupsOut that reads the group label
    for each row from a named column of the feature matrix X instead of a
    separately supplied ``groups`` argument.

    ``set_column_labels`` must be called before ``get_n_splits``/``split``
    so the column name can be resolved to a positional index.
    """

    def __init__(self, column_name, p):
        # Name of the column in X that holds each row's group label.
        self.column_name = column_name
        self.splitter = LeavePGroupsOut(p)

    def set_column_labels(self, column_labels):
        # Ordered list of X's column names, used to resolve column_name.
        self.column_labels = column_labels

    def _extract_groups(self, X):
        """Return the group array taken from X's configured column.

        Raises
        ------
        Exception
            If ``column_name`` is not present in ``column_labels``.
        """
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            # Chain the original lookup failure for easier debugging.
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels)) from e
        return X[:, column_idx]

    def get_n_splits(self, X, y, groups=None):
        # `groups` is accepted for sklearn API compatibility but ignored;
        # the groups always come from the configured column of X.
        ret = self.splitter.get_n_splits(X, y, self._extract_groups(X))
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        # `groups` ignored — see get_n_splits.
        return self.splitter.split(X, y, self._extract_groups(X))
Example #2
0
    def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
        """Build hold-one-subject-out CV partitions via sklearn.

        Integer-encodes epitopes as the target vector and subjects as the
        grouping variable, then applies ``LeavePGroupsOut(n_groups=1)`` so
        each fold holds out all clones from exactly one subject.

        Parameters
        ----------
        tcrrep : TCRrep class instance, optional
            Must expose ``clone_df.subject``, ``clone_df.epitope`` and the
            precomputed matrix ``paired_tcrregex``; defaults to
            ``self.tcrrep``.

        Returns
        -------
        generator
            Yields (train_indices, test_indices) pairs from
            ``LeavePGroupsOut.split``.
        """
        if tcrrep is None:
            tcrrep = self.tcrrep

        # Map each unique epitope to an integer; transform gives `y`.
        epitope_encoder = preprocessing.LabelEncoder()
        epitope_encoder.fit(list(tcrrep.clone_df.epitope.unique()))
        y = epitope_encoder.transform(tcrrep.clone_df.epitope)

        # Precomputed pairwise matrix acts as `X` (metric='precomputed').
        X = tcrrep.paired_tcrregex

        # Map each unique subject to an integer; these are the CV groups.
        subject_encoder = preprocessing.LabelEncoder()
        subject_encoder = subject_encoder.fit(
            list(tcrrep.clone_df.subject.unique()))
        groups = list(subject_encoder.transform(tcrrep.clone_df.subject))

        # One subject (group) left out per fold.
        splitter = LeavePGroupsOut(n_groups=1)
        splitter.get_n_splits(X, y, groups)
        return splitter.split(X, y, groups)
Example #3
0
class LeavePSubjectsOut():
    """Leave-P-subjects-out cross-validator wrapping sklearn's
    LeavePGroupsOut, defaulting the group labels to the stored per-sample
    subject index array when the caller supplies none.
    """

    def __init__(self, subjects_indexes):
        # Per-sample subject labels, used as the default `groups`.
        self.subjects_indexes = subjects_indexes
        # NOTE(review): LeavePGroupsOut expects an integer n_groups; passing
        # the array of unique subject ids looks suspicious — confirm whether
        # len(np.unique(subjects_indexes)) (or a fixed P) was intended.
        self.splitter = LeavePGroupsOut(np.unique(subjects_indexes))

    def split(self, X=None, y=None, groups=None):
        # BUG FIX: was `groups == None`, which is an element-wise comparison
        # when `groups` is a numpy array and makes the truth test raise.
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        # Same identity check as in split() — see note there.
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
Example #4
0
class Splits():
    """Cross-validation split provider over per-sample subject indexes.

    Modes:
      * 'bootstrap'  — precomputed shuffle splits over the unique subjects;
      * 'groupkfold' — sklearn GroupKFold;
      * 'loso'       — leave-one-subject-out (LeaveOneGroupOut);
      * 'lpso'       — leave-P-subjects-out (LeavePGroupsOut).
    """

    def __init__(self, sub_indexes, train_size=0.33, n_splits=10, mode='loso'):
        # mode is 'bootstrap' or 'loso' (among others) — see create_splits.
        self.si = sub_indexes
        self.train_size = train_size
        self.n_splits = n_splits
        self.mode = mode
        self.create_splits()

    def create_splits(self, splits=None):
        """Prepare either a precomputed split list (bootstrap mode) or an
        sklearn splitter object, depending on ``self.mode``."""
        if self.mode == 'bootstrap':
            subject_ids = np.unique(self.si)

            shuffler = ShuffleSplit(n_splits=self.n_splits,
                                    test_size=1 - self.train_size)
            precomputed = []
            for tr_subj, te_subj in shuffler.split(subject_ids):
                tr_subj = subject_ids[tr_subj]
                te_subj = subject_ids[te_subj]
                # Map subject-level membership back to sample indices.
                tr_idx = np.nonzero([s in tr_subj for s in self.si])
                te_idx = np.nonzero([s in te_subj for s in self.si])
                precomputed.append((tr_idx, te_idx))
            self.splits = precomputed
            # Sentinel: no splitter object means "use self.splits".
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(
                n_groups=len(np.unique(self.si)) // self.n_splits)

    def get_n_splits(self, X=None, y=None, groups=None):
        # Bootstrap mode has no splitter; its fold count is fixed up front.
        if not self.splitter:
            return self.n_splits
        return self.splitter.get_n_splits(X, y, groups)

    def split(self, X=None, y=None, groups=None):
        # Yield (train_indices, test_indices) pairs from whichever source
        # create_splits prepared.
        if self.splitter:
            yield from self.splitter.split(X, y, groups)
        else:
            yield from self.splits
    # NOTE(review): this fragment's enclosing function/loop header is outside
    # this chunk; it mirrors the standalone training loop later in the file.
    # Leave N_GROUPS speaker-groups out of training for each CV fold.
    lpgo = LeavePGroupsOut(
        n_groups=N_GROUPS)  # Number of groups to leave out in the test split.
    groups = np.array(labels_df_filtered['speakers'])

    cvscores = []
    # Generator of (train_indices, val_indices) pairs, one per fold.
    gen = lpgo.split(X_reshape, y, groups)
    for num, indices in islice(
            enumerate(gen, 1),  # index from 1 instead of 0
            START_SPEAKER,
            None):  # skip the first START_SPEAKER folds
        train_idx = indices[0]
        val_idx = indices[1]

        print('   ===== Fitting CV {} out of {} ====='.format(
            num, lpgo.get_n_splits(groups=groups)))
        print("     TRAIN:", np.unique(groups[train_idx]))
        print("       VAL:", np.unique(groups[val_idx]))

        train_X = X_reshape[train_idx]
        train_y = y.iloc[train_idx]

        val_X = X_reshape[val_idx]
        val_y = y.iloc[val_idx]

        # Integer-encode the fold's labels; sort=True keeps codes stable
        # across folds with the same label set.
        train_y = train_y.factorize(sort=True)[0]
        val_y = val_y.factorize(sort=True)[0]

        train_weight = get_sample_weight(train_y)
        val_weight = get_sample_weight(val_y)
# With START_SPEAKER = 0, islice(..., START_SPEAKER, None) skips nothing and
# every fold is processed.
START_SPEAKER = 0
loop = []
for i in range(N_TRAIN):
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(i+1, N_TRAIN))

    # Leave N_GROUPS speaker-groups out of training for each CV fold.
    lpgo = LeavePGroupsOut(n_groups=N_GROUPS) #Number of groups to leave out in the test split.
    groups = np.array(labels_df_filtered['speakers'])

    cvscores = []
    gen = lpgo.split(X_reshape, y, groups)
    for num, indices in islice(enumerate(gen,1), # index from 1 instead of 0
                               START_SPEAKER, None): # skip the first START_SPEAKER folds
        train_idx = indices[0]
        val_idx = indices[1]

        print('   ===== Fitting CV {} out of {} ====='.format(num, lpgo.get_n_splits(groups=groups)))
        print("     TRAIN:", np.unique(groups[train_idx]))
        print("       VAL:", np.unique(groups[val_idx]))


        train_X = X_reshape[train_idx]
        train_y = y.iloc[train_idx]

        val_X = X_reshape[val_idx]
        val_y = y.iloc[val_idx]


        # Integer-encode the fold's labels; sort=True keeps codes stable.
        train_y = train_y.factorize(sort=True)[0]
        val_y = val_y.factorize(sort=True)[0]

        train_weight = get_sample_weight(train_y)
        # NOTE(review): this block appears truncated here — the parallel
        # fragment above also computes val_weight = get_sample_weight(val_y).
Example #7
0
# Nested cross-validation: the outer loop leaves two studies out per fold,
# the inner loop tunes hyper-parameters with GridSearchCV.
outer_cv = LeavePGroupsOut(2)  # leave-two-study-out
inner_cv = LeaveOneOut(
)  # NOTE(review): the original comment claimed "30-fold quasi-balanced
# splits", but LeaveOneOut performs plain leave-one-out — confirm which
# inner CV scheme was intended.
clf = GridSearchCV(estimator=model,
                   param_grid=p_grid,
                   cv=inner_cv,
                   scoring="neg_mean_squared_error",
                   verbose=True,
                   return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
# One slot per outer fold for train/test scores.
nested_scores_train = np.zeros(outer_cv.get_n_splits(X, groups=df.study))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X, groups=df.study))

# ("vc" in the header string below is a typo in the original output format.)
print("model\tinner_cv mean score\touter vc score")
i = 0  # NOTE(review): never incremented in the visible loop body — verify.
for train, test in outer_cv.split(X, y, groups=df.study):
    print(test)
    # Restrict group labels to the outer-training rows so the inner search
    # can also split by group.
    group_train = groups[train]
    clf.fit(X[train], y[train], groups=group_train)

    print(
        str(clf.best_params_) + " " + str(clf.best_score_) + " " +
        str(clf.score(X[test], y[test])))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
#  [ 0.74285061  1.46351659]
#  [ 2.49913075  1.23133799]]
# y
# [1 0 2 0 0 1 1 2 0 2 2 1]

# --------------------------------------------------------------------------------
# Suppose example data is collected from the following group distribution

groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

# --------------------------------------------------------------------------------
# Configure model: every combination of 2 groups is held out as a test set.

leave_p_group_out = LeavePGroupsOut(n_groups=2)

leave_p_group_out_get_n_splits = leave_p_group_out.get_n_splits(X, y, groups)
# print('leave_p_group_out_get_n_splits', leave_p_group_out_get_n_splits)
# With 4 distinct groups and p=2 this yields C(4, 2) = 6 splits.

# --------------------------------------------------------------------------------
for train, test in leave_p_group_out.split(X, y, groups):

    print('train', train.shape)
    print('test', test.shape)

    # BUG FIX: the label said X[test] but the original printed X[train];
    # print the test rows to match the label and the y[test] line below.
    print('X[test]\n', X[test])
    print('y[test]\n', y[test])
    print('train groups', np.array(groups)[train])
    print('test groups', np.array(groups)[test])
    print('')