class DKULeavePGroupsOut(object):
    """Leave-P-groups-out cross-validation splitter whose group labels are
    taken from a named column of X.

    Wraps sklearn's ``LeavePGroupsOut``: for each call, the group array is
    extracted from the column of X whose label equals ``column_name``
    (located via the labels registered with :meth:`set_column_labels`).
    """

    def __init__(self, column_name, p):
        # Name of the column of X that holds each row's group label.
        self.column_name = column_name
        # Underlying sklearn splitter; p = number of groups left out per split.
        self.splitter = LeavePGroupsOut(p)

    def set_column_labels(self, column_labels):
        """Register the ordered column labels of X (used to locate the
        group column by name)."""
        self.column_labels = column_labels

    def _extract_groups(self, X):
        """Return the group array, i.e. the configured column of X.

        Raises
        ------
        ValueError
            If ``column_name`` is not among the registered column labels.
            (ValueError subclasses Exception, so existing broad handlers
            still catch it.)
        """
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError:
            raise ValueError(
                "Column %s not found among %s"
                % (self.column_name, self.column_labels))
        return X[:, column_idx]

    def get_n_splits(self, X, y, groups=None):
        """Return the number of splits; ``groups`` is ignored and derived
        from the configured column of X instead."""
        groups_array = self._extract_groups(X)
        ret = self.splitter.get_n_splits(X, y, groups_array)
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        """Yield (train_indices, test_indices) pairs; ``groups`` is ignored
        and derived from the configured column of X instead."""
        groups_array = self._extract_groups(X)
        return self.splitter.split(X, y, groups_array)
def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
    """Return a generator of train/test indices for hold-subject-out
    cross-validation, built on sklearn's LeavePGroupsOut.

    Parameters
    ----------
    tcrrep : TCRrep class instance
        TCRrep class instance, with TCRrep.clone_df.subject and
        TCRrep.clone_df.epitope fields. Defaults to ``self.tcrrep``.

    Returns
    -------
    partitions : generator object
        BaseCrossValidator.split from sklearn
    """
    if tcrrep is None:
        tcrrep = self.tcrrep

    # Map each unique epitope to an integer to form the target vector y.
    epitope_encoder = preprocessing.LabelEncoder()
    epitope_encoder.fit(list(tcrrep.clone_df.epitope.unique()))
    y = epitope_encoder.transform(tcrrep.clone_df.epitope)

    # Precomputed pairwise matrix acts as X (metric = 'precomputed').
    X = tcrrep.paired_tcrregex

    # Map each unique subject to an integer group id for the CV split.
    subject_encoder = preprocessing.LabelEncoder().fit(
        list(tcrrep.clone_df.subject.unique()))
    groups = list(subject_encoder.transform(tcrrep.clone_df.subject))

    # Hold one subject (group) out per split.
    splitter = LeavePGroupsOut(n_groups=1)
    splitter.get_n_splits(X, y, groups)
    return splitter.split(X, y, groups)
class LeavePSubjectsOut():
    """Cross-validation splitter that leaves P subjects out, delegating to
    sklearn's ``LeavePGroupsOut`` with the stored subject indexes as the
    default group labels.
    """

    def __init__(self, subjects_indexes):
        # Per-sample subject id; used as the default `groups` argument.
        self.subjects_indexes = subjects_indexes
        # NOTE(review): sklearn's LeavePGroupsOut expects an *integer*
        # n_groups, but an array of unique subject ids is passed here —
        # confirm intent (perhaps a fixed P or len(np.unique(...)) - 1
        # was meant).
        self.splitter = LeavePGroupsOut(np.unique(subjects_indexes))

    def split(self, X=None, y=None, groups=None):
        """Yield (train, test) index pairs; defaults groups to the stored
        subject indexes."""
        # `is None`, not `== None`: comparing an ndarray with == is
        # elementwise and would raise "truth value is ambiguous".
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; defaults groups to the stored
        subject indexes."""
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
class Splits():
    """Factory for cross-validation splits over subject indexes.

    Supported modes: 'bootstrap' (subject-level ShuffleSplit),
    'groupkfold', 'loso' (leave-one-subject-out), and 'lpso'
    (leave-P-subjects-out).
    """

    def __init__(self, sub_indexes, train_size=0.33, n_splits=10,
                 mode='loso'):
        # mode: 'bootstrap' or 'loso' (also 'groupkfold', 'lpso')
        self.si = sub_indexes
        self.train_size = train_size
        self.n_splits = n_splits
        self.mode = mode
        self.create_splits()

    def create_splits(self, splits=None):
        """Build either precomputed splits ('bootstrap') or an sklearn
        splitter (all other modes)."""
        if self.mode == 'bootstrap':
            subjects = np.unique(self.si)
            shuffler = ShuffleSplit(n_splits=self.n_splits,
                                    test_size=1 - self.train_size)
            splits = []
            for tr, te in shuffler.split(subjects):
                # Map subject-level selections back to sample indices.
                tr_subjects = subjects[tr]
                te_subjects = subjects[te]
                tr_idx = np.nonzero(np.isin(self.si, tr_subjects))
                te_idx = np.nonzero(np.isin(self.si, te_subjects))
                splits.append((tr_idx, te_idx))
            self.splits = splits
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(
                n_groups=len(np.unique(self.si)) // self.n_splits)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits (delegated when a splitter exists)."""
        if self.splitter:
            return self.splitter.get_n_splits(X, y, groups)
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield (train, test) index pairs from the splitter or the
        precomputed bootstrap splits."""
        if self.splitter:
            yield from self.splitter.split(X, y, groups)
        else:
            yield from self.splits
lpgo = LeavePGroupsOut( n_groups=N_GROUPS) #Number of groups to leave out in the test split. groups = np.array(labels_df_filtered['speakers']) cvscores = [] gen = lpgo.split(X_reshape, y, groups) for num, indices in islice( enumerate(gen, 1), # index from 1 instead of 0 START_SPEAKER, None): # loop from 5th onwards train_idx = indices[0] val_idx = indices[1] print(' ===== Fitting CV {} out of {} ====='.format( num, lpgo.get_n_splits(groups=groups))) print(" TRAIN:", np.unique(groups[train_idx])) print(" VAL:", np.unique(groups[val_idx])) train_X = X_reshape[train_idx] train_y = y.iloc[train_idx] val_X = X_reshape[val_idx] val_y = y.iloc[val_idx] train_y = train_y.factorize(sort=True)[0] val_y = val_y.factorize(sort=True)[0] train_weight = get_sample_weight(train_y) val_weight = get_sample_weight(val_y)
# Repeat the full leave-P-speakers-out CV procedure N_TRAIN times.
START_SPEAKER = 0
loop = []
for i in range(N_TRAIN):
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(
        i + 1, N_TRAIN))
    lpgo = LeavePGroupsOut(
        n_groups=N_GROUPS)  # Number of groups to leave out in the test split.
    # Per-sample speaker labels define the CV groups.
    groups = np.array(labels_df_filtered['speakers'])
    cvscores = []
    gen = lpgo.split(X_reshape, y, groups)
    for num, indices in islice(
            enumerate(gen, 1),  # index from 1 instead of 0
            START_SPEAKER, None):  # skip the first START_SPEAKER splits
        # indices is the (train, validation) index pair from lpgo.split.
        train_idx = indices[0]
        val_idx = indices[1]
        print(' ===== Fitting CV {} out of {} ====='.format(
            num, lpgo.get_n_splits(groups=groups)))
        print(" TRAIN:", np.unique(groups[train_idx]))
        print(" VAL:", np.unique(groups[val_idx]))
        train_X = X_reshape[train_idx]
        train_y = y.iloc[train_idx]
        val_X = X_reshape[val_idx]
        val_y = y.iloc[val_idx]
        # Convert labels to integer codes; sort=True keeps the coding
        # consistent between the train and validation partitions.
        train_y = train_y.factorize(sort=True)[0]
        val_y = val_y.factorize(sort=True)[0]
        # Per-sample weights to compensate for class imbalance.
        train_weight = get_sample_weight(train_y)
outer_cv = LeavePGroupsOut(2) # Leave-two-sudy-out inner_cv = LeaveOneOut( ) # do 30-fold quasi-balanced splits within the other two studies for hyperparam optimization. clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring="neg_mean_squared_error", verbose=True, return_train_score=False, n_jobs=-1) all_models = [] best_params = [] predicted = np.zeros(len(y)) nested_scores_train = np.zeros(outer_cv.get_n_splits(X, groups=df.study)) nested_scores_test = np.zeros(outer_cv.get_n_splits(X, groups=df.study)) print("model\tinner_cv mean score\touter vc score") i = 0 for train, test in outer_cv.split(X, y, groups=df.study): print(test) group_train = groups[train] clf.fit(X[train], y[train], groups=group_train) print( str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X[test], y[test]))) all_models.append(clf.best_estimator_) best_params.append(clf.best_params_)
# [ 0.74285061 1.46351659]
# [ 2.49913075 1.23133799]]
# y
# [1 0 2 0 0 1 1 2 0 2 2 1]
# --------------------------------------------------------------------------------
# Suppose the example data is collected from the following group distribution
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
# --------------------------------------------------------------------------------
# Configure model: every combination of 2 of the 4 groups is held out as
# the test set, giving C(4, 2) = 6 splits.
leave_p_group_out = LeavePGroupsOut(n_groups=2)
leave_p_group_out_get_n_splits = leave_p_group_out.get_n_splits(X, y, groups)
# print('leave_p_group_out_get_n_splits',leave_p_group_out_get_n_splits)
# leave_p_group_out_get_n_splits 6
# --------------------------------------------------------------------------------
for train, test in leave_p_group_out.split(X, y, groups):
    print('train', train.shape)
    print('test', test.shape)
    # Fixed: the label says X[test] but the original printed X[train],
    # mismatching the parallel y[test] line below.
    print('X[test]\n', X[test])
    print('y[test]\n', y[test])
    print('train groups', np.array(groups)[train])
    print('test groups', np.array(groups)[test])
    print('')