class DKULeavePGroupsOut(object): def __init__(self, column_name, p): self.column_name = column_name self.splitter = LeavePGroupsOut(p) pass def set_column_labels(self, column_labels): self.column_labels = column_labels def get_n_splits(self, X, y, groups=None): try: column_idx = self.column_labels.index(self.column_name) except ValueError as e: raise Exception("Column %s not found among %s" % (self.column_name, self.column_labels)) groups_array = X[:, column_idx] ret = self.splitter.get_n_splits(X, y, groups_array) print("Will use %s splits" % ret) return ret def split(self, X, y, groups=None): try: column_idx = self.column_labels.index(self.column_name) except ValueError as e: raise Exception("Column %s not found among %s" % (self.column_name, self.column_labels)) groups_array = X[:, column_idx] return self.splitter.split(X, y, groups_array)
def cross_validation(X, y, pre_x, groups, model='LGB', test_days=1):
    """Blend per-fold predictions for ``pre_x``, weighted by inverse fold loss.

    Group ids are first merged pairwise with ``floor((groups + 1) / 2)``.
    Each leave-``test_days``-groups-out fold then trains a model and
    predicts ``pre_x``; the fold predictions are averaged with weights
    proportional to ``1 / loss``.

    :param X: feature matrix, indexed with integer index arrays.
    :param y: targets, indexed with integer index arrays.
    :param pre_x: samples to predict; ``pre_x.shape[0]`` sizes the result.
    :param groups: numeric group id per row of ``X``.
    :param model: ``'LGB'`` trains with ``LGB``; any other value uses ``LR``.
    :param test_days: number of (merged) groups held out per fold.
    :return: array of length ``pre_x.shape[0]`` with the blended prediction.

    Every fold loss must be non-zero, since the weights are ``1.0 / loss``.
    """
    groups = np.floor((groups + 1) / 2)
    logo = LeavePGroupsOut(n_groups=test_days)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(np.isnan(groups).astype(int).sum())
    print(np.unique(groups))

    # Only the selected trainer name is looked up.
    fit_fold = LGB if model == 'LGB' else LR

    fold_predictions = []
    fold_losses = []
    for fold, (train, test) in enumerate(logo.split(X, y, groups=groups), 1):
        print('times: %s' % fold)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print(' '.join(str(part.shape)
                       for part in (X_train, X_test, y_train, y_test)))
        pre, ll = fit_fold(X_train, X_test, y_train, y_test, pre_x)
        fold_losses.append(ll)
        fold_predictions.append(pre)

    # Lower loss -> larger weight; weights are normalised by their sum.
    weight = [1.0 / loss for loss in fold_losses]
    weight_sum = sum(weight)

    pre_sum = np.zeros(pre_x.shape[0])
    for pre, fold_weight in zip(fold_predictions, weight):
        pre_sum += pre * fold_weight / weight_sum

    print('weight %s' % (weight,))
    print('loss %s' % (fold_losses,))
    return pre_sum
def tmpFUN(dataset, group_label="groups", n_groups=2, y_label="groups",
           rf_n_estimators=2000, n_jobs=-1):
    """Fit a random forest on every leave-P-groups-out fold of ``dataset``.

    :param dataset: pandas DataFrame holding the features, the target
        column and the grouping column.
    :param group_label: column whose values define the groups.
    :param n_groups: number of groups held out per fold.
    :param y_label: target column; it is dropped from the features.
    :param rf_n_estimators: number of trees per forest.
    :param n_jobs: parallelism passed to ``RandomForestClassifier``.
    :return: list with one ``(validate_index, predictions)`` pair per fold,
        where ``validate_index`` holds the positional row indices held out.
    """
    lpgo = LeavePGroupsOut(n_groups=n_groups)
    fold_predictions = []
    for train_index, validate_index in lpgo.split(
            X=dataset,
            y=dataset.loc[:, y_label],
            groups=dataset.loc[:, group_label]):
        trainset = dataset.iloc[train_index, :]
        validateset = dataset.iloc[validate_index, :]

        X_train = trainset.drop(y_label, axis=1)
        y_train = trainset.loc[:, y_label]
        # The original predicted on an undefined ``X_test``; the held-out
        # rows need the same feature columns as the training rows.
        X_validate = validateset.drop(y_label, axis=1)

        RF_mod = RandomForestClassifier(n_estimators=rf_n_estimators,
                                        n_jobs=n_jobs,
                                        class_weight="balanced")
        RF_mod.fit(X_train, y_train)
        RF_pred = RF_mod.predict(X_validate)
        fold_predictions.append((validate_index, RF_pred))
    return fold_predictions
def create_cv(x, y, subjects, P):
    """Materialise every leave-P-groups-out split as a list.

    :param x: samples handed to the splitter.
    :param y: targets handed to the splitter.
    :param subjects: group label of each sample.
    :param P: number of groups left out per split.
    :return: list of ``(train_index, test_index)`` pairs.
    """
    splitter = LeavePGroupsOut(n_groups=P)
    return list(splitter.split(x, y, subjects))
class LeavePSubjectsOut():
    """Leave-P-groups-out splitter with a default per-sample subject labelling.

    The subject index of every sample is given once at construction and is
    used as ``groups`` whenever ``split`` / ``get_n_splits`` are called
    without explicit groups.
    """

    def __init__(self, subjects_indexes, p=1):
        """
        :param subjects_indexes: subject (group) label of each sample.
        :param p: number of subjects held out per split.  The original code
            passed the array of unique subject ids as ``n_groups``, where
            ``LeavePGroupsOut`` expects an integer count.
        """
        self.subjects_indexes = subjects_indexes
        self.splitter = LeavePGroupsOut(n_groups=p)

    def _resolve_groups(self, groups):
        """Fall back to the stored subject labels when no groups are given."""
        # ``is None`` rather than ``== None``: comparing an array with ``==``
        # is elementwise and has no single truth value.
        return self.subjects_indexes if groups is None else groups

    def split(self, X=None, y=None, groups=None):
        """Return the wrapped splitter's (train, test) index iterator."""
        return self.splitter.split(X, y, self._resolve_groups(groups))

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits the wrapped splitter will produce."""
        return self.splitter.get_n_splits(X, y, self._resolve_groups(groups))
class Splits():
    """Subject-aware cross-validation splitter with several strategies.

    Modes:
      * ``'bootstrap'``  - random train/test partitions of the unique
        subjects, precomputed at construction;
      * ``'groupkfold'`` - ``GroupKFold(n_splits)``;
      * ``'loso'``       - ``LeaveOneGroupOut()``;
      * ``'lpso'``       - ``LeavePGroupsOut`` leaving out
        ``n_unique_subjects // n_splits`` subjects per split.
    """

    def __init__(self, sub_indexes, train_size=0.33, n_splits=10, mode='loso'):
        """
        :param sub_indexes: subject label of each sample.
        :param train_size: fraction of subjects used for training
            (bootstrap mode only).
        :param n_splits: number of splits (its meaning depends on the mode).
        :param mode: one of 'bootstrap', 'groupkfold', 'loso', 'lpso'.
        :raises ValueError: if ``mode`` is not one of the supported values.
        """
        self.si = sub_indexes
        self.train_size = train_size
        self.n_splits = n_splits
        self.mode = mode
        self.create_splits()

    def create_splits(self, splits=None):
        """Build the splitter, or the precomputed splits, for ``self.mode``.

        ``splits`` is accepted for backward compatibility but ignored.
        """
        self.splits = None
        if self.mode == 'bootstrap':
            unique = np.unique(self.si)
            rs = ShuffleSplit(n_splits=self.n_splits,
                              test_size=1 - self.train_size)
            splits = []
            for train, test in rs.split(unique):
                # Map the chosen subjects back to sample positions.  Plain
                # index arrays are produced, like the sklearn splitters used
                # by the other modes, instead of np.nonzero's 1-tuples.
                train_ = np.flatnonzero(np.isin(self.si, unique[train]))
                test_ = np.flatnonzero(np.isin(self.si, unique[test]))
                splits.append((train_, test_))
            self.splits = splits
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(
                n_groups=len(np.unique(self.si)) // self.n_splits)
        else:
            # Previously an unknown mode left ``self.splitter`` undefined and
            # failed later with an AttributeError.
            raise ValueError("Unknown split mode: %r" % (self.mode,))

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits that ``split`` will yield."""
        if self.splitter is not None:
            return self.splitter.get_n_splits(X, y, groups)
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train, test)`` index pairs."""
        if self.splitter is not None:
            for train, test in self.splitter.split(X, y, groups):
                yield train, test
        else:
            for pair in self.splits:
                yield pair
def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
    """
    Return train/test index partitions for hold-one-subject-out
    cross-validation, built with LeavePGroupsOut(n_groups=1).

    Parameters
    ----------
    tcrrep : TCRrep class instance
        Instance exposing clone_df.subject, clone_df.epitope and
        paired_tcrregex. Defaults to self.tcrrep when None.

    Returns
    -------
    partitions : generator object
        The result of LeavePGroupsOut.split(X, y, groups).
    """
    rep = self.tcrrep if tcrrep is None else tcrrep

    # Target vector: every epitope mapped to an integer code.
    epitope_encoder = preprocessing.LabelEncoder()
    epitope_encoder.fit(list(rep.clone_df.epitope.unique()))
    y = epitope_encoder.transform(rep.clone_df.epitope)

    # Sample matrix handed to the splitter.
    X = rep.paired_tcrregex

    # Groups: every subject mapped to an integer code.
    subject_encoder = preprocessing.LabelEncoder().fit(
        list(rep.clone_df.subject.unique()))
    groups = list(subject_encoder.transform(rep.clone_df.subject))

    splitter = LeavePGroupsOut(n_groups=1)
    # Result is unused; the call is kept so behaviour matches the original.
    splitter.get_n_splits(X, y, groups)
    return splitter.split(X, y, groups)
def split_groups(filenames, labels, groups, size):
    """Pick one leave-``size``-groups-out split at random.

    Splits are visited in order; each is accepted with probability 0.05
    (``random() > 0.95``).  If none is accepted, the last split is used.

    :param filenames: per-sample file names.
    :param labels: per-sample labels.
    :param groups: per-sample group ids.
    :param size: number of groups held out for the test side.
    :return: ``(train_filenames, test_filenames, train_groups, train_labels)``
        as numpy arrays, in that order.
    """
    splitter = LeavePGroupsOut(n_groups=size)
    for train, test in splitter.split(filenames, labels, groups=groups):
        if random() > 0.95:
            break

    # ``train`` / ``test`` now hold the accepted split, or the last one.
    names = np.array(filenames)
    train_filenames = names[train]
    test_filenames = names[test]
    train_labels = np.array(labels)[train]
    train_groups = np.array(groups)[train]
    return train_filenames, test_filenames, train_groups, train_labels
def construct_exp_splits(feature_frame, leave_n_out=1):
    """
    Build the (train, test) splits for a feature_frame covering several
    experiments. The splits index feature_frame by integer position, not
    by label.

    Input:
        feature_frame : DataFrame
            A pandas dataframe returned by extract_features_targets
            representing multiple experiments; the first index level
            defines the groups.
        leave_n_out : int
            The number of experiments to leave out in each cross
            validation fold
    Returns:
        [(Array, Array)]
            A list of (train index, test index) splits
    """
    experiment_ids = feature_frame.index.get_level_values(0)
    splitter = LeavePGroupsOut(n_groups=leave_n_out)
    return list(splitter.split(feature_frame.values, groups=experiment_ids))
def def_get_n_psplits(X, y, groups, p, n):
    """Draw ``n`` leave-``p``-groups-out splits at random.

    All splits are enumerated first, then ``n`` positions are drawn with
    ``np.random.choice`` (its default samples with replacement, so a split
    may appear more than once).

    :return: list of ``n`` ``(train_index, test_index)`` pairs.
    """
    all_splits = list(LeavePGroupsOut(n_groups=p).split(X, y, groups))
    picked = np.random.choice(len(all_splits), n).tolist()
    return [all_splits[position] for position in picked]
# Repeated leave-N_GROUPS-speakers-out training script.
N_GROUPS = 1
N_EPOCH = 1
N_TRAIN = 2
START_SPEAKER = 0
loop = []
for i in range(N_TRAIN):
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(
        i + 1, N_TRAIN))
    lpgo = LeavePGroupsOut(
        n_groups=N_GROUPS)  # Number of groups to leave out in the test split.
    # One group label per sample, taken from the 'speakers' column.
    groups = np.array(labels_df_filtered['speakers'])
    cvscores = []
    gen = lpgo.split(X_reshape, y, groups)
    for num, indices in islice(
            enumerate(gen, 1),  # fold numbers start at 1 instead of 0
            START_SPEAKER,
            None):  # skip the first START_SPEAKER folds (none while it is 0)
        train_idx = indices[0]
        val_idx = indices[1]
        print(' ===== Fitting CV {} out of {} ====='.format(
            num, lpgo.get_n_splits(groups=groups)))
        print(" TRAIN:", np.unique(groups[train_idx]))
        print(" VAL:", np.unique(groups[val_idx]))
        # X_reshape is indexed directly; y is indexed positionally via .iloc.
        train_X = X_reshape[train_idx]
        train_y = y.iloc[train_idx]
# Random-forest experiment: fits one classifier per leave-one-group-out fold
# and saves it to disk. Disabled while RF is False.
RF = False
if RF == True:
    seed(0)
    predict = False  # True, False
    save_model_path = './models/RF/'
    check_dirs.check_dir(save_model_path)
    #kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    lppo = LeavePGroupsOut(n_groups=1)
    fold_no = 1
    f1_per_fold, acc_per_fold, pre_per_fold, rec_per_fold = [], [], [], []
    # NOTE(review): `features` is reshaped here, but the loop below reads
    # `features_new` - confirm which array is intended.
    features = np.reshape(features, (features.shape[0], features.shape[1]))
    # training
    if not predict:
        #for train, test in kfold.split(features, labels):
        for train, test in lppo.split(features_new, labels_new, groups=groups_new):
            # Labels are flattened to 1-D for each side of the fold.
            feat_train, labels_train = features_new[train], labels_new[
                train].reshape((len(labels_new[train])))
            feat_test, labels_test = features_new[test], labels_new[
                test].reshape((len(labels_new[test])))
            clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
            # Fit data to model, then save models
            clf.fit(feat_train, labels_train)
            para = clf.get_params()
            # name: fold<fold_no>_<text of str(clf) before '('>_estimators_<n_estimators>
            filename = 'fold%d_' % fold_no + str(clf).split(
                '(')[0] + '_estimators_%d' % para['n_estimators']
            # NOTE(review): fold_no is not incremented in the visible code; if
            # it is not incremented later, every fold writes the same file.
            joblib.dump(clf, save_model_path + filename)
param_grid=p_grid, cv=inner_cv, scoring="neg_mean_squared_error", verbose=True, return_train_score=False, n_jobs=-1) all_models = [] best_params = [] predicted = np.zeros(len(y)) nested_scores_train = np.zeros(outer_cv.get_n_splits(X, groups=df.study)) nested_scores_test = np.zeros(outer_cv.get_n_splits(X, groups=df.study)) print("model\tinner_cv mean score\touter vc score") i = 0 for train, test in outer_cv.split(X, y, groups=df.study): print(test) group_train = groups[train] clf.fit(X[train], y[train], groups=group_train) print( str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X[test], y[test]))) all_models.append(clf.best_estimator_) best_params.append(clf.best_params_) predicted[test] += clf.predict( X[test]) # added, to later construct average nested_scores_train[i] = clf.best_score_
# .. note:: # Split is made to generate each fold for tr, vl in LPSGO.split(X, y, g): print(tr.shape, vl.shape) print('y label with number of samples') print(np.unique(y[tr], return_counts=True)) ############################################################################## # Differences with scikit-learn # ------------------------------------------- from sklearn.model_selection import LeavePGroupsOut # You need to specify the number of groups LPGO = LeavePGroupsOut(n_groups=2) for tr, vl in LPGO.split(X, y, g): print(tr.shape, vl.shape) ############################################################################## # With GroupShuffleSplit, won't keep the percentage per subgroup # This generate unbalanced classes from sklearn.model_selection import GroupShuffleSplit GSS = GroupShuffleSplit(test_size=0.5, n_splits=2) for tr, vl in GSS.split(X, y, g): print(tr.shape, vl.shape) print('y label with number of samples') print(np.unique(y[tr], return_counts=True)) ###############################################################################
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Repeated leave-N_GROUPS-speakers-out training script.
N_GROUPS = 1
N_EPOCH = 10
N_TRAIN = 1
loop = []
for i in range(N_TRAIN):
    # Report 1-based progress ("1 OUT OF 1", not "0 OUT OF 1").
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(i + 1, N_TRAIN))
    lpgo = LeavePGroupsOut(n_groups=N_GROUPS)  # Number of groups to leave out in the test split.
    # One group label per sample, taken from the 'speakers' column.
    groups = np.array(labels_df_filtered['speakers'])
    # The fold count does not change inside the loop, so compute it once.
    n_folds = lpgo.get_n_splits(groups=groups)
    cvscores = []
    for num, indices in enumerate(lpgo.split(X, y, groups)):
        train_idx = indices[0]
        val_idx = indices[1]
        # `num` stays 0-based for any later use; only the message is 1-based.
        print(' ===== Fitting CV {} out of {} ====='.format(num + 1, n_folds))
        print(" TRAIN:", np.unique(groups[train_idx]))
        print(" VAL:", np.unique(groups[val_idx]))
        train_X = X[train_idx]
        train_y = y.iloc[train_idx]
        val_X = X[val_idx]
        val_y = y.iloc[val_idx]
        # Insert a singleton axis after the sample axis:
        # (n, d1, ...) -> (n, 1, d1, ...).
        train_X = train_X.reshape((train_X.shape[0],) + (1,) + train_X.shape[1:])
# ====== Group k-fold, leave-one-group-out, leave-P-groups-out, GroupShuffleSplit ======
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# Group k-fold: the training and test sets come from different groups
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("组 k-fold分割:%s %s" % (train, test))

# Leave one group out
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("留一组分割:%s %s" % (train, test))

# Leave P groups out
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("留 P 组分割:%s %s" % (train, test))

# Random group split
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("随机分割:%s %s" % (train, test))

# ====== Time series split ======
# (A stray `TimeSeriesSplit(max_train_size=None, n_splits=3)` expression, which
# built an object and discarded it, was removed here.)
tscv = TimeSeriesSplit(n_splits=3)
for train, test in tscv.split(iris.data):
    print("时间序列分割:%s %s" % (train, test))
#  [ 0.74285061  1.46351659]
#  [ 2.49913075  1.23133799]]
# y
# [1 0 2 0 0 1 1 2 0 2 2 1]

# --------------------------------------------------------------------------------
# Suppose the example data was collected with the following group distribution
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

# --------------------------------------------------------------------------------
# Configure model
leave_p_group_out = LeavePGroupsOut(n_groups=2)
leave_p_group_out_get_n_splits = leave_p_group_out.get_n_splits(X, y, groups)
# print('leave_p_group_out_get_n_splits',leave_p_group_out_get_n_splits)
# leave_p_group_out_get_n_splits 6

# --------------------------------------------------------------------------------
# Show the shapes, held-out data and group membership of every split
for train, test in leave_p_group_out.split(X, y, groups):
    print('train', train.shape)
    print('test', test.shape)
    # The original printed X[train] under the 'X[test]' label.
    print('X[test]\n', X[test])
    print('y[test]\n', y[test])
    print('train groups', np.array(groups)[train])
    print('test groups', np.array(groups)[test])
    print('')