def split_example():
    """Demonstrate several scikit-learn cross-validation splitters on tiny toy data.

    Exactly one branch of the if/elif toggle chain runs; flip one of the
    `False` literals to `True` to exercise a different splitter.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        # test_fold[i] names the test set that sample i belongs to; a value
        # of -1 keeps sample i out of every test set (always in training).
        test_fold = [0, 1, -1, 1]
        split = PredefinedSplit(test_fold)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # Stratified folds preserve the per-class sample percentages.
        split = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # A group never appears in two different folds, so the number of
        # distinct groups must be at least the number of folds.
        split = model_selection.GroupShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        #print('#splits =', split.get_n_splits(X, y, groups))
        print('#splits =', split.get_n_splits(groups=groups))
    elif False:
        split = model_selection.TimeSeriesSplit(n_splits=3, max_train_size=None)
        print('#splits =', split.get_n_splits())
    else:
        split = model_selection.ShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', split.get_n_splits(X))
    print('Split:', split)

    # Alternate call signatures, kept for experimentation:
    #for train_indices, test_indices in split.split():
    #for train_indices, test_indices in split.split(X, y):
    #for train_indices, test_indices in split.split(X, y, groups):
    for tr_idx, te_idx in split.split(X):
        #print('TRAIN:', tr_idx.shape, 'TEST:', te_idx.shape)
        print('TRAIN:', tr_idx, 'TEST:', te_idx)
        X_train, X_test = X[tr_idx], X[te_idx]
        y_train, y_test = y[tr_idx], y[te_idx]
def temp(samples):
    """Compare how several sklearn splitters balance classes and groups.

    For each splitter, every (train, test) split is checked for group
    leakage (groups present on both sides) and for class-ratio imbalance
    between the two sides; results are written via ``logger``.

    Parameters
    ----------
    samples : object
        Must provide ``encoded_1d()`` (returning something with ``.values``)
        and a ``group_ids`` attribute — presumably a labeled-sample
        container from this project; confirm against callers.
    """
    from sklearn import model_selection
    from wbia.algo.verif import sklearn_utils

    def check_balance(idxs):
        # Log group-intersection and class-ratio stats for each split.
        logger.info('-------')
        # BUG FIX: sklearn splitters yield (train, test); the original
        # unpacked them as (test, train), so the logged train/test ratios
        # carried swapped labels (balance_error was unaffected — symmetric).
        for count, (train, test) in enumerate(idxs):
            logger.info('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            # Non-empty intersection means the same group leaks into both sides.
            n_group_isect = len(groups_train.intersection(groups_test))
            # BUG FIX: `bincount` was undefined at runtime — its import
            # (sklearn.utils.fixes.bincount) was commented out. np.bincount
            # is the equivalent replacement.
            y_train_freq = np.bincount(y.take(train))
            y_test_freq = np.bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            # Sum of squared differences between the class distributions.
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            logger.info('n_group_isect = %r' % (n_group_isect,))
            logger.info('y_test_ratio = %r' % (y_test_ratio,))
            logger.info('y_train_ratio = %r' % (y_train_ratio,))
            logger.info('balance_error = %r' % (balance_error,))

    # X carries no features — only its length matters to the splitters.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids
    n_splits = 3

    # Evaluate each candidate splitter with the same data and fold count.
    splitters = [
        model_selection.GroupShuffleSplit(n_splits=n_splits),
        model_selection.GroupKFold(n_splits=n_splits),
        model_selection.StratifiedKFold(n_splits=n_splits),
        sklearn_utils.StratifiedGroupKFold(n_splits=n_splits),
    ]
    for splitter in splitters:
        idxs = list(splitter.split(X=X, y=y, groups=groups))
        check_balance(idxs)
def train_test_split(self, subjects, test_size=.2, random_state=None, return_index=False):
    '''Split the input subjects into train and test sets of a target size.

    Parameters
    ----------
    subjects : array-like
        A pandas index or numpy array of subjects, corresponding to any
        subject-indexed groups or stratify values held by this object.
    test_size : float, int or None, optional
        If a float between 0.0 and 1.0, the proportion of the dataset to
        place in the test split; if an int, the absolute number (or target
        number) of test subjects. (default = .2)
    random_state : int or None, optional
        Optional seed so that exact splits can be recreated. (default=None)
    return_index : bool, optional
        If True, return positional indices into the original subjects
        rather than the subject labels. (default = False)

    Returns
    ----------
    array-like
        The training subjects as computed by the split
    array-like
        The testing subjects as computed by the split
    '''
    original_subjects, subjects, train_only = self.get_train_only(subjects)

    # Choose the splitter that honors whichever constraint is configured:
    # group integrity first, then stratification, then a plain shuffle.
    if self.groups is not None:
        splitter = MS.GroupShuffleSplit(n_splits=1, test_size=test_size,
                                        random_state=random_state)
        split_iter = splitter.split(subjects, groups=self.groups.loc[subjects])
    elif self.stratify is not None:
        splitter = MS.StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                             random_state=random_state)
        split_iter = splitter.split(subjects, y=self.stratify.loc[subjects])
    else:
        splitter = MS.ShuffleSplit(n_splits=1, test_size=test_size,
                                   random_state=random_state)
        split_iter = splitter.split(subjects)

    # n_splits=1, so the single (train, test) index pair is the whole result.
    train_inds, test_inds = next(iter(split_iter))
    train_subjects = subjects[train_inds]
    test_subjects = subjects[test_inds]

    # Subjects flagged as train-only always join the training side.
    train_subjects = np.concatenate([train_subjects, train_only])

    if return_index:
        return ([original_subjects.get_loc(name) for name in train_subjects],
                [original_subjects.get_loc(name) for name in test_subjects])
    return train_subjects, test_subjects
# Load the Kaggle training tensors and the id/group/surface metadata.
x_train = np.load("x_train_kaggle.npy")
y_train_data = np.genfromtxt("groups.csv", delimiter=',',
                             dtype=[('id', np.uint), ('group_id', np.uint), ('surface', 'S22')])
y_train = y_train_data['surface']
xx_test = np.load("x_test_kaggle.npy")

# %% Transform data
# Encode the string surface labels as integers.
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

splitter = model_selection.GroupShuffleSplit(n_splits=36, test_size=0.2)
tmp_sp = splitter.get_n_splits(X=x_train, y=y_train, groups=y_train_data['group_id'])

# tmp [X_train, X_test, Y_train, Y_test]
# BUG FIX: get_n_splits() returns an int, so the original
# `tmp_sp.split(...)` raised AttributeError — iterate the splitter itself.
# BUG FIX: groups must be a 1-d array-like, not wrapped in an extra list.
for train_i, test_i in splitter.split(x_train, y_train, y_train_data['group_id']):
    print("TRAIN:", train_i, "TEST:", test_i)
    aX_train, aX_test = x_train[train_i], x_train[test_i]
    ay_train, ay_test = y_train[train_i], y_train[test_i]
    print(aX_train, aX_test, ay_train, ay_test)

# %%
# NOTE(review): X_train is not assigned anywhere in this chunk — presumably
# it is defined elsewhere in the file; confirm before relying on this line.
X_train = np.array([x.ravel() for x in X_train])
# Leave-One-Group-Out splits (logo, X, y, groups defined above).
for tr_idx, te_idx in logo.split(X, y, groups=groups):
    print('Train Indices: ', tr_idx, 'Test Indices: ', te_idx)

# Leave P Groups out
print('{0:-^70}'.format('Leave P Groups out'))
groups = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]  # 5 distinct groups in total
lpgo = sm.LeavePGroupsOut(n_groups=2)
print('Leave P Groups out class: ', lpgo)
# C(5, 2) = 10 ways of leaving 2 of the 5 groups out.
print('splits of lpgo: ', lpgo.get_n_splits(X, y, groups=groups))
for tr_idx, te_idx in lpgo.split(X, y, groups=groups):
    print('Train Indices: ', tr_idx, 'Test Indices: ', te_idx)

# Group Shuffle Split
print('{0:-^70}'.format('Group Shuffle Split'))
gss = sm.GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
print('Group Shuffle Split class: ', gss)
print('splits of gss: ', gss.get_n_splits(X, y, groups=groups))  # equals n_splits
for tr_idx, te_idx in gss.split(X, y, groups=groups):
    print('Train Indices: ', tr_idx, 'Test Indices: ', te_idx)

# Time Series Split: the data is split in temporal order — training folds
# stay contiguous and unshuffled, and the final samples are held out as tests.
print('{0:-^70}'.format('Time Series Split'))
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
print('Time Series X: \n', X)
print('y: ', y)
tscv = sm.TimeSeriesSplit(n_splits=3)  # holds out the last 3 samples, one per fold
print('Time Series Split class: ', tscv)
for tr_idx, te_idx in tscv.split(X):
    print('Train Indices: ', tr_idx, 'Test Indices: ', te_idx)
criterion='gini', max_depth=20, random_state=50))) models.append( ('DTree', DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=70))) models.append(('SVC', SVC(C=100, kernel='rbf', degree=3, gamma=0.001))) # evaluate each model in turn results = [] names = [] scoring = 'accuracy' for name, model in models: cv_results = [] sfs = model_selection.GroupShuffleSplit(n_splits=6, test_size=0.3, random_state=0) for tr, tt in sfs.split(data_train, output_train, groups_at_training): # Fill in indices with the training/test groups X_train, X_test = data_train[tr], data_train[tt] y_train, y_test = output_train[tr], output_train[tt] if name == 'LDA': final_model = model.fit(X_train, y_train) else: final_model = model.fit(X_train, y_train, groups_at_training[tr]) pred_test = final_model.predict(X_test) cv_results.append(accuracy_score(y_test, pred_test)) print(cv_results) names.append(name)
# One-hot encode the integer class labels.
target_classes = to_categorical(classes_array)

# Feature Data
img_3d_features = np.array(extract_as_3d_image(train_data))

# Show a few random sample images with their decoded class names.
for i in range(0, show_images):
    # BUG FIX: random.randint is inclusive at BOTH ends, so the original
    # upper bound shape[0] could index one past the last image.
    # shape[0] - 1 is a valid index whether `randint` here is
    # random.randint or np.random.randint — TODO confirm which is imported.
    image_index = randint(0, img_3d_features.shape[0] - 1)
    plt.imshow(img_3d_features[image_index])
    plt.title(f'Class: {le.inverse_transform([classes_array[image_index]])[0]}')
    plt.show()
#img_3d_features = train_data

# Split the groups to training and testing data. The testing data should only
# be used in the final evaluation of the model and thus never included in
# training.
scores = []
gss = model_selection.GroupShuffleSplit(n_splits=num_splits, test_size=0.1)
split = gss.split(groups_csv[:, 0], le.transform(groups_csv[:, 2]), groups_csv[:, 1])
round = 0
for tr, ev in split:
    print("\n==============================================================")
    print(f"======================= SPLIT {round+1}/{num_splits} =======================")
    print("==============================================================\n")
    F_train = img_3d_features[tr]
    y_train = target_classes[tr]
    training_groups = np.array(groups_csv[:, 1])[tr]
    F_test = np.array(img_3d_features[ev])
    y_test = target_classes[ev]
    print(f'Using a total of {len(tr)} groups for training, and {len(ev)} for final evaluation after training the model...')
    #LSTM Structure for raw training data
# Load the raw training tensors and the id/class/group CSV metadata.
train_data = np.load('X_train_kaggle.npy')
all_id_classes = np.genfromtxt('y_train_final_kaggle.csv', delimiter=',', dtype='str')
groups_csv = np.genfromtxt('groups.csv', delimiter=',', dtype='str')

# Encode the string class labels as integers.
le = preprocessing.LabelEncoder()
le.fit(all_id_classes[:, 1])
all_id_classes_transformed = le.transform(all_id_classes[:, 1])
classes_array = np.array(all_id_classes_transformed)

# Transform labels to n x 9 one-hot vectors.
target_classes = to_categorical(classes_array)

# Split the groups to training and validation data.
gss = model_selection.GroupShuffleSplit(n_splits=1, test_size=0.2)
data_split = gss.split(groups_csv[:, 0], groups_csv[:, 2], groups_csv[:, 1])

# Feature Data — several alternative feature extractions of the raw tensors.
ravel_data = np.array(extract_ravel(train_data))
mean_data = np.array(extract_mean(train_data))
var_mean_data = np.array(extract_var_mean(train_data))
chanel_var_mean = np.array(extract_chanel_var_mean(train_data))

# Reshape mean data from (1703, 10) to (1703, 10, 1).
mean_data = mean_data.reshape(len(mean_data), 10, 1)
var_mean_data = var_mean_data.reshape(len(var_mean_data), 2, 1)

weight_l1 = 0.001
# Load the id/class CSV and the group metadata.
all_id_classes = np.genfromtxt('y_train_final_kaggle.csv', delimiter=',', dtype='str')
#groups_csv = pd.read_csv('groups.csv').values
groups_csv = np.genfromtxt('groups.csv', delimiter=',', dtype='str')

# Encode the string class labels as integers.
le = preprocessing.LabelEncoder()
le.fit(all_id_classes[:, 1])
all_id_classes_transformed = le.transform(all_id_classes[:, 1])
classes_array = np.array(all_id_classes_transformed)

# Feature data
statistical_features = np.array(extract_statistical(train_data))

## Split the groups to training and testing data. The testing data should only
# be used in the final evaluation of the model and thus never included in
# training.
number_of_splits = 50
gss = model_selection.GroupShuffleSplit(n_splits=number_of_splits,
                                        test_size=0.2, random_state=0)
data_split = gss.split(groups_csv[:, 0], le.transform(groups_csv[:, 2]), groups_csv[:, 1])

clf_name_list = ['RandomForestClassifier()']
# One score accumulator per classifier name.
score_list = [[] for _ in clf_name_list]

round = 1
for train, test in data_split:
    # Fresh classifier per split (hyperparameters defined above).
    clf_list = [
        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                               random_state=0, criterion=criterion,
                               bootstrap=False, n_jobs=-1)
    ]
    print(f'======== ROUND {round} =========')
    y_train = classes_array[train]
    y_validation = classes_array[test]
    F_train = statistical_features[train]
    F_validation = statistical_features[test]