예제 #1
0
def split_example(mode='shuffle'):
    """Demonstrate scikit-learn cross-validation splitters on a toy dataset.

    The original version selected a splitter via a chain of ``if False``
    branches that required editing the source; those dead branches also
    called ``split.split(X)`` without the ``y``/``groups`` arguments the
    stratified and group splitters need. This version selects the splitter
    with a parameter and passes each splitter the arguments it requires.

    Parameters
    ----------
    mode : str, optional
        One of 'predefined', 'stratified', 'group', 'timeseries' or
        'shuffle'. The default 'shuffle' reproduces the original behavior.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if mode == 'predefined':
        # The entry test_fold[i] represents the index of the test set that
        # sample i belongs to. Setting test_fold[i] to -1 excludes sample i
        # from every test set (it appears in every training set).
        # NOTE(review): PredefinedSplit is assumed to be imported at module
        # level (e.g. from sklearn.model_selection) -- confirm.
        test_fold = [0, 1, -1, 1]
        split = PredefinedSplit(test_fold)
        print('#splits =', split.get_n_splits(X, y))
        split_args = (X, y)
    elif mode == 'stratified':
        # The stratified folds preserve the percentage of samples per class.
        split = model_selection.StratifiedShuffleSplit(n_splits=3,
                                                       test_size=0.25,
                                                       random_state=None)
        print('#splits =', split.get_n_splits(X, y))
        split_args = (X, y)
    elif mode == 'group':
        # The same group will not appear in two different folds; the number
        # of distinct groups must be at least the number of folds.
        split = model_selection.GroupShuffleSplit(n_splits=3,
                                                  test_size=0.25,
                                                  random_state=None)
        print('#splits =', split.get_n_splits(groups=groups))
        split_args = (X, y, groups)
    elif mode == 'timeseries':
        split = model_selection.TimeSeriesSplit(n_splits=3,
                                                max_train_size=None)
        print('#splits =', split.get_n_splits())
        split_args = (X,)
    else:
        split = model_selection.ShuffleSplit(n_splits=3,
                                             test_size=0.25,
                                             random_state=None)
        print('#splits =', split.get_n_splits(X))
        split_args = (X,)
    print('Split:', split)

    # Each splitter receives exactly the arguments it needs to split.
    for train_indices, test_indices in split.split(*split_args):
        print('TRAIN:', train_indices, 'TEST:', test_indices)

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
예제 #2
0
def temp(samples):
    """Compare how several cross-validation splitters balance classes/groups.

    Runs GroupShuffleSplit, GroupKFold, StratifiedKFold and the project's
    StratifiedGroupKFold over *samples* and logs, for every fold, how many
    groups appear on both sides of the split and how far apart the class
    ratios of the train and test sides are.

    Parameters
    ----------
    samples : object
        Project sample container; must provide ``encoded_1d()`` returning a
        series of integer labels and a ``group_ids`` array.
        # assumes the wbia sample API -- TODO confirm
    """
    from sklearn import model_selection
    from wbia.algo.verif import sklearn_utils

    def check_balance(idxs):
        logger.info('-------')
        # BUG FIX: sklearn splitters yield (train, test); the original
        # unpacked as (test, train), swapping the logged train/test ratios.
        for count, (train, test) in enumerate(idxs):
            logger.info('split %r' % (count,))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            # Groups present on both sides indicate leakage for group-aware CV.
            n_group_isect = len(groups_train.intersection(groups_test))
            # BUG FIX: bincount was never imported (its sklearn.utils.fixes
            # import was commented out); use numpy's directly.
            y_train_freq = np.bincount(y.take(train))
            y_test_freq = np.bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            # Squared distance between the train and test class distributions.
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            logger.info('n_group_isect = %r' % (n_group_isect,))
            logger.info('y_test_ratio = %r' % (y_test_ratio,))
            logger.info('y_train_ratio = %r' % (y_train_ratio,))
            logger.info('balance_error = %r' % (balance_error,))

    # X carries no features; only y and groups drive these splitters.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3

    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    check_balance(list(splitter.split(X=X, y=y, groups=groups)))

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    check_balance(list(splitter.split(X=X, y=y, groups=groups)))

    # StratifiedKFold accepts but ignores the groups argument.
    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    check_balance(list(splitter.split(X=X, y=y, groups=groups)))

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    check_balance(list(splitter.split(X=X, y=y, groups=groups)))
예제 #3
0
    def train_test_split(self,
                         subjects,
                         test_size=.2,
                         random_state=None,
                         return_index=False):
        '''Split the passed subjects into a train set and a test set.

        A group-aware split is used when ``self.groups`` is set, a
        stratified split when ``self.stratify`` is set, and a plain shuffle
        split otherwise. Subjects flagged as train-only by
        ``self.get_train_only`` are always appended to the training side.

        Parameters
        ----------
        subjects : array-like
            `subjects` should be a pandas index or numpy array of subjects.
            They should correspond to any subject indexed groups or stratify.

        test_size : float, int or None, optional
            If float, should be between 0.0 and 1.0 and represent
            the proportion of the dataset to be included in the test split.
            If int, represents the absolute number (or target number) to
            include in the testing group.
            (default = .2)

        random_state : int or None, optional
            Optionally can provide a random state, in
            order to be able to recreate exact splits.
            (default = None)

        return_index : bool, optional
            If True, return integer locations of the train and test
            subjects within the original subjects instead of the subjects
            themselves.
            (default = False)

        Returns
        ----------
        array-like
            The training subjects as computed by the split

        array-like
            The testing subjects as computed by the split
        '''

        original_subjects, subjects, train_only = self.get_train_only(subjects)

        # Pick the splitter matching the configured CV behavior; each is
        # built for a single split.
        if self.groups is not None:
            splitter = MS.GroupShuffleSplit(n_splits=1,
                                            test_size=test_size,
                                            random_state=random_state)
            split_iter = splitter.split(subjects,
                                        groups=self.groups.loc[subjects])
        elif self.stratify is not None:
            splitter = MS.StratifiedShuffleSplit(n_splits=1,
                                                 test_size=test_size,
                                                 random_state=random_state)
            split_iter = splitter.split(subjects, y=self.stratify.loc[subjects])
        else:
            splitter = MS.ShuffleSplit(n_splits=1,
                                       test_size=test_size,
                                       random_state=random_state)
            split_iter = splitter.split(subjects)

        # n_splits=1, so the only (train, test) index pair is the first.
        train_inds, test_inds = list(split_iter)[0]

        train_subjects = subjects[train_inds]
        test_subjects = subjects[test_inds]

        # Train-only subjects are always part of the training side.
        train_subjects = np.concatenate([train_subjects, train_only])

        if return_index:
            train_locs = [
                original_subjects.get_loc(name) for name in train_subjects
            ]
            test_locs = [
                original_subjects.get_loc(name) for name in test_subjects
            ]
            return (train_locs, test_locs)

        return train_subjects, test_subjects
예제 #4
0
# Load training features, label metadata and held-out Kaggle test data.
x_train = np.load("x_train_kaggle.npy")
y_train_data = np.genfromtxt("groups.csv",
                             delimiter=',',
                             dtype=[('id', np.uint), ('group_id', np.uint),
                                    ('surface', 'S22')])
y_train = y_train_data['surface']
xx_test = np.load("x_test_kaggle.npy")

# %% Transform data

# Encode the string surface labels as integer class ids.
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

# Group-aware shuffle split: samples sharing a group_id never appear on
# both the train and the test side of a split.
splitter = model_selection.GroupShuffleSplit(n_splits=36, test_size=0.2)
n_available_splits = splitter.get_n_splits(X=x_train,
                                           y=y_train,
                                           groups=y_train_data['group_id'])

# BUG FIX: get_n_splits() returns an int, so the original
# `tmp_sp.split(...)` raised AttributeError. Iterate the splitter itself,
# and pass the group-id array directly -- wrapping it in a list gave it
# the wrong shape (1 x n instead of n).
for train_i, test_i in splitter.split(x_train, y_train,
                                      y_train_data['group_id']):
    print("TRAIN:", train_i, "TEST:", test_i)
    aX_train, aX_test = x_train[train_i], x_train[test_i]
    ay_train, ay_test = y_train[train_i], y_train[test_i]
    print(aX_train, aX_test, ay_train, ay_test)

# %%

# BUG FIX: X_train was undefined at this point in the original; flatten
# each raw training sample into a 1-D feature vector.
X_train = np.array([x.ravel() for x in x_train])
예제 #5
0
# Leave One Group Out: `logo` (defined above this chunk) holds out one
# distinct group per split.
for train_indices, test_indices in logo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave P Groups out
print('{0:-^70}'.format('Leave P Groups out'))
groups = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]  # 5 distinct groups in total
lpgo = sm.LeavePGroupsOut(n_groups=2)
print('Leave P Groups out class: ', lpgo)
print('splits of lpgo: ',
      lpgo.get_n_splits(X, y, groups=groups))  # C(5, 2) = 10: every pair of groups is left out once
for train_indices, test_indices in lpgo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Group Shuffle Split
print('{0:-^70}'.format('Group Shuffle Split'))
gss = sm.GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
print('Group Shuffle Split class: ', gss)
print('splits of gss: ', gss.get_n_splits(X, y, groups=groups))  # equal to n_splits
for train_indices, test_indices in gss.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Time Series Split: splits follow temporal order -- each training fold is a
# contiguous prefix of the data (never shuffled) and the test fold is the
# block that immediately follows it.
print('{0:-^70}'.format('Time Series Split'))
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
print('Time Series X: \n', X)
print('y: ', y)
tscv = sm.TimeSeriesSplit(n_splits=3)  # the last 3 samples are used as test sets, one per split (original comment was truncated)
print('Time Series Split class: ', tscv)
for train_indices, test_indices in tscv.split(X):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)
                                      criterion='gini',
                                      max_depth=20,
                                      random_state=50)))
# NOTE(review): the models.append(...) call preceding this fragment is
# truncated in this chunk; `models` is presumed to be a list of
# (name, estimator) pairs built above -- confirm against the full file.
models.append(
    ('DTree', DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=70)))
models.append(('SVC', SVC(C=100, kernel='rbf', degree=3, gamma=0.001)))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

for name, model in models:
    cv_results = []
    # Group-aware CV: samples from the same group never span train and test.
    sfs = model_selection.GroupShuffleSplit(n_splits=6,
                                            test_size=0.3,
                                            random_state=0)
    for tr, tt in sfs.split(data_train, output_train, groups_at_training):
        # Fill in indices with the training/test groups
        X_train, X_test = data_train[tr], data_train[tt]
        y_train, y_test = output_train[tr], output_train[tt]
        if name == 'LDA':
            final_model = model.fit(X_train, y_train)
        else:
            # NOTE(review): the third positional argument to fit() lands on
            # whatever the estimator's third parameter is (often
            # sample_weight); passing group ids there looks suspicious --
            # confirm the intent.
            final_model = model.fit(X_train, y_train, groups_at_training[tr])

        # Per-fold accuracy on the held-out side of this split.
        pred_test = final_model.predict(X_test)
        cv_results.append(accuracy_score(y_test, pred_test))
        print(cv_results)

    names.append(name)
예제 #7
0
    # One-hot encode the integer class labels into (n_samples, n_classes).
    target_classes = to_categorical(classes_array)

    # Feature Data
    img_3d_features = np.array(extract_as_3d_image(train_data))
    # Preview a few random samples with their decoded class names.
    for i in range(0, show_images):
        # NOTE(review): if `randint` is random.randint, the upper bound is
        # inclusive and this can raise IndexError on the last index;
        # np.random.randint would be exclusive -- confirm which is imported.
        image_index = randint(0, img_3d_features.shape[0])
        plt.imshow(img_3d_features[image_index])
        plt.title(f'Class: {le.inverse_transform([classes_array[image_index]])[0]}')
        plt.show()
    #img_3d_features = train_data

    # Split the groups to training and testing data. The testing data should only
    # be used in the final evaluation of the model and thus never included in
    # training.
    scores = []
    gss = model_selection.GroupShuffleSplit(n_splits=num_splits, test_size=0.1)
    # Columns of groups_csv presumed from usage: 0 = sample id,
    # 2 = class label, 1 = group id -- TODO confirm against groups.csv.
    split = gss.split(groups_csv[:, 0], le.transform(groups_csv[:, 2]), groups_csv[:, 1])
    round = 0  # NOTE(review): shadows the builtin round()
    for tr, ev in split:
        print("\n==============================================================")
        print(f"=======================   SPLIT {round+1}/{num_splits}   =======================")
        print("==============================================================\n")
        # Index the feature tensor, one-hot targets and group ids by the
        # train/eval indices of this split.
        F_train = img_3d_features[tr]
        y_train = target_classes[tr]
        training_groups = np.array(groups_csv[:, 1])[tr]
        F_test = np.array(img_3d_features[ev])
        y_test = target_classes[ev]

        print(f'Using a total of {len(tr)} groups for training, and {len(ev)} for final evaluation after training the model...')

        #LSTM Structure for raw training data
예제 #8
0
    # Load the raw training tensor and the label / group metadata CSVs.
    train_data = np.load('X_train_kaggle.npy')
    all_id_classes = np.genfromtxt('y_train_final_kaggle.csv',
                                   delimiter=',',
                                   dtype='str')
    groups_csv = np.genfromtxt('groups.csv', delimiter=',', dtype='str')

    # Encode the string class names (column 1) as integer ids.
    le = preprocessing.LabelEncoder()
    le.fit(all_id_classes[:, 1])
    all_id_classes_transformed = le.transform(all_id_classes[:, 1])
    classes_array = np.array(all_id_classes_transformed)

    #Transform labels to n x 9 vectors
    target_classes = to_categorical(classes_array)

    # Split the groups to training and validation data.
    # Columns of groups_csv presumed from usage: 0 = sample id,
    # 2 = class label, 1 = group id -- TODO confirm against groups.csv.
    gss = model_selection.GroupShuffleSplit(n_splits=1, test_size=0.2)
    data_split = gss.split(groups_csv[:, 0], groups_csv[:, 2], groups_csv[:,
                                                                          1])

    #Feature Data
    ravel_data = np.array(extract_ravel(train_data))
    mean_data = np.array(extract_mean(train_data))
    var_mean_data = np.array(extract_var_mean(train_data))
    chanel_var_mean = np.array(extract_chanel_var_mean(train_data))

    #Reshape mean data from (1703, 10) to (1703, 10, 1)
    mean_data = mean_data.reshape([int(len(mean_data)), 10, 1])
    var_mean_data = var_mean_data.reshape(int(len(var_mean_data)), 2, 1)

    # Presumably an L1 regularization weight used below this chunk -- confirm.
    weight_l1 = 0.001
예제 #9
0
    # Load label and group metadata as strings.
    all_id_classes = np.genfromtxt('y_train_final_kaggle.csv',delimiter=',',dtype='str')
    #groups_csv = pd.read_csv('groups.csv').values
    groups_csv = np.genfromtxt('groups.csv',delimiter=',',dtype='str')
    # Encode the string class names (column 1) as integer ids.
    le = preprocessing.LabelEncoder()
    le.fit(all_id_classes[:,1])
    all_id_classes_transformed = le.transform(all_id_classes[:,1])
    classes_array = np.array(all_id_classes_transformed)

    # Feature data
    statistical_features = np.array(extract_statistical(train_data))

    ## Split the groups to training and testing data. The testing data should only
    # be used in the final evaluation of the model and thus never included in
    # training.
    number_of_splits = 50
    gss = model_selection.GroupShuffleSplit(n_splits=number_of_splits, test_size=0.2, random_state=0)
    # Columns of groups_csv presumed from usage: 0 = sample id,
    # 2 = class label, 1 = group id -- TODO confirm against groups.csv.
    data_split = gss.split(groups_csv[:, 0], le.transform(groups_csv[:, 2]), groups_csv[:, 1])

    clf_name_list = ['RandomForestClassifier()']
    # One score list per classifier name.
    score_list = [[] for i in range(len(clf_name_list))]

    round = 1  # NOTE(review): shadows the builtin round()
    for train, test in data_split:
        # Fresh classifier per round; hyperparameters (n_estimators,
        # max_depth, criterion) come from enclosing scope outside this chunk.
        clf_list = [
            RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0, criterion=criterion, bootstrap=False, n_jobs=-1)
        ]
        print(f'======== ROUND {round} =========')
        y_train = classes_array[train]
        y_validation = classes_array[test]
        F_train = statistical_features[train]
        F_validation = statistical_features[test]