def test_split():

    # test on DataArray with number of samples multiple of new length
    X_da = xr.DataArray(
        np.random.random((100, 10)),
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample", "feature"], np.tile("Test", (100, 10))),
        },
        dims=("sample", "feature"),
    )

    estimator = Splitter(
        new_dim="split_sample",
        new_len=5,
        reduce_index="subsample",
        axis=1,
        keep_coords_as="sample_coord",
    )

    Xt_da = estimator.fit_transform(X_da)

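    # 100 samples split into groups of new_len=5 yield 20 groups, so the
    # transformed array has shape (20, 5, 10)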
    assert Xt_da.shape == (20, 5, 10)
    npt.assert_allclose(Xt_da[0, :, 0], X_da[:5, 0])

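    # keep_coords_as="sample_coord" stores the original sample index in a
    # separate coordinate; the round trip below restores the input exactly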
    Xit_da = estimator.inverse_transform(Xt_da)

    xrt.assert_allclose(X_da, Xit_da)

    # test on Dataset with number of samples NOT multiple of new length
    X_ds = xr.Dataset(
        {"var_1": (["sample", "feature"], np.random.random((100, 10)))},
        coords={
            "sample": range(100),
            "feature": range(10)
        },
    )

    Xt_ds = split(
        X_ds,
        new_dim="split_sample",
        new_len=7,
        reduce_index="head",
        axis=1,
        new_index_func=None,
    )

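    # 100 is not a multiple of 7: only 14 complete groups of 7 (98 samples)
    # fit, so the trailing samples are dropped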
    assert Xt_ds["var_1"].shape == (14, 7, 10)
    npt.assert_allclose(Xt_ds.var_1[0, :, 0], X_ds.var_1[:7, 0])
axarr[2].plot(X_plot.sample, X_plot.sel(axis='z'), color='#2ca02c')
axarr[2].set_xlabel('Time [s]')
axarr[2].set_title('Acceleration along z-axis')


##############################################################################
# Then we define a pipeline with various preprocessing steps and a classifier.
#
# The preprocessing consists of splitting the data into segments, removing
# segments with `nan` values and standardizing. Since the accelerometer data is
# three-dimensional but the standardizer and classifier expect a one-dimensional
# feature vector, we have to vectorize the samples.
#
# Finally, we use PCA and a naive Bayes classifier for classification.

pl = Pipeline([
    # split each subject/activity group into segments of 30 timepoints
    ('splitter', Splitter(
        groupby=['subject', 'activity'], new_dim='timepoint', new_len=30)),
    # drop segments that contain nan values
    ('sanitizer', Sanitizer()),
    # vectorize each three-dimensional sample into a 1-d feature vector
    ('featurizer', Featurizer()),
    ('scaler', wrap(StandardScaler)),
    # reshapes='feature' tells the wrapper that these estimators reshape
    # (change or remove) the feature dimension
    ('pca', wrap(PCA, reshapes='feature')),
    ('cls', wrap(GaussianNB, reshapes='feature'))
])

##############################################################################
# Since we want to use cross-validated grid search to find the best model
# parameters, we define a cross-validator. To make sure the model performs
# subject-independent recognition, we use a `GroupShuffleSplit`
# cross-validator, which ensures that the same subject will not appear in
# both the training and validation sets.

cv = CrossValidatorWrapper(
    GroupShuffleSplit(n_splits=3, test_size=0.3), groupby=['subject'])
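
##############################################################################
# The wrapped cross-validator can be plugged into scikit-learn's grid search
# directly. The following sketch is not part of the original example: the
# parameter grid over `pca__n_components` is hypothetical and only
# illustrates the interface.

from sklearn.model_selection import GridSearchCV

# illustrative grid; tune the candidate values for your own data
gs = GridSearchCV(pl, cv=cv, param_grid={'pca__n_components': [10, 20]})

# fitting would then proceed with the data prepared earlier in the example,
# e.g. gs.fit(X, y)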