Example #1
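The test snippets in the examples below are shown without their surrounding module context. A minimal set of module-level imports they appear to assume (inferred from usage; the exact import paths are an assumption and may differ between sklearn_xarray versions) is:

import numpy as np
import numpy.testing as npt
import xarray as xr
from sklearn.preprocessing import LabelBinarizer
from sklearn_xarray import Target
from sklearn_xarray.utils import is_target  # used only in the is_target example below
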
def test_multidim_coord():

    coord_1 = np.tile(['a'] * 51 + ['b'] * 49, (10, 1)).T
    coord_2 = np.random.random((100, 10, 10))

    X_ds = xr.Dataset(
        {
            'var_1':
            (['sample', 'feat_1', 'feat_2'], np.random.random((100, 10, 10)))
        },
        coords={
            'sample': range(100),
            'feature': range(10),
            'coord_1': (['sample', 'feat_1'], coord_1),
            'coord_2': (['sample', 'feat_1', 'feat_2'], coord_2)
        })

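    # The assertions below check that dim='sample' collapses the 2-D
    # coordinate to its per-sample labels before binarizing, and that
    # dim=['sample', 'feat_1'] reduces the remaining 'feat_2' axis with
    # reduce_func (np.mean here).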
    target_1 = Target(coord='coord_1',
                      transform_func=LabelBinarizer().fit_transform,
                      dim='sample')(X_ds)
    target_2 = Target(coord='coord_2',
                      dim=['sample', 'feat_1'],
                      reduce_func=np.mean)(X_ds)

    npt.assert_equal(target_1, LabelBinarizer().fit_transform(coord_1[:, 0]))
    npt.assert_equal(target_2, np.mean(coord_2, 2))
Example #2
def test_getitem():

    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {'var_1': (['sample', 'feature'], np.random.random((100, 10)))},
        coords={
            'sample': range(100),
            'feature': range(10),
            'coord_1': (['sample'], coord_1),
            'coord_2': (['sample'], coord_2)
        })

    target = Target(coord='coord_1',
                    transform_func=LabelBinarizer().fit_transform)(X_ds)

    y_test = target[-1]

    assert y_test == LabelBinarizer().fit_transform(coord_1)[-1]

    # test lazy eval
    target = Target(coord='coord_1',
                    transform_func=LabelBinarizer().fit_transform,
                    lazy=True)(X_ds)

    y_test = target[-1]

    assert y_test == LabelBinarizer().fit_transform(coord_1)[-1]
    assert not y_test.lazy
Example #3
def test_getitem():

    coord_1 = ["a"] * 51 + ["b"] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {"var_1": (["sample", "feature"], np.random.random((100, 10)))},
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample"], coord_1),
            "coord_2": (["sample"], coord_2),
        },
    )

    target = Target(coord="coord_1",
                    transform_func=LabelBinarizer().fit_transform)(X_ds)

    y_test = target[-1]

    assert y_test == LabelBinarizer().fit_transform(coord_1)[-1]

    # test lazy eval
    target = Target(
        coord="coord_1",
        transform_func=LabelBinarizer().fit_transform,
        lazy=True,
    )(X_ds)

    y_test = target[-1]

    assert y_test == LabelBinarizer().fit_transform(coord_1)[-1]
    assert not y_test.lazy
Example #4
def test_multidim_coord():

    coord_1 = np.tile(["a"] * 51 + ["b"] * 49, (10, 1)).T
    coord_2 = np.random.random((100, 10, 10))

    X_ds = xr.Dataset(
        {
            "var_1": (
                ["sample", "feat_1", "feat_2"],
                np.random.random((100, 10, 10)),
            )
        },
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample", "feat_1"], coord_1),
            "coord_2": (["sample", "feat_1", "feat_2"], coord_2),
        },
    )

    target_1 = Target(
        coord="coord_1",
        transform_func=LabelBinarizer().fit_transform,
        dim="sample",
    )(X_ds)
    target_2 = Target(coord="coord_2",
                      dim=["sample", "feat_1"],
                      reduce_func=np.mean)(X_ds)

    npt.assert_equal(target_1, LabelBinarizer().fit_transform(coord_1[:, 0]))
    npt.assert_equal(target_2, np.mean(coord_2, 2))
Example #5
def test_str():

    assert str(Target()).startswith(
        'Unassigned sklearn_xarray.Target without coordinate')

    assert str(Target(coord='test')).startswith(
        'Unassigned sklearn_xarray.Target with coordinate "test"')

    assert str(Target()(np.ones(10))).startswith(
        'sklearn_xarray.Target with data:')
Example #6
def test_is_target():

    target = Target()

    assert is_target(target)

    not_a_target = 1

    assert not is_target(not_a_target)
Example #7
def test_constructor():

    from sklearn_xarray.utils import convert_to_ndarray

    coord_1 = ["a"] * 51 + ["b"] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {"var_1": (["sample", "feature"], np.random.random((100, 10)))},
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample"], coord_1),
            "coord_2": (["sample"], coord_2),
        },
    )

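    # The assertions below check two construction patterns: with no coord,
    # the target is built from the data variable itself (var_1 converted to
    # an ndarray), while coord="coord_1" with transformer=LabelBinarizer()
    # matches applying fit_transform to that coordinate directly.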
    target = Target(transform_func=convert_to_ndarray)
    target.assign_to(X_ds)

    npt.assert_equal(target.values, np.array(X_ds.var_1))

    target = Target(coord="coord_1", transformer=LabelBinarizer())(X_ds)

    npt.assert_equal(target.values, LabelBinarizer().fit_transform(coord_1))
Example #8
def test_array():

    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {'var_1': (['sample', 'feature'], np.random.random((100, 10)))},
        coords={
            'sample': range(100),
            'feature': range(10),
            'coord_1': (['sample'], coord_1),
            'coord_2': (['sample'], coord_2)
        })

    target = Target(coord='coord_1',
                    transform_func=LabelBinarizer().fit_transform,
                    lazy=True)(X_ds)

    npt.assert_equal(np.array(target), LabelBinarizer().fit_transform(coord_1))
Example #9
def test_shape_and_ndim():

    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {'var_1': (['sample', 'feature'], np.random.random((100, 10)))},
        coords={
            'sample': range(100),
            'feature': range(10),
            'coord_1': (['sample'], coord_1),
            'coord_2': (['sample'], coord_2)
        })

    target = Target(coord='coord_1',
                    transform_func=LabelBinarizer().fit_transform)(X_ds)

    npt.assert_equal(target.shape,
                     LabelBinarizer().fit_transform(coord_1).shape)

    npt.assert_equal(target.ndim, LabelBinarizer().fit_transform(coord_1).ndim)
Example #10
def test_array():

    coord_1 = ["a"] * 51 + ["b"] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {"var_1": (["sample", "feature"], np.random.random((100, 10)))},
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample"], coord_1),
            "coord_2": (["sample"], coord_2),
        },
    )

    target = Target(
        coord="coord_1",
        transform_func=LabelBinarizer().fit_transform,
        lazy=True,
    )(X_ds)

    npt.assert_equal(np.array(target), LabelBinarizer().fit_transform(coord_1))
Example #11
def test_shape_and_ndim():

    coord_1 = ["a"] * 51 + ["b"] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {"var_1": (["sample", "feature"], np.random.random((100, 10)))},
        coords={
            "sample": range(100),
            "feature": range(10),
            "coord_1": (["sample"], coord_1),
            "coord_2": (["sample"], coord_2),
        },
    )

    target = Target(coord="coord_1",
                    transform_func=LabelBinarizer().fit_transform)(X_ds)

    npt.assert_equal(target.shape,
                     LabelBinarizer().fit_transform(coord_1).shape)

    npt.assert_equal(target.ndim, LabelBinarizer().fit_transform(coord_1).ndim)
Example #12
def test_constructor():

    from sklearn_xarray.utils import convert_to_ndarray

    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    X_ds = xr.Dataset(
        {'var_1': (['sample', 'feature'], np.random.random((100, 10)))},
        coords={
            'sample': range(100),
            'feature': range(10),
            'coord_1': (['sample'], coord_1),
            'coord_2': (['sample'], coord_2)
        })

    target = Target(transform_func=convert_to_ndarray)
    target.assign_to(X_ds)

    npt.assert_equal(target.values, np.array(X_ds.var_1))

    target = Target(coord='coord_1', transformer=LabelBinarizer())(X_ds)

    npt.assert_equal(target.values, LabelBinarizer().fit_transform(coord_1))
Example #13
#
# .. tip::
#
#     To use multi-processing, set ``n_jobs=-1``.

gs = GridSearchCV(
    pl, cv=cv, n_jobs=1, verbose=1, param_grid={
        'pca__n_components': [10, 20]
    })
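
##############################################################################
# This snippet is an excerpt from a larger demo script, so ``pl``, ``cv`` and
# ``X`` are assumed to be defined earlier in that script. As a sketch of the
# tip above, the same grid search with multi-processing would only change
# ``n_jobs`` (``gs_parallel`` is an illustrative name; on Windows, call its
# ``fit`` inside the ``if __name__ == '__main__':`` guard shown below):

gs_parallel = GridSearchCV(
    pl, cv=cv, n_jobs=-1, verbose=1, param_grid={
        'pca__n_components': [10, 20]
    })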

##############################################################################
# The label to classify is the activity, which we convert to an integer
# representation for the classification.

y = Target(coord='activity',
           transform_func=LabelEncoder().fit_transform,
           dim='sample')(X)

##############################################################################
# Finally, we run the grid search and print out the best parameter combination.

if __name__ == '__main__':  # in order for n_jobs=-1 to work on Windows
    gs.fit(X, y)
    print('Best parameters: {0}'.format(gs.best_params_))
    print('Accuracy: {0}'.format(gs.best_score_))

##############################################################################
# .. note::
#
#     The performance of this classifier is obviously pretty bad;
#     it was chosen for execution speed, not accuracy.
Example #14
    def fit(self,
            data,
            labels,
            clf,
            grid_search=False,
            targ_name='targ',
            targ_dim_name='sample',
            col=None):
        """
        Fits a classifier given class labels.

        Args:
            data (DataArray): The data to fit on.
            labels (str | Path | GeoDataFrame): Class labels as polygon geometry.
            clf (object): The classifier or classification pipeline.
            grid_search (Optional[bool]): Whether to use a cross-validated grid search.
            targ_name (Optional[str]): The target name.
            targ_dim_name (Optional[str]): The target dimension name.
            col (Optional[str]): The column in ``labels`` you want to assign values from.
                If ``None``, creates a binary raster.

        Returns:
            ``xarray.DataArray``, ``object``:
                Reshaped ``data`` and the fitted classifier object.

        Example:
            >>> import geowombat as gw
            >>> from geowombat.data import l8_224078_20200518, l8_224078_20200518_polygons
            >>> from geowombat.ml import fit
            >>>
            >>> import geopandas as gpd
            >>> from sklearn_xarray.preprocessing import Featurizer
            >>> from sklearn.pipeline import Pipeline
            >>> from sklearn.preprocessing import StandardScaler, LabelEncoder
            >>> from sklearn.decomposition import PCA
            >>> from sklearn.naive_bayes import GaussianNB
            >>>
            >>> le = LabelEncoder()
            >>>
            >>> labels = gpd.read_file(l8_224078_20200518_polygons)
            >>> labels['lc'] = le.fit(labels.name).transform(labels.name)
            >>>
            >>> # Use a data pipeline
            >>> pl = Pipeline([('featurizer', Featurizer()),
            ...                ('scaler', StandardScaler()),
            ...                ('pca', PCA()),
            ...                ('clf', GaussianNB())])
            >>>
            >>> with gw.open(l8_224078_20200518) as src:
            ...     X, clf = fit(src, labels, pl, grid_search=True, col='lc')
        """

        data = self._prepare_labels(data, labels, col, targ_name)
        X, Xna = self._prepare_predictors(data, targ_name)
        clf = self._prepare_classifiers(clf)

        if grid_search:
            clf = self.grid_search_cv(clf)

        # TODO: should we be using lazy=True?
        y = Target(coord=targ_name,
                   transform_func=LabelEncoder().fit_transform,
                   dim=targ_dim_name)(Xna)

        clf.fit(Xna, y)

        return X, clf
Example #15
# .. tip::
#
#     To use multi-processing, set ``n_jobs=-1``.

gs = GridSearchCV(pl,
                  cv=cv,
                  n_jobs=1,
                  verbose=1,
                  param_grid={"pca__n_components": [10, 20]})

##############################################################################
# The label to classify is the activity, which we convert to an integer
# representation for the classification.

y = Target(coord="activity",
           transform_func=LabelEncoder().fit_transform,
           dim="sample")(X)

##############################################################################
# Finally, we run the grid search and print out the best parameter combination.

if __name__ == "__main__":  # in order for n_jobs=-1 to work on Windows
    gs.fit(X, y)
    print("Best parameters: {0}".format(gs.best_params_))
    print("Accuracy: {0}".format(gs.best_score_))

##############################################################################
# .. note::
#
#     The performance of this classifier is obviously pretty bad;
#     it was chosen for execution speed, not accuracy.