def test_multidim_coord():
    """Check Target on coordinates that span more than one dimension.

    ``coord_1`` is a 2-D label coordinate targeted along ``sample`` with a
    label binarizer; ``coord_2`` is a 3-D coordinate reduced with ``np.mean``
    so that only ``sample`` and ``feat_1`` remain.
    """
    labels = ['a'] * 51 + ['b'] * 49
    # Tiling gives (10, 100); transposing yields one row per sample, with
    # every column holding the same label sequence.
    coord_1 = np.tile(labels, (10, 1)).T
    coord_2 = np.random.random((100, 10, 10))

    data_vars = {
        'var_1': (
            ['sample', 'feat_1', 'feat_2'],
            np.random.random((100, 10, 10)),
        ),
    }
    coords = {
        'sample': range(100),
        'feature': range(10),
        'coord_1': (['sample', 'feat_1'], coord_1),
        'coord_2': (['sample', 'feat_1', 'feat_2'], coord_2),
    }
    X_ds = xr.Dataset(data_vars, coords=coords)

    label_target = Target(
        coord='coord_1',
        transform_func=LabelBinarizer().fit_transform,
        dim='sample',
    )
    target_1 = label_target(X_ds)

    mean_target = Target(
        coord='coord_2',
        dim=['sample', 'feat_1'],
        reduce_func=np.mean,
    )
    target_2 = mean_target(X_ds)

    # Only the first column of coord_1 is needed for the expectation.
    expected_1 = LabelBinarizer().fit_transform(coord_1[:, 0])
    npt.assert_equal(target_1, expected_1)

    # Reducing to (sample, feat_1) is compared with a mean over the last axis.
    expected_2 = np.mean(coord_2, axis=2)
    npt.assert_equal(target_2, expected_2)
def test_getitem():
    """Check that indexing a Target works for eager and lazy evaluation."""
    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    data_vars = {
        'var_1': (['sample', 'feature'], np.random.random((100, 10))),
    }
    coords = {
        'sample': range(100),
        'feature': range(10),
        'coord_1': (['sample'], coord_1),
        'coord_2': (['sample'], coord_2),
    }
    X_ds = xr.Dataset(data_vars, coords=coords)

    # Eager target: the last item must match the last binarized label.
    eager = Target(
        coord='coord_1',
        transform_func=LabelBinarizer().fit_transform,
    )
    target = eager(X_ds)
    y_test = target[-1]
    expected_last = LabelBinarizer().fit_transform(coord_1)[-1]
    assert y_test == expected_last

    # test lazy eval
    deferred = Target(
        coord='coord_1',
        transform_func=LabelBinarizer().fit_transform,
        lazy=True,
    )
    target = deferred(X_ds)
    y_test = target[-1]
    expected_last = LabelBinarizer().fit_transform(coord_1)[-1]
    assert y_test == expected_last
    # The indexed result of a lazy target must no longer be flagged lazy.
    assert not y_test.lazy
def test_getitem():
    """Indexing a Target returns the matching transformed element.

    Covers both the default evaluation mode and ``lazy=True``; in the lazy
    case the indexed result must report ``lazy`` as false.
    """
    # 51 "a" labels followed by 49 "b" labels.
    coord_1 = ["a" if i < 51 else "b" for i in range(100)]
    # 0..9 repeated ten times.
    coord_2 = [i % 10 for i in range(100)]

    variables = {"var_1": (["sample", "feature"], np.random.random((100, 10)))}
    coordinates = {
        "sample": range(100),
        "feature": range(10),
        "coord_1": (["sample"], coord_1),
        "coord_2": (["sample"], coord_2),
    }
    X_ds = xr.Dataset(variables, coords=coordinates)

    binarize = LabelBinarizer().fit_transform
    target = Target(coord="coord_1", transform_func=binarize)(X_ds)
    y_test = target[-1]
    reference = LabelBinarizer().fit_transform(coord_1)
    assert y_test == reference[-1]

    # test lazy eval
    lazy_binarize = LabelBinarizer().fit_transform
    target = Target(coord="coord_1", transform_func=lazy_binarize, lazy=True)(X_ds)
    y_test = target[-1]
    reference = LabelBinarizer().fit_transform(coord_1)
    assert y_test == reference[-1]
    assert not y_test.lazy
def test_multidim_coord():
    """Targets built from 2-D and 3-D coordinates give the expected arrays.

    The 2-D label coordinate is binarized along ``sample``; the 3-D random
    coordinate is mean-reduced to the ``sample`` and ``feat_1`` dimensions.
    """
    sample_labels = ["a"] * 51 + ["b"] * 49
    # (10, 100) after tiling, transposed to (100, 10).
    coord_1 = np.tile(sample_labels, (10, 1)).T
    coord_2 = np.random.random((100, 10, 10))

    dims_3d = ["sample", "feat_1", "feat_2"]
    variables = {"var_1": (list(dims_3d), np.random.random((100, 10, 10)))}
    coordinates = {
        "sample": range(100),
        "feature": range(10),
        "coord_1": (["sample", "feat_1"], coord_1),
        "coord_2": (list(dims_3d), coord_2),
    }
    X_ds = xr.Dataset(variables, coords=coordinates)

    binarize = LabelBinarizer().fit_transform
    target_1 = Target(coord="coord_1", transform_func=binarize, dim="sample")(X_ds)
    target_2 = Target(
        coord="coord_2",
        dim=["sample", "feat_1"],
        reduce_func=np.mean,
    )(X_ds)

    # Expectation for the label target uses only the first column of coord_1.
    first_column = coord_1[:, 0]
    npt.assert_equal(target_1, LabelBinarizer().fit_transform(first_column))
    # Expectation for the reduced target is the mean over the last axis.
    npt.assert_equal(target_2, np.mean(coord_2, axis=2))
def test_str():
    """Check the string representation of unassigned and assigned targets."""
    # No coordinate, not assigned to any data.
    bare_text = str(Target())
    assert bare_text.startswith(
        'Unassigned sklearn_xarray.Target without coordinate')

    # Coordinate given, still not assigned.
    named_text = str(Target(coord='test'))
    assert named_text.startswith(
        'Unassigned sklearn_xarray.Target with coordinate "test"')

    # Assigned to an array by calling the target on it.
    assigned = Target()(np.ones(10))
    assigned_text = str(assigned)
    assert assigned_text.startswith('sklearn_xarray.Target with data:')
def test_is_target():
    """``is_target`` accepts a Target instance and rejects a plain integer."""
    assert is_target(Target())
    assert not is_target(1)
def test_constructor():
    """Check Target construction with a transform function and a transformer.

    First a target without a coordinate is assigned to the dataset and its
    values are compared with ``var_1``; then a target on ``coord_1`` with a
    ``LabelBinarizer`` transformer is compared with the binarized labels.
    """
    from sklearn_xarray.utils import convert_to_ndarray

    coord_1 = ["a" if i < 51 else "b" for i in range(100)]
    coord_2 = [i % 10 for i in range(100)]

    variables = {"var_1": (["sample", "feature"], np.random.random((100, 10)))}
    coordinates = {
        "sample": range(100),
        "feature": range(10),
        "coord_1": (["sample"], coord_1),
        "coord_2": (["sample"], coord_2),
    }
    X_ds = xr.Dataset(variables, coords=coordinates)

    # Target without a coordinate, assigned explicitly via assign_to.
    whole_target = Target(transform_func=convert_to_ndarray)
    whole_target.assign_to(X_ds)
    npt.assert_equal(whole_target.values, np.array(X_ds["var_1"]))

    # Target on a coordinate, assigned by calling it on the dataset.
    coord_target = Target(coord="coord_1", transformer=LabelBinarizer())(X_ds)
    expected = LabelBinarizer().fit_transform(coord_1)
    npt.assert_equal(coord_target.values, expected)
def test_array():
    """Converting a lazy Target with ``np.array`` gives the binarized labels."""
    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    data_vars = {
        'var_1': (['sample', 'feature'], np.random.random((100, 10))),
    }
    coords = {
        'sample': range(100),
        'feature': range(10),
        'coord_1': (['sample'], coord_1),
        'coord_2': (['sample'], coord_2),
    }
    X_ds = xr.Dataset(data_vars, coords=coords)

    lazy_target = Target(
        coord='coord_1',
        transform_func=LabelBinarizer().fit_transform,
        lazy=True,
    )
    target = lazy_target(X_ds)

    as_array = np.array(target)
    expected = LabelBinarizer().fit_transform(coord_1)
    npt.assert_equal(as_array, expected)
def test_shape_and_ndim():
    """A Target reports the ``shape`` and ``ndim`` of its transformed labels."""
    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    data_vars = {
        'var_1': (['sample', 'feature'], np.random.random((100, 10))),
    }
    coords = {
        'sample': range(100),
        'feature': range(10),
        'coord_1': (['sample'], coord_1),
        'coord_2': (['sample'], coord_2),
    }
    X_ds = xr.Dataset(data_vars, coords=coords)

    target = Target(
        coord='coord_1',
        transform_func=LabelBinarizer().fit_transform,
    )(X_ds)

    # One reference array serves both comparisons.
    expected = LabelBinarizer().fit_transform(coord_1)
    npt.assert_equal(target.shape, expected.shape)
    npt.assert_equal(target.ndim, expected.ndim)
def test_array():
    """``np.array`` applied to a lazy Target matches the binarized coordinate."""
    coord_1 = ["a" if i < 51 else "b" for i in range(100)]
    coord_2 = [i % 10 for i in range(100)]

    variables = {"var_1": (["sample", "feature"], np.random.random((100, 10)))}
    coordinates = {
        "sample": range(100),
        "feature": range(10),
        "coord_1": (["sample"], coord_1),
        "coord_2": (["sample"], coord_2),
    }
    X_ds = xr.Dataset(variables, coords=coordinates)

    binarize = LabelBinarizer().fit_transform
    target = Target(coord="coord_1", transform_func=binarize, lazy=True)(X_ds)

    npt.assert_equal(
        np.array(target),
        LabelBinarizer().fit_transform(coord_1),
    )
def test_shape_and_ndim():
    """``shape`` and ``ndim`` of a Target agree with the binarized labels."""
    coord_1 = ["a" if i < 51 else "b" for i in range(100)]
    coord_2 = [i % 10 for i in range(100)]

    variables = {"var_1": (["sample", "feature"], np.random.random((100, 10)))}
    coordinates = {
        "sample": range(100),
        "feature": range(10),
        "coord_1": (["sample"], coord_1),
        "coord_2": (["sample"], coord_2),
    }
    X_ds = xr.Dataset(variables, coords=coordinates)

    binarize = LabelBinarizer().fit_transform
    target = Target(coord="coord_1", transform_func=binarize)(X_ds)

    reference = LabelBinarizer().fit_transform(coord_1)
    npt.assert_equal(target.shape, reference.shape)
    npt.assert_equal(target.ndim, reference.ndim)
def test_constructor():
    """Build targets with ``transform_func`` and with ``transformer``.

    The coordinate-less target is assigned through ``assign_to`` and must
    expose the values of ``var_1``; the ``coord_1`` target must expose the
    binarized labels.
    """
    from sklearn_xarray.utils import convert_to_ndarray

    coord_1 = ['a'] * 51 + ['b'] * 49
    coord_2 = list(range(10)) * 10

    data_vars = {
        'var_1': (['sample', 'feature'], np.random.random((100, 10))),
    }
    coords = {
        'sample': range(100),
        'feature': range(10),
        'coord_1': (['sample'], coord_1),
        'coord_2': (['sample'], coord_2),
    }
    X_ds = xr.Dataset(data_vars, coords=coords)

    # Without a coordinate the target is compared with the data variable.
    data_target = Target(transform_func=convert_to_ndarray)
    data_target.assign_to(X_ds)
    expected_data = np.array(X_ds['var_1'])
    npt.assert_equal(data_target.values, expected_data)

    # With a coordinate and a transformer instance.
    label_target = Target(coord='coord_1', transformer=LabelBinarizer())(X_ds)
    expected_labels = LabelBinarizer().fit_transform(coord_1)
    npt.assert_equal(label_target.values, expected_labels)
#
# .. tip::
#
#     To use multi-processing, set ``n_jobs=-1``.

gs = GridSearchCV(
    pl,
    cv=cv,
    n_jobs=1,
    verbose=1,
    param_grid={'pca__n_components': [10, 20]},
)

##############################################################################
# The label to classify is the activity, which we convert to an integer
# representation for the classification.

y = Target(
    coord='activity',
    transform_func=LabelEncoder().fit_transform,
    dim='sample',
)(X)

##############################################################################
# Finally, we run the grid search and print out the best parameter combination.

# The main guard is needed for n_jobs=-1 to work on Windows.
if __name__ == '__main__':
    gs.fit(X, y)
    print('Best parameters: {0}'.format(gs.best_params_))
    print('Accuracy: {0}'.format(gs.best_score_))

##############################################################################
# .. note::
#
#     The performance of this classifier is obviously pretty bad,
#     it was chosen for execution speed, not accuracy.
def fit(self, data, labels, clf, grid_search=False, targ_name='targ', targ_dim_name='sample', col=None):
    """
    Fits a classifier given class labels

    The labels are attached to ``data``, predictors are prepared from the
    result, and the classifier (optionally passed through
    ``grid_search_cv``) is fitted against a label-encoded target.

    Args:
        data (DataArray): The data to predict on.
        labels (str | Path | GeoDataFrame): Class labels as polygon geometry.
        clf (object): The classifier or classification pipeline.
        grid_search (Optional[bool]): Whether to use cross-validation. When
            true, the prepared classifier is passed to ``self.grid_search_cv``
            before fitting.
        targ_name (Optional[str]): The target name. Used as the ``coord`` of
            the target.
        targ_dim_name (Optional[str]): The dimension name handed to the
            target as ``dim``.
        col (Optional[str]): The column in ``labels`` you want to assign
            values from. If ``None``, creates a binary raster.

    Returns:
        ``xarray.DataArray``, ``object``: Reshaped `data`, classifier object

    Example:
        >>> import geowombat as gw
        >>> from geowombat.data import l8_224078_20200518, l8_224078_20200518_polygons
        >>> from geowombat.ml import fit
        >>>
        >>> import geopandas as gpd
        >>> from sklearn_xarray.preprocessing import Featurizer
        >>> from sklearn.pipeline import Pipeline
        >>> from sklearn.preprocessing import StandardScaler, LabelEncoder
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.naive_bayes import GaussianNB
        >>>
        >>> le = LabelEncoder()
        >>>
        >>> labels = gpd.read_file(l8_224078_20200518_polygons)
        >>> labels['lc'] = le.fit(labels.name).transform(labels.name)
        >>>
        >>> # Use a data pipeline
        >>> pl = Pipeline([('featurizer', Featurizer()),
        ...                ('scaler', StandardScaler()),
        ...                ('pca', PCA()),
        ...                ('clf', GaussianNB())])
        >>>
        >>> with gw.open(l8_224078_20200518) as src:
        ...     X, clf = fit(src, labels, pl, grid_search=True, col='lc')
    """
    labeled = self._prepare_labels(data, labels, col, targ_name)
    # Two predictor sets come back: X is returned to the caller, Xna is the
    # one the estimator is actually fitted on.
    X, Xna = self._prepare_predictors(labeled, targ_name)

    estimator = self._prepare_classifiers(clf)
    if grid_search:
        estimator = self.grid_search_cv(estimator)

    # TODO: should we be using lazy=True?
    encode_labels = LabelEncoder().fit_transform
    y = Target(
        coord=targ_name,
        transform_func=encode_labels,
        dim=targ_dim_name,
    )(Xna)

    estimator.fit(Xna, y)

    return X, estimator
# .. tip::
#
#     To use multi-processing, set ``n_jobs=-1``.

gs = GridSearchCV(
    pl,
    cv=cv,
    n_jobs=1,
    verbose=1,
    param_grid={"pca__n_components": [10, 20]},
)

##############################################################################
# The label to classify is the activity, which we convert to an integer
# representation for the classification.

y = Target(
    coord="activity",
    transform_func=LabelEncoder().fit_transform,
    dim="sample",
)(X)

##############################################################################
# Finally, we run the grid search and print out the best parameter combination.

# The main guard is needed for n_jobs=-1 to work on Windows.
if __name__ == "__main__":
    gs.fit(X, y)
    print("Best parameters: {0}".format(gs.best_params_))
    print("Accuracy: {0}".format(gs.best_score_))

##############################################################################
# .. note::
#
#     The performance of this classifier is obviously pretty bad,
#     it was chosen for execution speed, not accuracy.