def test_kmeans_simple_X(client=None):
    '''Fit a Flatten -> MiniBatchKMeans ensemble on the module-level ``X``.

    Runs the shared ``_train_asserts`` checks against the configured
    ``saved_ensemble_size`` and then verifies that ``predict_many``
    yields one prediction per ensemble member.  ``client`` is accepted
    but not used in the body.
    '''
    kmeans_pipeline = Pipeline([steps.Flatten(),
                                MiniBatchKMeans(n_clusters=6)])
    trained = kmeans_pipeline.fit_ensemble(X=X, **ENSEMBLE_KWARGS)
    expected_size = ENSEMBLE_KWARGS['saved_ensemble_size']
    _train_asserts(trained, expected_size)
    predictions = trained.predict_many(X=X)
    # A single X was passed, so expect one prediction per ensemble member.
    assert len(predictions) == len(trained.ensemble)
def test_simple():
    '''Check the triple returned by fit_transform for a Flatten-only Pipeline.

    The pipeline never produces ``y`` or ``sample_weight`` here, so both
    are expected to come back as ``None``.
    '''
    flatten_only = Pipeline([('a', steps.Flatten())])
    # fit_transform should always return (X, y, sample_weight)
    outputs = flatten_only.fit_transform(**data_source)
    X, y, sample_weight = outputs
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
    assert y is None
    assert sample_weight is None
def test_feature_selection(feat_cls):
    '''Placeholder test for a feature-selection step looked up on ``steps``.

    ``feat_cls`` is the attribute name of the step class in ``steps``.
    The test is marked as an expected failure up front because it does
    not assert anything yet; the code after ``pytest.xfail`` only
    sketches the intended pipeline.
    '''
    pytest.xfail('This test doesnt test anything yet')
    selector_cls = getattr(steps, feat_cls)
    # TODO: come up with some initialization kwargs
    init_kwargs = {}
    stages = [
        steps.Flatten(),
        steps.ModifySample(get_y),
        selector_cls(**init_kwargs),
    ]
    p = Pipeline(stages)
    # TODO: run p.fit_transform(**data_source) and assert on
    # the returned (X, y, sample_weight).
def test_kmeans_simple_sampler(client=None):
    '''Fit a Flatten -> MiniBatchKMeans ensemble from a sampler data source.

    The fit keyword arguments are ``SAMPLER_DATA_SOURCE`` overlaid with
    ``ENSEMBLE_KWARGS``.  ``predict_many`` is expected to return one
    prediction for every (sample in ``args_list``, ensemble member) pair.
    ``client`` is accepted but not used in the body.
    '''
    kmeans_pipeline = Pipeline([steps.Flatten(),
                                MiniBatchKMeans(n_clusters=6)])
    fit_kwargs = SAMPLER_DATA_SOURCE.copy()
    fit_kwargs.update(ENSEMBLE_KWARGS)
    trained = kmeans_pipeline.fit_ensemble(**fit_kwargs)
    members = trained.ensemble
    _train_asserts(trained, ENSEMBLE_KWARGS['saved_ensemble_size'])
    predictions = trained.predict_many(**SAMPLER_DATA_SOURCE)
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(predictions) == n_samples * len(members)
def test_supervised_feat_select_X_y(client=None):
    '''Fit a supervised ensemble: Flatten -> SelectPercentile -> SGDClassifier.

    The data comes from ``X_Y_DATA_SOURCE`` and the classifier is told
    about classes 0, 1 and 2 through ``method_kwargs``.  After the shared
    ``_train_asserts`` checks, ``predict_many`` must return one
    prediction per ensemble member.  ``client`` is accepted but not used
    in the body.
    '''
    selector = steps.SelectPercentile(score_func=f_classif, percentile=50)
    supervised = Pipeline([steps.Flatten(), selector, SGDClassifier()])
    class_info = dict(classes=[0, 1, 2])
    fit_kwargs = dict(method_kwargs=class_info, **ENSEMBLE_KWARGS)
    fit_kwargs.update(X_Y_DATA_SOURCE)
    trained = supervised.fit_ensemble(**fit_kwargs)
    _train_asserts(trained, fit_kwargs['saved_ensemble_size'])
    predictions = trained.predict_many(**X_Y_DATA_SOURCE)
    assert len(predictions) == len(trained.ensemble)
def test_simple():
    '''Fit a two-member KMeans ensemble and compare prediction shapes.

    Ten identical sampler argument tuples are used for fitting; both
    fitted models then predict on a fresh 100 x 400 sample and must
    return flat arrays of length ``400 * 100``.
    '''
    kmeans_pipeline = Pipeline([steps.Flatten(),
                                MiniBatchKMeans(n_clusters=5)])
    sampler_args = [(100, 200, 5)] * 10  # (height, width, bands)
    fit_kwargs = dict(ngen=2,
                      init_ensemble_size=2,
                      sampler=example_sampler,
                      args_list=sampler_args)
    trained = kmeans_pipeline.fit_ensemble(**fit_kwargs)
    # The ensemble holds (tag, model) pairs; exactly two are expected.
    (_tag_a, model_a), (_tag_b, model_b) = trained.ensemble
    X = example_sampler(100, 400, 5)
    labels_a = model_a.predict(X)
    labels_b = model_b.predict(X)
    expected_shape = (400 * 100,)
    assert labels_a.shape == labels_b.shape == expected_shape
def test_pipeline_new_with_params():
    '''Check that ``new_with_params`` yields an unfitted, re-parameterized copy.

    The original pipeline is fit with 4 clusters.  The copy created with
    ``kmeans__n_clusters=7`` and ``pca__n_components=2`` must raise
    ``NotFittedError`` on predict until it is fit, after which its final
    estimator must expose 7 cluster centers.
    '''
    step_list = [
        steps.SelectCanvas('band_1'),
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA(n_components=3))),
        ('kmeans', KMeans(n_clusters=4)),
    ]
    original = Pipeline(step_list)
    original.fit(random_elm_store())
    original.predict(random_elm_store())
    # steps[-1][-1] is the estimator object of the last (name, step) pair.
    fitted_kmeans = original.steps[-1][-1]
    assert fitted_kmeans.cluster_centers_.shape[0] == 4

    variant = original.new_with_params(kmeans__n_clusters=7,
                                       pca__n_components=2)
    with pytest.raises(NotFittedError):
        variant.predict(random_elm_store())
    variant.fit(random_elm_store())
    refit_kmeans = variant.steps[-1][-1]
    assert refit_kmeans.cluster_centers_.shape[0] == 7
def test_modify_sample():
    '''steps.ModifySample wraps an arbitrary function as a Pipeline step.

    The wrapped function is expected to have the signature:

        func(X, y=None, sample_weight=None, **kwargs)

    and to return the tuple:

        (X, y, sample_weight)

    Here ``get_y`` is the wrapped function, so after ``fit_transform``
    the returned ``y`` must be a NumPy array.
    '''
    with_y = Pipeline([steps.Flatten(), steps.ModifySample(get_y)])
    outputs = with_y.fit_transform(**data_source)
    X, y, sample_weight = outputs
    assert X is not None
    assert isinstance(y, np.ndarray)
def test_kmeans_model_selection(client=None):
    '''Run ensemble fitting with custom initialization and model selection.

    The initial ensemble is 100 deep copies of the pipeline, each with a
    randomly chosen ``pca__n_components`` and ``kmeans__n_clusters``.
    Fitting runs with ``ngen=20``, ``kmeans_aic`` as the model scorer and
    ``kmeans_model_averaging`` as the selection function.  Afterwards the
    ensemble must have ``saved_ensemble_size`` members and
    ``predict_many`` must return one prediction per (member, sample) pair.
    ``client`` is accepted but not used in the body.
    '''
    pipe = Pipeline(
        [
            steps.Flatten(),
            ('pca', steps.Transform(IncrementalPCA())),
            ('kmeans', MiniBatchKMeans(n_clusters=5)),
        ],
        scoring=kmeans_aic,
        scoring_kwargs={'score_weights': [-1]},
    )
    cluster_options = list(range(3, 10))

    def constant_shape_sampler(*args, **kwargs):
        # Ignores its arguments; every call builds a new random store.
        return random_elm_store(bands=12, mn=0, mx=1, height=20, width=40)

    def random_population(pipe, **kwargs):
        population = []
        for _ in range(100):
            # Draw n_components before n_clusters so the sequence of
            # random draws is unchanged.
            n_components = np.random.choice(np.arange(2, 6))
            n_clusters = np.random.choice(cluster_options)
            candidate = copy.deepcopy(pipe)
            candidate.set_params(kmeans__n_clusters=n_clusters,
                                 pca__n_components=n_components)
            population.append(candidate)
        return population

    selection_kwargs = dict(drop_n=30, evolve_n=30, choices=cluster_options)
    fit_kwargs = ENSEMBLE_KWARGS.copy()
    fit_kwargs.update(
        ngen=20,
        model_scoring=kmeans_aic,
        ensemble_init_func=random_population,
        model_selection_kwargs=selection_kwargs,
        model_selection=kmeans_model_averaging,
    )
    sampler_source = SAMPLER_DATA_SOURCE.copy()
    sampler_source['sampler'] = constant_shape_sampler
    fit_kwargs.update(sampler_source)

    fitted = pipe.fit_ensemble(**fit_kwargs)
    assert len(fitted.ensemble) == fit_kwargs['saved_ensemble_size']
    preds = fitted.predict_many(**sampler_source)
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(preds) == len(fitted.ensemble) * n_samples
continue X.flat.values[:, j] = np.log10(X.flat.values[:, j]) return X, y, sample_weight def add_sample_weight(X, y=None, sample_weight=None, **kw): '''Modify this function to return a sample_weight if needed. sample_weight returned should be a 1-D NumPy array. Currently it is weighting the pos/neg deviations. ''' sample_weight = np.abs((y - y.mean()) / y.std()) return X, y, sample_weight pipeline_kw = dict(scoring=make_scorer(r_squared_mse)) flat_step = ('flatten', steps.Flatten()) drop_na_step = ('drop_null', steps.DropNaRows()) kw = dict(X_time_steps=X_TIME_STEPS, X_time_averaging=X_TIME_AVERAGING, difference_cols=DIFFERENCE_COLS) diff_in_time = ('diff', steps.ModifySample(differencing_integrating, **kw)) get_y_step = ('get_y', steps.ModifySample(partial(get_y, SOIL_MOISTURE))) robust = lambda: ('normalize', steps.RobustScaler(with_centering=False)) standard = lambda: ('normalize', steps.StandardScaler(with_mean=False)) minmax = lambda minn, maxx: ('minmax', steps.MinMaxScaler(feature_range=(minn, maxx))) minmax_bounds = [ (0.01, 1.01), (0.05, 1.05), (0.1, 1.1),
def test_sklearn_preproc(scale_encode_cls):
    '''Placeholder test for a scaling/encoding step looked up on ``steps``.

    ``scale_encode_cls`` is the attribute name of the step class in
    ``steps``.  The test is marked as an expected failure up front
    because it does not assert anything yet; the code after
    ``pytest.xfail`` only sketches the intended pipeline.
    '''
    pytest.xfail('This test doesnt test anything yet')
    preproc_cls = getattr(steps, scale_encode_cls)
    # TODO: come up with some initialization kwargs
    init_kwargs = {}
    stages = [steps.Flatten(), preproc_cls(**init_kwargs)]
    p = Pipeline(stages)
    # TODO: run p.fit_transform(**data_source) and assert on
    # the returned (X, y, sample_weight).
''' if isinstance(es, ElmStore): # ElmStore test y = np.mean(es.flat.values, axis=1) else: # numpy array test y = np.mean(es, axis=1) mean_mean = np.mean(y) y2 = y.copy() y2[y > mean_mean] = 1 y2[y < mean_mean] = 0 return (es, y2, sample_weight) # Example 4 step pipeline flat_poly_var_kmeans = [('flat', steps.Flatten()), ( 'poly', steps.PolynomialFeatures(interaction_only=True), ), ('var', steps.VarianceThreshold(threshold=0.00000001)), ('kmeans', KMeans(n_clusters=2))] def test_simple(): p = Pipeline([('a', steps.Flatten())]) # fit_transform should always return (X, y, sample_weight) X, y, sample_weight = p.fit_transform(**data_source) assert isinstance(X, ElmStore) assert hasattr(X, 'flat')
{'search_key': 'long_name', 'search_value': "Band 7 ", 'name': 'band_7'}, {'search_key': 'long_name', 'search_value': "Band 9 ", 'name': 'band_9'}, {'search_key': 'long_name', 'search_value': "Band 10 ", 'name': 'band_10'}, {'search_key': 'long_name', 'search_value': "Band 11 ", 'name': 'band_11'}])) HDF4_FILES = [f for f in glob.glob(os.path.join(ELM_EXAMPLE_DATA_PATH, 'hdf4', '*hdf')) if meta_is_day(load_hdf4_meta(f))] def sampler(fname, **kw): return (load_array(fname, band_specs=band_specs), None, None) data_source = { 'sampler': sampler, 'args_list': HDF4_FILES, } pipeline_steps = [steps.Flatten(), ('scaler', steps.StandardScaler()), ('pca', steps.Transform(IncrementalPCA(n_components=4), partial_fit_batches=2)), ('kmeans', MiniBatchKMeans(n_clusters=4, compute_labels=True)),] pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic, scoring_kwargs=dict(score_weights=[-1])) def ensemble_init_func(pipe, **kw): return [pipe.new_with_params(kmeans__n_clusters=np.random.choice(range(6, 10))) for _ in range(4)] ensemble_kwargs = { 'model_selection': kmeans_model_averaging, 'model_selection_kwargs': { 'drop_n': 2,
from elm.pipeline import Pipeline, steps from elm.readers import * from api_example import data_source ELM_EXAMPLE_DATA_PATH = os.environ['ELM_EXAMPLE_DATA_PATH'] def make_example_y_data(X, y=None, sample_weight=None, **kwargs): fitted = MiniBatchKMeans(n_clusters=5).fit(X.flat.values) y = fitted.predict(X.flat.values) return (X, y, sample_weight) pipeline_steps = [ steps.Flatten(), steps.ModifySample(make_example_y_data), ('top_n', steps.SelectPercentile(percentile=80, score_func=f_classif)), ('kmeans', MiniBatchKMeans(n_clusters=4)) ] pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic) param_grid = { 'kmeans__n_clusters': list(range(5, 10)), 'control': { 'select_method': 'selNSGA2', 'crossover_method': 'cxTwoPoint', 'mutate_method': 'mutUniformInt', 'init_pop': 'random', 'indpb': 0.5, 'mutpb': 0.9, 'cxpb': 0.3,