示例#1
0
def test_kmeans_simple_X(client=None):
    '''Fit a Flatten + MiniBatchKMeans ensemble from ``X`` and predict.

    The module-level ``X`` is passed both to ``fit_ensemble`` and to
    ``predict_many``; the test expects one prediction per ensemble member.
    ``client`` is accepted but not used in this body.
    '''
    flatten_then_cluster = [steps.Flatten(), MiniBatchKMeans(n_clusters=6)]
    fitted = Pipeline(flatten_then_cluster).fit_ensemble(X=X,
                                                         **ENSEMBLE_KWARGS)
    expected_size = ENSEMBLE_KWARGS['saved_ensemble_size']
    _train_asserts(fitted, expected_size)
    predictions = fitted.predict_many(X=X)
    # A single X sample means exactly one prediction for each saved model.
    assert len(predictions) == len(fitted.ensemble)
示例#2
0
def test_simple():
    '''A Flatten-only Pipeline should give back (X, None, None).

    Checks that ``fit_transform`` returns a 3-tuple whose first item is an
    ElmStore exposing a ``flat`` attribute, and that ``y`` and
    ``sample_weight`` stay ``None`` when no step produces them.
    '''
    flatten_only = Pipeline([('a', steps.Flatten())])
    # fit_transform should always return (X, y, sample_weight)
    result = flatten_only.fit_transform(**data_source)
    X, y, sample_weight = result
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
    assert y is None
    assert sample_weight is None
示例#3
0
def test_feature_selection(feat_cls):
    '''Placeholder test for the feature-selection steps named by ``feat_cls``.

    ``pytest.xfail`` is called first, so the pipeline code below it is
    never reached; it only sketches the intended test.
    ``feat_cls`` is the name of an attribute looked up on ``steps``
    (presumably supplied via parametrization - confirm against the caller).
    '''
    pytest.xfail('This test doesnt test anything yet')
    init_kwargs = {}  # come up with some initialization kwargs
    selector_cls = getattr(steps, feat_cls)
    pipeline_steps = [
        steps.Flatten(),
        steps.ModifySample(get_y),
        selector_cls(**init_kwargs),
    ]
    p = Pipeline(pipeline_steps)
    X, y, sample_weight = p.fit_transform(**data_source)
示例#4
0
def test_kmeans_simple_sampler(client=None):
    '''Fit a Flatten + MiniBatchKMeans ensemble from a sampler and predict.

    The fit arguments are SAMPLER_DATA_SOURCE overlaid with ENSEMBLE_KWARGS.
    ``predict_many`` is expected to yield one prediction per
    (args_list entry, ensemble member) pair.
    ``client`` is accepted but not used in this body.
    '''
    fit_kwargs = SAMPLER_DATA_SOURCE.copy()
    fit_kwargs.update(ENSEMBLE_KWARGS)
    pipe = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=6)])
    fitted = pipe.fit_ensemble(**fit_kwargs)
    ens = fitted.ensemble
    _train_asserts(fitted, ENSEMBLE_KWARGS['saved_ensemble_size'])
    pred = fitted.predict_many(**SAMPLER_DATA_SOURCE)
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(pred) == n_samples * len(ens)
示例#5
0
def test_supervised_feat_select_X_y(client=None):
    '''Supervised feature selection + SGDClassifier fit from X_Y_DATA_SOURCE.

    The pipeline is Flatten -> SelectPercentile(f_classif, 50%) ->
    SGDClassifier; the data comes from X_Y_DATA_SOURCE rather than from a
    ModifySample step.  ``classes`` is forwarded through ``method_kwargs``.
    ``client`` is accepted but not used in this body.
    '''
    selector = steps.SelectPercentile(score_func=f_classif, percentile=50)
    pipe = Pipeline([steps.Flatten(), selector, SGDClassifier()])
    en = dict(method_kwargs={'classes': [0, 1, 2]}, **ENSEMBLE_KWARGS)
    en.update(X_Y_DATA_SOURCE)
    fitted = pipe.fit_ensemble(**en)
    _train_asserts(fitted, en['saved_ensemble_size'])
    predictions = fitted.predict_many(**X_Y_DATA_SOURCE)
    assert len(fitted.ensemble) == len(predictions)
示例#6
0
def test_simple():
    '''Fit a two-member KMeans ensemble from a sampler and predict with both.

    Each fitted member is then asked to predict on a fresh sample built by
    ``example_sampler(100, 400, 5)``; both predictions must have shape
    ``(400 * 100,)``.
    '''
    args_list = [(100, 200, 5)] * 10 # (height, width, bands)
    data_source = {'sampler': example_sampler, 'args_list': args_list}
    ensemble_kw = {'ngen': 2, 'init_ensemble_size': 2}
    ensemble_kw.update(data_source)
    p = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=5)])
    fitted = p.fit_ensemble(**ensemble_kw)
    # ensemble size of 2 here: each entry unpacks as a (tag, model) pair
    (tag1, model1), (tag2, model2) = fitted.ensemble
    X = example_sampler(100, 400, 5)
    pred1 = model1.predict(X)
    pred2 = model2.predict(X)
    expected_shape = (400 * 100,)
    assert pred1.shape == pred2.shape == expected_shape
示例#7
0
def test_pipeline_new_with_params():
    '''``new_with_params`` should give an unfitted Pipeline with new params.

    After fitting the original (4 clusters), a copy made with
    ``kmeans__n_clusters=7`` must raise NotFittedError on predict until it
    is fitted itself, and must then expose 7 cluster centers.
    '''
    step_list = [
        steps.SelectCanvas('band_1'),
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA(n_components=3))),
        ('kmeans', KMeans(n_clusters=4)),
    ]
    p = Pipeline(step_list)
    p.fit(random_elm_store())
    p.predict(random_elm_store())
    final_estimator = p.steps[-1][-1]
    assert final_estimator.cluster_centers_.shape[0] == 4

    p2 = p.new_with_params(kmeans__n_clusters=7, pca__n_components=2)
    # The copy has not been fitted, so predicting must fail first.
    with pytest.raises(NotFittedError):
        p2.predict(random_elm_store())
    p2.fit(random_elm_store())
    refit_estimator = p2.steps[-1][-1]
    assert refit_estimator.cluster_centers_.shape[0] == 7
示例#8
0
def test_modify_sample():
    '''steps.ModifySample wraps an arbitrary function as a Pipeline step.

    The wrapped function is expected to have the signature:

    func(X, y=None, sample_weight=None, **kwargs)

    and to return the tuple:

    (X, y, sample_weight)

    Here ``get_y`` is wrapped, so after ``fit_transform`` the returned
    ``y`` must be a NumPy array and ``X`` must not be None.
    '''
    modify_step = steps.ModifySample(get_y)
    p = Pipeline([steps.Flatten(), modify_step])
    transformed = p.fit_transform(**data_source)
    X, y, sample_weight = transformed
    assert X is not None
    assert isinstance(y, np.ndarray)
示例#9
0
def test_kmeans_model_selection(client=None):
    '''Run fit_ensemble with a custom init function and model selection.

    The initial ensemble is 100 deep copies of a Flatten -> PCA ->
    MiniBatchKMeans pipeline with randomly drawn ``n_components`` and
    ``n_clusters``.  Models are scored with ``kmeans_aic`` and selected by
    ``kmeans_model_averaging`` with ``ngen=20``.  The test checks the final
    ensemble size and the number of predictions.
    ``client`` is accepted but not used in this body.
    '''
    n_clusters_choices = list(range(3, 10))

    def random_sample(*args, **kwargs):
        # Ignores its arguments; every call builds a new random ElmStore.
        return random_elm_store(bands=12, mn=0, mx=1, height=20, width=40)

    def build_initial_ensemble(pipe, **kwargs):
        # 100 independent copies, each with its own random hyperparameters.
        members = []
        for _ in range(100):
            n_components = np.random.choice(np.arange(2, 6))
            n_clusters = np.random.choice(n_clusters_choices)
            member = copy.deepcopy(pipe)
            member.set_params(kmeans__n_clusters=n_clusters,
                              pca__n_components=n_components)
            members.append(member)
        return members

    pipe = Pipeline(
        [steps.Flatten(),
         ('pca', steps.Transform(IncrementalPCA())),
         ('kmeans', MiniBatchKMeans(n_clusters=5))],
        scoring=kmeans_aic,
        scoring_kwargs={'score_weights': [-1]})

    en = ENSEMBLE_KWARGS.copy()
    en['ngen'] = 20
    en['model_scoring'] = kmeans_aic
    en['ensemble_init_func'] = build_initial_ensemble
    en['model_selection_kwargs'] = {'drop_n': 30,
                                    'evolve_n': 30,
                                    'choices': n_clusters_choices}
    en['model_selection'] = kmeans_model_averaging

    # Same data source as the other sampler tests, but with the local sampler.
    sa = SAMPLER_DATA_SOURCE.copy()
    sa['sampler'] = random_sample
    en.update(sa)

    fitted = pipe.fit_ensemble(**en)
    assert len(fitted.ensemble) == en['saved_ensemble_size']
    preds = fitted.predict_many(**sa)
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(preds) == len(fitted.ensemble) * n_samples
            continue
        X.flat.values[:, j] = np.log10(X.flat.values[:, j])
    return X, y, sample_weight


def add_sample_weight(X, y=None, sample_weight=None, **kw):
    '''Compute a per-sample weight from ``y`` and pass ``X`` and ``y`` through.

    The weight is the absolute deviation of each ``y`` value from the mean
    of ``y``, divided by the standard deviation of ``y``.  Any incoming
    ``sample_weight`` is discarded and replaced, and extra keyword
    arguments are ignored.  Modify this function if a different weighting
    is needed.

    ``y`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array); the
    default of ``None`` is not usable here.

    Returns the tuple ``(X, y, sample_weight)``.
    '''
    deviation = y - y.mean()
    spread = y.std()
    sample_weight = np.abs(deviation / spread)
    return X, y, sample_weight


# Scoring keyword argument built from r_squared_mse
# (presumably passed on to Pipeline(...) - confirm against the caller).
pipeline_kw = {'scoring': make_scorer(r_squared_mse)}

# Named (label, step) tuples for assembling pipelines.
flat_step = ('flatten', steps.Flatten())
drop_na_step = ('drop_null', steps.DropNaRows())

# Keyword arguments forwarded to differencing_integrating by ModifySample.
kw = {
    'X_time_steps': X_TIME_STEPS,
    'X_time_averaging': X_TIME_AVERAGING,
    'difference_cols': DIFFERENCE_COLS,
}

diff_in_time = ('diff', steps.ModifySample(differencing_integrating, **kw))
get_y_step = ('get_y', steps.ModifySample(partial(get_y, SOIL_MOISTURE)))

# Factories rather than tuples: each call builds a new scaler instance.
robust = lambda: ('normalize',
                  steps.RobustScaler(with_centering=False))
standard = lambda: ('normalize',
                    steps.StandardScaler(with_mean=False))
minmax = lambda minn, maxx: (
    'minmax', steps.MinMaxScaler(feature_range=(minn, maxx)))
minmax_bounds = [
    (0.01, 1.01),
    (0.05, 1.05),
    (0.1, 1.1),
示例#11
0
def test_sklearn_preproc(scale_encode_cls):
    '''Placeholder test for the preprocessing step named by ``scale_encode_cls``.

    ``pytest.xfail`` is called first, so the pipeline code below it is
    never reached; it only sketches the intended test.
    ``scale_encode_cls`` is the name of an attribute looked up on ``steps``
    (presumably supplied via parametrization - confirm against the caller).
    '''
    pytest.xfail('This test doesnt test anything yet')
    init_kwargs = {}  # come up with some initialization kwargs
    preproc_cls = getattr(steps, scale_encode_cls)
    pipeline_steps = [steps.Flatten(), preproc_cls(**init_kwargs)]
    p = Pipeline(pipeline_steps)
    X, y, sample_weight = p.fit_transform(**data_source)
示例#12
0
    '''
    if isinstance(es, ElmStore):
        # ElmStore test
        y = np.mean(es.flat.values, axis=1)
    else:
        # numpy array test
        y = np.mean(es, axis=1)
    mean_mean = np.mean(y)
    y2 = y.copy()
    y2[y > mean_mean] = 1
    y2[y < mean_mean] = 0
    return (es, y2, sample_weight)


# Example 4 step pipeline:
# flatten -> interaction-only polynomial features -> drop features whose
# variance is at or below the threshold -> 2-cluster KMeans
flat_poly_var_kmeans = [
    ('flat', steps.Flatten()),
    ('poly', steps.PolynomialFeatures(interaction_only=True)),
    ('var', steps.VarianceThreshold(threshold=0.00000001)),
    ('kmeans', KMeans(n_clusters=2)),
]


def test_simple():
    '''A Flatten-only Pipeline should return an ElmStore with a ``flat`` attribute.

    ``fit_transform`` is unpacked as the 3-tuple (X, y, sample_weight).
    '''

    p = Pipeline([('a', steps.Flatten())])
    # fit_transform should always return (X, y, sample_weight)
    X, y, sample_weight = p.fit_transform(**data_source)
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
示例#13
0
         {'search_key': 'long_name', 'search_value': "Band 7 ", 'name': 'band_7'},
         {'search_key': 'long_name', 'search_value': "Band 9 ", 'name': 'band_9'},
         {'search_key': 'long_name', 'search_value': "Band 10 ", 'name': 'band_10'},
         {'search_key': 'long_name', 'search_value': "Band 11 ", 'name': 'band_11'}]))
# Every '*hdf' file under <ELM_EXAMPLE_DATA_PATH>/hdf4 whose loaded
# metadata passes meta_is_day.
HDF4_FILES = [
    path
    for path in glob.glob(os.path.join(ELM_EXAMPLE_DATA_PATH, 'hdf4', '*hdf'))
    if meta_is_day(load_hdf4_meta(path))
]

def sampler(fname, **kw):
    '''Load ``fname`` with the module-level ``band_specs``.

    Returns the tuple ``(X, y, sample_weight)`` where ``X`` is whatever
    ``load_array`` returns and both ``y`` and ``sample_weight`` are None.
    Extra keyword arguments are accepted and ignored.
    '''
    loaded = load_array(fname, band_specs=band_specs)
    return loaded, None, None

# The sampler is called once per entry of args_list (one HDF4 file each)
# - presumably by fit_ensemble / predict_many; confirm against the caller.
data_source = dict(sampler=sampler, args_list=HDF4_FILES)

# flatten -> standardize -> incremental PCA to 4 components -> KMeans
pipeline_steps = [
    steps.Flatten(),
    ('scaler', steps.StandardScaler()),
    ('pca', steps.Transform(IncrementalPCA(n_components=4),
                            partial_fit_batches=2)),
    ('kmeans', MiniBatchKMeans(n_clusters=4, compute_labels=True)),
]
pipeline = Pipeline(pipeline_steps,
                    scoring=kmeans_aic,
                    scoring_kwargs={'score_weights': [-1]})

def ensemble_init_func(pipe, **kw):
    '''Build an initial ensemble of four variants of ``pipe``.

    Each variant comes from ``pipe.new_with_params`` with
    ``kmeans__n_clusters`` drawn at random from 6 through 9.  Extra keyword
    arguments are accepted and ignored.

    Returns a list of four pipelines.
    '''
    members = []
    for _ in range(4):
        n_clusters = np.random.choice(range(6, 10))
        members.append(pipe.new_with_params(kmeans__n_clusters=n_clusters))
    return members

ensemble_kwargs = {
    'model_selection': kmeans_model_averaging,
    'model_selection_kwargs': {
        'drop_n': 2,
示例#14
0
from elm.pipeline import Pipeline, steps
from elm.readers import *

from api_example import data_source

# Read from the environment at import time; raises KeyError if the
# ELM_EXAMPLE_DATA_PATH variable is not set.
ELM_EXAMPLE_DATA_PATH = os.environ['ELM_EXAMPLE_DATA_PATH']


def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    '''Create example ``y`` labels by clustering ``X.flat.values``.

    A 5-cluster MiniBatchKMeans is fitted on ``X.flat.values`` and its
    predictions on the same values become ``y``.  Any incoming ``y`` is
    replaced; ``X`` and ``sample_weight`` pass through unchanged, and extra
    keyword arguments are ignored.

    Returns the tuple ``(X, y, sample_weight)``.
    '''
    clusterer = MiniBatchKMeans(n_clusters=5)
    fitted = clusterer.fit(X.flat.values)
    labels = fitted.predict(X.flat.values)
    return (X, labels, sample_weight)


# flatten -> make example y labels -> keep the top 80 percent of features
# by f_classif score -> 4-cluster KMeans
flatten_step = steps.Flatten()
make_y_step = steps.ModifySample(make_example_y_data)
top_n_step = ('top_n',
              steps.SelectPercentile(percentile=80, score_func=f_classif))
kmeans_step = ('kmeans', MiniBatchKMeans(n_clusters=4))
pipeline_steps = [flatten_step, make_y_step, top_n_step, kmeans_step]
pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic)
param_grid = {
    'kmeans__n_clusters': list(range(5, 10)),
    'control': {
        'select_method': 'selNSGA2',
        'crossover_method': 'cxTwoPoint',
        'mutate_method': 'mutUniformInt',
        'init_pop': 'random',
        'indpb': 0.5,
        'mutpb': 0.9,
        'cxpb': 0.3,