def test_partial_fit_transform():
    '''Check steps.Transform.fit_transform when partial_fit_batches is given.

    First runs a batched fit_transform on the module-level X and checks
    the outputs with _run_assertions, then confirms a TypeError is raised
    when partial_fit has been replaced by a non-callable.
    '''
    batched = steps.Transform(IncrementalPCA(n_components=3),
                              partial_fit_batches=3)
    outputs = batched.fit_transform(X)
    transformed, y_out, weights = outputs
    _run_assertions(transformed, y_out, weights)

    broken = steps.Transform(IncrementalPCA(n_components=3),
                             partial_fit_batches=3)
    with pytest.raises(TypeError):
        # fit_transform will try to call partial_fit, which is now None,
        # and that call is what produces the TypeError
        broken.partial_fit = None
        broken.fit_transform(X)
def test_fit():
    '''Check that steps.Transform.fit returns a usable, fitted Transform.

    The returned object must be a steps.Transform whose _estimator is an
    IncrementalPCA, and its transform output on X must pass
    _run_assertions.
    '''
    pca = IncrementalPCA(n_components=3)
    fitted = steps.Transform(pca, partial_fit_batches=2).fit(X)

    assert isinstance(fitted, steps.Transform)
    assert isinstance(fitted._estimator, IncrementalPCA)

    outputs = fitted.transform(X)
    transformed, y_out, weights = outputs
    _run_assertions(transformed, y_out, weights)
def test_pipeline_new_with_params():
    '''Check Pipeline.new_with_params returns an unfitted, re-parameterized copy.

    Fits a 4-cluster pipeline, derives a 7-cluster / 2-component pipeline
    from it, verifies the derived pipeline raises NotFittedError on
    predict until it is fit, and that after fitting its final step has
    7 cluster centers.
    '''
    pipeline_steps = [
        steps.SelectCanvas('band_1'),
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA(n_components=3))),
        ('kmeans', KMeans(n_clusters=4)),
    ]
    original = Pipeline(pipeline_steps)
    original.fit(random_elm_store())
    original.predict(random_elm_store())
    fitted_kmeans = original.steps[-1][-1]
    assert fitted_kmeans.cluster_centers_.shape[0] == 4

    derived = original.new_with_params(kmeans__n_clusters=7,
                                       pca__n_components=2)
    # The derived pipeline must not inherit the fitted state
    with pytest.raises(NotFittedError):
        derived.predict(random_elm_store())

    derived.fit(random_elm_store())
    refitted_kmeans = derived.steps[-1][-1]
    assert refitted_kmeans.cluster_centers_.shape[0] == 7
def test_kmeans_model_selection(client=None):
    '''Fit an ensemble of PCA + MiniBatchKMeans pipelines with model selection.

    Builds 100 randomly parameterized copies of a base pipeline, runs
    fit_ensemble with kmeans_model_averaging as the model selection
    function, then checks the saved ensemble size and the number of
    predictions returned by predict_many.
    '''
    pipeline_steps = [
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA())),
        ('kmeans', MiniBatchKMeans(n_clusters=5)),
    ]
    pipe = Pipeline(pipeline_steps,
                    scoring=kmeans_aic,
                    scoring_kwargs={'score_weights': [-1]})
    n_clusters_choices = list(range(3, 10))

    def samp(*args, **kwargs):
        # Arguments are ignored; every call returns a fresh random sample
        return random_elm_store(bands=12, mn=0, mx=1, height=20, width=40)

    def init(pipe, **kwargs):
        def _random_variant():
            # Draw n_components before n_clusters to keep the same
            # sequence of random draws
            n_components = np.random.choice(np.arange(2, 6))
            n_clusters = np.random.choice(n_clusters_choices)
            variant = copy.deepcopy(pipe)
            variant.set_params(kmeans__n_clusters=n_clusters,
                               pca__n_components=n_components)
            return variant
        return [_random_variant() for _ in range(100)]

    ensemble_kwargs = ENSEMBLE_KWARGS.copy()
    ensemble_kwargs.update(
        ngen=20,
        model_scoring=kmeans_aic,
        ensemble_init_func=init,
        model_selection_kwargs=dict(drop_n=30,
                                    evolve_n=30,
                                    choices=n_clusters_choices),
        model_selection=kmeans_model_averaging,
    )
    sampler_kwargs = SAMPLER_DATA_SOURCE.copy()
    sampler_kwargs['sampler'] = samp
    ensemble_kwargs.update(sampler_kwargs)

    fitted = pipe.fit_ensemble(**ensemble_kwargs)
    assert len(fitted.ensemble) == ensemble_kwargs['saved_ensemble_size']

    preds = fitted.predict_many(**sampler_kwargs)
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(preds) == len(fitted.ensemble) * n_samples
def test_fit_transform():
    '''Check steps.Transform.fit_transform without partial_fit batching.'''
    transformer = steps.Transform(IncrementalPCA(n_components=3))
    outputs = transformer.fit_transform(X)
    transformed, y_out, weights = outputs
    _run_assertions(transformed, y_out, weights)
(0.1, 1.1), (0.2, 1.2), (1, 2), ] weights = ('weights', steps.ModifySample(add_sample_weight)) log = ('log', steps.ModifySample(log_scaler)) preamble = lambda: [ diff_in_time, flat_step, drop_na_step, get_y_step, weights, ] linear = lambda: ('estimator', LinearRegression(n_jobs=-1)) pca = lambda: ('pca', steps.Transform(PCA())) n_components = [None, 4, 6, 8, 10] scalers = zip(('MinMaxScaler', 'RobustScaler', 'StandardScaler', 'None'), (minmax, robust, standard, None)) estimators = zip(('LinearRegression', ), (linear, )) def main(): ''' Beginning on START_DATE, step forward hourly, training on last hour's NLDAS FORA dataset with transformers in a 2-layer hierarchical ensemble, training on the last hour of data and making out-of-training-sample predictions for the current hour. Makes a dill dump file for each hour run. Runs fro NSTEPS hour steps. ''' date = START_DATE
{'search_key': 'long_name', 'search_value': "Band 10 ", 'name': 'band_10'}, {'search_key': 'long_name', 'search_value': "Band 11 ", 'name': 'band_11'}])) HDF4_FILES = [f for f in glob.glob(os.path.join(ELM_EXAMPLE_DATA_PATH, 'hdf4', '*hdf')) if meta_is_day(load_hdf4_meta(f))] def sampler(fname, **kw): return (load_array(fname, band_specs=band_specs), None, None) data_source = { 'sampler': sampler, 'args_list': HDF4_FILES, } pipeline_steps = [steps.Flatten(), ('scaler', steps.StandardScaler()), ('pca', steps.Transform(IncrementalPCA(n_components=4), partial_fit_batches=2)), ('kmeans', MiniBatchKMeans(n_clusters=4, compute_labels=True)),] pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic, scoring_kwargs=dict(score_weights=[-1])) def ensemble_init_func(pipe, **kw): return [pipe.new_with_params(kmeans__n_clusters=np.random.choice(range(6, 10))) for _ in range(4)] ensemble_kwargs = { 'model_selection': kmeans_model_averaging, 'model_selection_kwargs': { 'drop_n': 2, 'evolve_n': 2, },
def make_pipeline_steps(config, pipeline):
    '''Turn the config's "pipeline" into a list of steps to pass to elm.pipeline.Pipeline

    Params:
        :config: validated config from elm.config.ConfigParser
        :pipeline: the "pipeline" list; each item is either a dictionary
                   describing one step (keyed by "feature_selection",
                   "transform", "sklearn_preprocessing" or one of
                   CHANGE_COORDS_ACTIONS) or an already constructed step

    Returns:
        :actions: list of (step_name, step) tuples, in pipeline order

    Raises:
        NotImplementedError: a dictionary action has no recognized key, or
                             no class in ``steps`` handles its key

    This is used by :func:``elm.pipeline.parse_run_config``
    '''
    actions = []
    for action_idx, action in enumerate(pipeline):
        if not isinstance(action, dict):
            # Already-built step.  Keep an existing (name, step) pair as is,
            # otherwise name the step by its position so that step_name is
            # always defined and never reused from a previous iteration.
            is_named = (isinstance(action, (tuple, list)) and
                        len(action) == 2 and
                        isinstance(action[0], str))
            if is_named:
                step_name, step_cls = action
            else:
                step_name = 'step_{}'.format(action_idx)
                step_cls = action
        elif 'feature_selection' in action:
            step_name = action['feature_selection']
            # Deep copy because 'func' is replaced in place below
            _feature_selection = copy.deepcopy(
                config.feature_selection[step_name])
            method = _feature_selection['method']
            # Resolve the method up front; the return value is not needed
            import_callable(getattr(skfeat, method, method))
            if 'func_kwargs' in _feature_selection:
                # NOTE(review): func_kwargs are dropped rather than bound to
                # func (the sklearn_preprocessing branch drops them too) -
                # confirm whether functools.partial should apply them.
                _feature_selection['func'] = import_callable(
                    _feature_selection['func'])
            kw = {k: v for k, v in _feature_selection.items()
                  if k not in ('func_kwargs', 'method')}
            step_cls = SKLEARN_PREPROCESSING[method](**kw)
        elif 'transform' in action:
            step_name = action['transform']
            trans = config.transform[step_name]
            cls = import_callable(trans['model_init_class'])
            kw = trans.get('model_init_kwargs') or {}
            # partial_fit_batches configures steps.Transform, not the model
            init_kw = {k: v for k, v in kw.items()
                       if k != 'partial_fit_batches'}
            pfb = trans.get('partial_fit_batches',
                            kw.get('partial_fit_batches'))
            step_cls = steps.Transform(cls(**init_kw),
                                       partial_fit_batches=pfb)
        elif 'sklearn_preprocessing' in action:
            step_name = action['sklearn_preprocessing']
            _sklearn_preprocessing = config.sklearn_preprocessing[step_name]
            method = _sklearn_preprocessing['method']
            kw = {k: v for k, v in _sklearn_preprocessing.items()
                  if k not in ('method', 'func_kwargs')}
            if 'func' in _sklearn_preprocessing:
                kw['func'] = import_callable(_sklearn_preprocessing['func'])
            step_cls = SKLEARN_PREPROCESSING[method](**kw)
        elif any(k in CHANGE_COORDS_ACTIONS for k in action):
            _sp_step = [k for k in action if k in CHANGE_COORDS_ACTIONS][0]
            step_name = _sp_step
            # Find the class in ``steps`` that declares this _sp_step
            for att in dir(steps):
                candidate = getattr(steps, att)
                if (isinstance(candidate, type) and
                        getattr(candidate, '_sp_step', None) == _sp_step):
                    step_cls = candidate.from_config_dict(**action)
                    break
            else:
                # No class handles this key: fail loudly rather than append
                # an unbound or stale step_cls
                raise NotImplementedError(
                    'pipeline action {} not recognized.'.format(action))
        else:
            # add items to actions of the form:
            # (
            #   module_colon_func_name_as_string,        # string
            #   args_to_func,                            # tuple
            #   kwargs_to_func                           # dict
            # )
            # NOTE also add the key name, like 'transform' to the top of
            # elm.config.load_config global variable:
            # "SAMPLE_PIPELINE_ACTIONS"
            raise NotImplementedError(
                'pipeline action {} not recognized.'.format(action))
        actions.append((step_name, step_cls))
    return actions