Example #1
def test_pipeline_feature_selection():
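    # Append a 'variance_selection' feature_selection step to each train/predict
    # action's pipeline, then check that fit_transform drops low-variance columns.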
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for idx, action in enumerate(config['run']):
            if 'train' in action or 'predict' in action:
                train_name = action.get('train', action.get('predict'))
                if 'pipeline' in action:
                    if not isinstance(action['pipeline'], (list, tuple)):
                        action['pipeline'] = config['pipelines'][
                            action['pipeline']]
                    action['pipeline'] += [{
                        'feature_selection': selection_name
                    }]
                else:
                    action['pipeline'] = [{
                        'feature_selection': selection_name
                    }]

                config2 = ConfigParser(config=BASE)
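                # Register the named selector: a VarianceThreshold feature
                # selector with a 0.08 variance cutoff.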
                config2.feature_selection[selection_name] = {
                    'method': 'VarianceThreshold',
                    'score_func': None,
                    'threshold': 0.08,
                }
                X = sampler()
                steps = pipeline.make_pipeline_steps(config2,
                                                     action['pipeline'])
                pipe = Pipeline(steps)
                transform_models = None
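                # Repeated fit_transform calls should each leave fewer than 40
                # flattened feature columns after selection.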
                for repeats in range(5):
                    XX, _, _ = pipe.fit_transform(X)
                    assert XX.flat.shape[1] < 40
Example #2
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    sample = random_elm_store()
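    # Optionally inject sentinel "no data" values into each band: half of the
    # chosen points get the band's missing_value, the other half a value inside
    # its invalid_range, and the matching attrs are written (as strings when
    # na_fields_as_str is True).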
    if add_na_per_band:
        for idx, band in enumerate(sample.data_vars):
            band_arr = getattr(sample, band)
            val = band_arr.values
            inds = np.arange(val.size)
            np.random.shuffle(inds)
            x = inds // val.shape[0]
            y = inds % val.shape[0]
            slc = slice(None, add_na_per_band // 2)
            val[y[slc], x[slc]] = 99 * idx
            band_arr.attrs['missing{}value'.format(delim)] = 99 * idx
            slc = slice(add_na_per_band // 2, add_na_per_band)
            val[y[slc], x[slc]] = 199 * idx
            band_arr.attrs['invalid{}range'.format(delim)] = [198 * idx, 200 * idx]
            band_arr.attrs['valid{}range'.format(delim)] = [-1e12, 1e12]
            if na_fields_as_str:
                for field in ('missing{}value', 'invalid{}range', 'valid{}range'):
                    field = field.format(delim)
                    v = band_arr.attrs[field]
                    if isinstance(v, list):
                        band_arr.attrs[field] = ', '.join(map(str, v))
                    else:
                        band_arr.attrs[field] = str(v)
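            # The injected sentinels are finite numbers, so the raw array
            # should still contain no NaN.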
            assert val[np.isnan(val)].size == 0
    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    new_es = pipe.fit_transform(sample)
    return sample, new_es[0]
Example #3
def _setup(config=None):
    '''Return the parsed config, a Pipeline built from its first run step,
    and the idx-to-param_grid mapping from ea_setup'''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
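    # Append a named MiniBatchKMeans step, initialized from
    # config.train['kmeans'], as the final estimator of the Pipeline.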
    estimator = [('kmeans',
                  MiniBatchKMeans(**config.train['kmeans']['model_init_kwargs']))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
Example #4
def config_to_pipeline(config, client=None):
    '''
    Run the train and predict actions in the elm config's "run" list
    with the given dask client, honoring any config updates implied by
    arguments passed to elm-main, such as --train-only or --predict-only,
    or edits to ensemble settings such as --ngen 4

    Parameters:
        :config: elm.config.ConfigParser instance
        :client: dask client or None
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    _makedirs(config)
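    # ea_setup maps each index in config.run to that step's
    # evolutionary-algorithm parameters, if the step defines any.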
    idx_to_evo_params = ea_setup(config)
    for idx, step in enumerate(config.run):
        pipeline = step['pipeline']
        if 'train' in step:
            train = config.train[step['train']]
            pipe_steps = make_pipeline_steps(config, pipeline)
            cls = import_callable(train['model_init_class'])
            estimator = cls(**(train.get('model_init_kwargs') or {}))
            pipe_steps.append((step['train'], estimator))
            ensemble_kwargs = train.get('ensemble')
            if isinstance(ensemble_kwargs, str):
                ensemble_kwargs = config.ensembles[ensemble_kwargs]
            ensemble_kwargs['client'] = client
        data_source = step['data_source']
        if not isinstance(data_source, dict):
            data_source = config.data_sources[data_source]
        data_source['sampler'] = import_callable(data_source['sampler'])
        data_source['load_meta'] = load_meta
        data_source['load_array'] = load_array
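        # If args_list is a callable, call it with the rest of the data_source
        # spec to get the concrete list of sampler arguments.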
        if callable(data_source.get('args_list')):
            kw = {k: v for k, v in data_source.items() if k != 'args_list'}
            data_source['args_list'] = tuple(data_source['args_list'](**kw))
        if 'train' in step and not getattr(config, 'PREDICT_ONLY', False):
            s = train.get('model_scoring')
            if s:
                scoring = config.model_scoring[s]
                scoring_kwargs = {
                    k: v
                    for k, v in scoring.items() if k != 'scoring'
                }
                scoring = import_callable(scoring['scoring'])
            else:
                scoring = None
                scoring_kwargs = {}
            if 'method_kwargs' in train:
                method_kwargs = train['method_kwargs']
            else:
                method_kwargs = {}
            if 'classes' in train:
                method_kwargs['classes'] = train['classes']
            ensemble_kwargs['method_kwargs'] = method_kwargs
            pipe = Pipeline(pipe_steps,
                            scoring=scoring,
                            scoring_kwargs=scoring_kwargs)
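            # Fit with the evolutionary algorithm when this run step has evo
            # params, otherwise with a plain ensemble.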
            evo_params = idx_to_evo_params.get(idx, None)
            if evo_params:
                kw = dict(evo_params=evo_params)
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ea(**kw)
            else:
                kw = {}
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ensemble(**kw)

            serialize_pipe(pipe, config.ELM_TRAIN_PATH, step['train'])
        elif 'predict' in step and not getattr(config, 'TRAIN_ONLY', False):
            pipe = load_pipe_from_tag(config.ELM_TRAIN_PATH, step['predict'])

        else:
            logger.info(
                'Do nothing for {} (has no "train" or "predict" key)'.format(
                    step))
        if 'predict' in step:
            # serialize is called with (prediction, sample, tag)
            serialize = partial(serialize_prediction, config)
            pipe.predict_many(serialize=serialize, **data_source)
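
Usage sketch (not from the elm source; a hypothetical illustration only): it assumes CONFIG_STR is the YAML config string referenced in Example #3, that elm.config exposes ConfigParser as the docstring above indicates, and that dask.distributed is installed; pass client=None to run without a dask cluster.

import yaml
from distributed import Client
from elm.config import ConfigParser

config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
client = Client(processes=False)  # local cluster using threads instead of processes
config_to_pipeline(config, client=client)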