def test_pipeline_feature_selection():
    '''Check that a VarianceThreshold feature selection step reduces
    the sample to fewer than 40 columns.

    A 'feature_selection' step named 'variance_selection' is appended to
    the pipeline of every "run" action that has a 'train' or 'predict'
    key.  The pipeline of the last action in the run list is then built
    against a ConfigParser made from BASE and fit/transformed five times.
    '''
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for action in config['run']:
            if not ('train' in action or 'predict' in action):
                continue
            if 'pipeline' not in action:
                action['pipeline'] = [{'feature_selection': selection_name}]
                continue
            if not isinstance(action['pipeline'], (list, tuple)):
                # The action names a pipeline: swap the name for
                # the steps registered under it in the config.
                named_pipeline = action['pipeline']
                action['pipeline'] = config['pipelines'][named_pipeline]
            action['pipeline'] += [{'feature_selection': selection_name}]

        config2 = ConfigParser(config=BASE)
        config2.feature_selection[selection_name] = {
            'method': 'VarianceThreshold',
            'score_func': None,
            'threshold': 0.08,
        }
        sample = sampler()
        # "action" is deliberately the last item seen by the loop above.
        steps = pipeline.make_pipeline_steps(config2, action['pipeline'])
        pipe = Pipeline(steps)
        for _attempt in range(5):
            transformed, _, _ = pipe.fit_transform(sample)
            assert transformed.flat.shape[1] < 40
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    '''Fit/transform a random sample through the given pipeline steps.

    Parameters:
        :pipeline: pipeline step specs passed to make_config and
                   make_pipeline_steps
        :add_na_per_band: if nonzero, number of randomly chosen cells
                   per band to overwrite: the first half with a
                   "missing value" marker (99 * band index), the rest
                   with a value inside the "invalid range"
                   (199 * band index)
        :na_fields_as_str: if True, store the missing / invalid / valid
                   attrs as strings (lists are joined with ', ')
        :delim: delimiter used inside the attr names, e.g.
                   'missing_value' when delim is '_'

    Returns:
        (sample, new_es) where sample is the (possibly modified) input
        sample and new_es is the first item of the fit_transform output
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    sample = random_elm_store()
    if add_na_per_band:
        half = add_na_per_band // 2
        missing_slc = slice(None, half)
        invalid_slc = slice(half, add_na_per_band)
        attr_names = tuple(
            template.format(delim)
            for template in ('missing{}value', 'invalid{}range', 'valid{}range')
        )
        missing_key, invalid_key, valid_key = attr_names
        for band_idx, band_name in enumerate(sample.data_vars):
            band_arr = getattr(sample, band_name)
            values = band_arr.values
            # Shuffle the flat indices, then split each one
            # into a (row, col) pair to pick random cells.
            flat_inds = np.arange(values.size)
            np.random.shuffle(flat_inds)
            n_rows = values.shape[0]
            cols = flat_inds // n_rows
            rows = flat_inds % n_rows
            values[rows[missing_slc], cols[missing_slc]] = 99 * band_idx
            band_arr.attrs[missing_key] = 99 * band_idx
            values[rows[invalid_slc], cols[invalid_slc]] = 199 * band_idx
            band_arr.attrs[invalid_key] = [198 * band_idx, 200 * band_idx]
            band_arr.attrs[valid_key] = [-1e12, 1e12]
            if na_fields_as_str:
                for key in attr_names:
                    raw = band_arr.attrs[key]
                    if isinstance(raw, list):
                        as_text = ', '.join(str(item) for item in raw)
                    else:
                        as_text = str(raw)
                    band_arr.attrs[key] = as_text
            # The markers are sentinel numbers, so no NaN is expected yet.
            assert values[np.isnan(values)].size == 0
    config = ConfigParser(config=make_config(pipeline, data_source))
    steps = make_pipeline_steps(config, pipeline)
    pipe = Pipeline(steps)
    new_es = pipe.fit_transform(sample)
    return sample, new_es[0]
def _setup(config=None):
    '''Return the config, a Pipeline and the param_grid mapping.

    Parameters:
        :config: ConfigParser instance or None.  When falsy, a
                 ConfigParser is built from the module's CONFIG_STR.

    Returns:
        (config, pipe, idx_to_param_grid) where pipe is a Pipeline of
        the steps of the first "run" action followed by a
        MiniBatchKMeans estimator named 'kmeans', and
        idx_to_param_grid is the output of ea_setup(config).
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        # yaml.load() without an explicit Loader is deprecated and
        # raises TypeError on PyYAML >= 6; safe_load parses the same
        # plain-YAML config without needing a Loader argument.
        config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
    kmeans_kwargs = config.train['kmeans']['model_init_kwargs']
    estimator = [('kmeans', MiniBatchKMeans(**kmeans_kwargs))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
def config_to_pipeline(config, client=None):
    '''
    Run the elm config's train and predict "run" actions with dask
    client and config's updates based on args passed to elm-main,
    such as --train-only or --predict-only, or edits to ensemble
    settings, such as --ngen 4

    For each "run" step:
      * a step with a "train" key is fit (fit_ea when ea_setup returned
        evolutionary params for the step's index, otherwise
        fit_ensemble) and serialized, unless config.PREDICT_ONLY is set
      * otherwise a step with a "predict" key loads the serialized
        pipeline for that tag, unless config.TRAIN_ONLY is set
      * a step with a "predict" key then calls predict_many, unless
        config.TRAIN_ONLY is set

    Note: the step's data source dict and ensemble dict are updated
    in place (sampler, load_meta, load_array, args_list, client,
    method_kwargs).

    Parameters:
        :config: elm.config.ConfigParser instance
        :client: dask client or None
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    _makedirs(config)
    idx_to_evo_params = ea_setup(config)
    predict_only = getattr(config, 'PREDICT_ONLY', False)
    train_only = getattr(config, 'TRAIN_ONLY', False)
    for idx, step in enumerate(config.run):
        pipeline = step['pipeline']
        if 'train' in step:
            train = config.train[step['train']]
            pipe_steps = make_pipeline_steps(config, pipeline)
            cls = import_callable(train['model_init_class'])
            estimator = cls(**(train.get('model_init_kwargs') or {}))
            pipe_steps.append((step['train'], estimator))
            ensemble_kwargs = train.get('ensemble')
            if isinstance(ensemble_kwargs, str):
                # The ensemble is given by name: look up its settings.
                ensemble_kwargs = config.ensembles[ensemble_kwargs]
            ensemble_kwargs['client'] = client
        data_source = step['data_source']
        if not isinstance(data_source, dict):
            data_source = config.data_sources[data_source]
        data_source['sampler'] = import_callable(data_source['sampler'])
        data_source['load_meta'] = load_meta
        data_source['load_array'] = load_array
        if callable(data_source.get('args_list')):
            # args_list may be a callable producing the arguments;
            # call it with the rest of the data source as keywords.
            kw = {k: v for k, v in data_source.items() if k != 'args_list'}
            data_source['args_list'] = tuple(data_source['args_list'](**kw))
        if 'train' in step and not predict_only:
            s = train.get('model_scoring')
            if s:
                scoring = config.model_scoring[s]
                scoring_kwargs = {k: v for k, v in scoring.items()
                                  if k != 'scoring'}
                scoring = import_callable(scoring['scoring'])
            else:
                scoring = None
                scoring_kwargs = {}
            if 'method_kwargs' in train:
                method_kwargs = train['method_kwargs']
            else:
                method_kwargs = {}
            if 'classes' in train:
                method_kwargs['classes'] = train['classes']
            ensemble_kwargs['method_kwargs'] = method_kwargs
            pipe = Pipeline(pipe_steps,
                            scoring=scoring,
                            scoring_kwargs=scoring_kwargs)
            evo_params = idx_to_evo_params.get(idx, None)
            # data_source / ensemble_kwargs are applied last so they
            # take precedence over an 'evo_params' key, as before.
            kw = dict(evo_params=evo_params) if evo_params else {}
            kw.update(data_source)
            kw.update(ensemble_kwargs)
            if evo_params:
                pipe.fit_ea(**kw)
            else:
                pipe.fit_ensemble(**kw)
            serialize_pipe(pipe, config.ELM_TRAIN_PATH, step['train'])
        elif 'predict' in step and not train_only:
            pipe = load_pipe_from_tag(config.ELM_TRAIN_PATH, step['predict'])
        else:
            logger.info(
                'Do nothing for {} (has no "train" or "predict" key)'.format(
                    step))
        # Skip prediction under TRAIN_ONLY: for a predict-only step no
        # pipeline was loaded above, so "pipe" would be unbound or left
        # over from an earlier step.
        if 'predict' in step and not train_only:
            # serialize is called with (prediction, sample, tag)
            serialize = partial(serialize_prediction, config)
            pipe.predict_many(serialize=serialize, **data_source)