def _run_model_selection(models, model_selection, model_selection_kwargs,
                         ngen, generation, scoring_kwargs):
    '''Run one round of model selection on the ensemble members.

    "ngen" and "generation" are added to a copy of model_selection_kwargs,
    the fitness sorting function is resolved, and everything is forwarded
    to base_selection.  The result is passed through
    _validate_ensemble_members before being returned.

    Parameters:
        :models: list of (tag, model) tuples
        :model_selection: callable forwarded to base_selection
        :model_selection_kwargs: dict of extra keywords for base_selection
                                 (None is treated as empty; not modified)
        :ngen: total number of generations
        :generation: index of the current generation
        :scoring_kwargs: dict or None; its "score_weights" and
                         "sort_fitness" take precedence over the same
                         keys of model_selection_kwargs

    Returns:
        list of (tag, model) tuples'''
    # Copy so the caller's dict does not accumulate ngen / generation.
    selection_kwargs = dict(model_selection_kwargs or {})
    selection_kwargs['ngen'] = ngen
    selection_kwargs['generation'] = generation
    scoring_kwargs = scoring_kwargs or {}
    score_weights = scoring_kwargs.get(
        'score_weights', selection_kwargs.get('score_weights'))
    sort_fitness = scoring_kwargs.get(
        'sort_fitness', selection_kwargs.get('sort_fitness'))
    if sort_fitness:
        sort_fitness = import_callable(sort_fitness)
    else:
        sort_fitness = pareto_front
    # Both keys are passed explicitly below; leaving either one in **kw
    # would raise "got multiple values for keyword argument".
    passed_explicitly = ('score_weights', 'sort_fitness')
    kw = {k: v for k, v in selection_kwargs.items()
          if k not in passed_explicitly}
    models = base_selection(models,
                            model_selection=model_selection,
                            sort_fitness=sort_fitness,
                            score_weights=score_weights,
                            **kw)
    return _validate_ensemble_members(models)
def _import_score_func(self, **params):
    '''Replace a string "score_func" in params with a callable.

    The name is first looked up as an attribute of skfeat; if that
    yields nothing truthy it is handed to import_callable.  Any other
    params, and a "score_func" that is not a string, are returned
    untouched.

    Returns:
        the params dict'''
    name = params.get('score_func')
    if isinstance(name, str):
        params['score_func'] = (getattr(skfeat, name, None)
                                or import_callable(name))
    return params
def fit_transform(self, X, y=None, sample_weight=None, **kwargs):
    '''Call the function named by self.func on X and split its output.

    The function receives y, sample_weight and **kwargs as keywords,
    overridden by anything in self.kwargs.  Its return value is passed
    to _split_pipeline_output together with the original X, y and
    sample_weight.'''
    from elm.sample_util.sample_pipeline import _split_pipeline_output
    call_kwargs = {'y': y, 'sample_weight': sample_weight}
    call_kwargs.update(kwargs)
    # Instance-level kwargs win over the ones given at call time.
    call_kwargs.update(self.kwargs)
    modifier = import_callable(self.func)
    result = modifier(X, **call_kwargs)
    return _split_pipeline_output(result, X, y, sample_weight,
                                  'ModifySample')
def take_geo_transform_from_meta(band_spec=None, required=True, **meta):
    '''Return a geo transform derived from meta.

    Parameters:
        :band_spec: optional object; when it is truthy and has a truthy
                    "meta_to_geotransform" attribute, that attribute is
                    imported with import_callable and called as
                    func(**meta)
        :required: when no custom function applies, fall back to
                   grid_header_to_geo_transform(**meta) if True,
                   otherwise return None
        :meta: keywords forwarded to whichever function is used

    Raises:
        ValueError: the custom function did not return a Sequence of
                    length 6
    '''
    custom = False
    if band_spec:
        custom = getattr(band_spec, 'meta_to_geotransform', False)
    if custom:
        geo_transform = import_callable(custom)(**meta)
        well_formed = (isinstance(geo_transform, Sequence)
                       and len(geo_transform) == 6)
        if not well_formed:
            raise ValueError(
                'band_spec.meta_to_geotransform {} did not return a sequence of len 6'
                .format(custom))
        return geo_transform
    if required:
        return grid_header_to_geo_transform(**meta)
    return None
def create_sample_from_data_source(config=None, **data_source):
    '''Call a data source's "sampler" and return whatever it returns.

    Params:
        :config: optional config object.  Kept for interface
                 compatibility; a string reader name is forwarded to
                 load_meta / load_array as the "reader" keyword and is
                 not looked up in config here.
        :data_source: keywords describing the data source:
            "sampler" (required): callable or importable name
            "sampler_args": positional args for the sampler; a single
                non-tuple/list value is wrapped in a 1-tuple
            "reader": a reader name (str), a dict with "load_meta" and
                "load_array" entries, or empty for the default loaders
            any truthy key containing "_filter" (except "geo_filters")
                is replaced by import_callable(value)

    The sampler is called as sampler(*sampler_args, **data_source) where
    data_source has "load_meta" and "load_array" filled in.
    '''
    # TODO: this needs to be added to ConfigParser validation
    # (sampler requirement)
    sampler_func = import_callable(data_source['sampler'])
    sampler_args = data_source.get('sampler_args') or ()
    if not isinstance(sampler_args, (tuple, list)):
        sampler_args = (sampler_args,)
    reader = data_source.get('reader') or None
    if isinstance(reader, str):
        # Always bind both loaders for a named reader so they can never
        # be left undefined, whether or not config knows the name.
        _load_meta = partial(load_meta, reader=reader)
        _load_array = partial(load_array, reader=reader)
    elif isinstance(reader, dict):
        _load_meta = import_callable(reader['load_meta'], True,
                                     reader['load_meta'])
        _load_array = import_callable(reader['load_array'], True,
                                      reader['load_array'])
    else:
        _load_meta = load_meta
        _load_array = load_array
    data_source['load_meta'] = _load_meta
    data_source['load_array'] = _load_array
    for key, value in list(data_source.items()):
        if '_filter' in key and value and key != 'geo_filters':
            data_source[key] = import_callable(value)
    return sampler_func(*sampler_args, **data_source)
def config_to_pipeline(config, client=None):
    '''
    Run the elm config's train and predict "run" actions with dask
    client and config's updates based on args passed to elm-main,
    such as --train-only or --predict-only, or edits to ensemble
    settings, such as --ngen 4

    For each step of config.run:
        * a "train" step builds a Pipeline from the step's "pipeline"
          and the train section's model, fits it (fit_ea when
          evolutionary params exist for the step, otherwise
          fit_ensemble) and serializes it, unless config.PREDICT_ONLY
          is set
        * otherwise a "predict" step loads the serialized Pipeline,
          unless config.TRAIN_ONLY is set
        * prediction (predict_many) runs for steps with a "predict"
          key, unless config.TRAIN_ONLY is set

    Note that the data source, ensemble and train dicts taken from
    config are updated in place (sampler, loaders, args_list, client,
    method_kwargs).

    Parameters:
        :config: elm.config.ConfigParser instance
        :client: dask client or None
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    _makedirs(config)
    idx_to_evo_params = ea_setup(config)
    predict_only = getattr(config, 'PREDICT_ONLY', False)
    train_only = getattr(config, 'TRAIN_ONLY', False)
    for idx, step in enumerate(config.run):
        pipeline = step['pipeline']
        if 'train' in step:
            train = config.train[step['train']]
            pipe_steps = make_pipeline_steps(config, pipeline)
            cls = import_callable(train['model_init_class'])
            estimator = cls(**(train.get('model_init_kwargs') or {}))
            pipe_steps.append((step['train'], estimator))
            ensemble_kwargs = train.get('ensemble')
            if isinstance(ensemble_kwargs, str):
                ensemble_kwargs = config.ensembles[ensemble_kwargs]
            ensemble_kwargs['client'] = client
        data_source = step['data_source']
        if not isinstance(data_source, dict):
            data_source = config.data_sources[data_source]
        data_source['sampler'] = import_callable(data_source['sampler'])
        data_source['load_meta'] = load_meta
        data_source['load_array'] = load_array
        if callable(data_source.get('args_list')):
            # Expand a callable args_list once, giving it the rest of
            # the data source as keywords.
            kw = {k: v for k, v in data_source.items() if k != 'args_list'}
            data_source['args_list'] = tuple(data_source['args_list'](**kw))
        if 'train' in step and not predict_only:
            s = train.get('model_scoring')
            if s:
                scoring = config.model_scoring[s]
                scoring_kwargs = {k: v for k, v in scoring.items()
                                  if k != 'scoring'}
                scoring = import_callable(scoring['scoring'])
            else:
                scoring = None
                scoring_kwargs = {}
            if 'method_kwargs' in train:
                method_kwargs = train['method_kwargs']
            else:
                method_kwargs = {}
            if 'classes' in train:
                method_kwargs['classes'] = train['classes']
            ensemble_kwargs['method_kwargs'] = method_kwargs
            pipe = Pipeline(pipe_steps,
                            scoring=scoring,
                            scoring_kwargs=scoring_kwargs)
            evo_params = idx_to_evo_params.get(idx, None)
            if evo_params:
                kw = dict(evo_params=evo_params)
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ea(**kw)
            else:
                kw = {}
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ensemble(**kw)
            serialize_pipe(pipe, config.ELM_TRAIN_PATH, step['train'])
        elif 'predict' in step and not train_only:
            pipe = load_pipe_from_tag(config.ELM_TRAIN_PATH, step['predict'])
        else:
            logger.info(
                'Do nothing for {} (has no "train" or "predict" key)'.format(
                    step))
        # Honor TRAIN_ONLY here as well: without the guard a
        # train+predict step would still predict, and a predict-only
        # step would reach predict_many with "pipe" unbound or left over
        # from an earlier step.
        if 'predict' in step and not train_only:
            # serialize is called with (prediction, sample, tag)
            serialize = partial(serialize_prediction, config)
            pipe.predict_many(serialize=serialize, **data_source)
def ensemble(pipe, ngen, X=None, y=None, sample_weight=None,
             sampler=None, args_list=None, client=None,
             init_ensemble_size=1, saved_ensemble_size=None,
             ensemble_init_func=None, models_share_sample=True,
             model_selection=None, model_selection_kwargs=None,
             scoring_kwargs=None, method='fit', partial_fit_batches=1,
             classes=None, method_kwargs=None, **data_source):
    '''Fit or partial_fit an ensemble of models to a series of samples

    Call this function from an elm.pipeline.Pipeline instance's methods:
        "fit_ensemble"
        "fit_transform_ensemble"
        "transform_ensemble"

    Parameters:
        pipe: instance of elm.pipeline.Pipeline
        ngen: number of ensemble generations
        X: earthio.ElmStore, if not using "sampler" and "args_list"
        y: numpy array if not using "sampler" and "args_list", or None
           if not needed by Pipeline
        sample_weight: numpy array if not using "sampler" and
           "args_list", or None if not needed by Pipeline
        sampler: Callable - required if not giving X.  Called at least
           once on each element of args_list where each element is
           unpacked with *one_element_of_args_list
        args_list: List of args - required if not giving X.  See
           sampler above
        client: dask-distributed or ThreadPool client
        init_ensemble_size: number of ensemble members, ignored if
           giving ensemble_init_func
        saved_ensemble_size: how many members to keep at final
           generation
        ensemble_init_func: Callable to return list of
           elm.pipeline.Pipeline instances that initialize ensemble
        models_share_sample: If True, ensure that in each generation,
           every member is fit to the same sample.  If False, fit
           every model to every sample
        model_selection: Callable after each generation to take a list
           of (tag, Pipeline) tuples and return a list of new such
           tuples, or None to repeatedly train each model on each
           generation without replacement of model parameters
        model_selection_kwargs: kwargs passed to model_selection
        scoring_kwargs: kwargs that are passed to score_one_model.
           See also elm.model_selection.scoring
        method: This is the method of Pipeline that called this
           ensemble function, typically "fit"
        partial_fit_batches: number of calls made per member and
           sample; a value above 1 switches method to "partial_fit"
        classes: Unique sequence of class integers passed to
           supervised classifiers that need the known y classes.
        method_kwargs: any other arguments to pass to method (not
           modified)
        **data_source: keywords passed to "sampler" if given

    Returns:
        new_models: sequence of (tag, Pipeline instance) tuples on
            which "predict_many" can be called, truncated to
            saved_ensemble_size when that is given
    '''
    get_func = _find_get_func_for_client(client)
    # Copy: "classes" must not leak into the caller's method_kwargs.
    fit_score_kwargs = dict(method_kwargs or {})
    if 'classes' not in fit_score_kwargs and classes is not None:
        fit_score_kwargs['classes'] = classes
    model_selection_kwargs = model_selection_kwargs or {}
    ensemble_size = init_ensemble_size or 1
    partial_fit_batches = partial_fit_batches or 1
    if partial_fit_batches > 1:
        method = 'partial_fit'
    if not ensemble_init_func:
        models = tuple(copy.deepcopy(pipe) for _ in range(ensemble_size))
    else:
        ensemble_init_func = import_callable(ensemble_init_func)
        models = ensemble_init_func(pipe, ensemble_size=ensemble_size)
    logger.info("Init ensemble: {} members".format(len(models)))
    if model_selection:
        model_selection = import_callable(model_selection)
    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list, sampler,
                            data_source)
    models = tuple(('tag_{}'.format(idx), model)
                   for idx, model in enumerate(models))
    sample_keys = list(dsk)
    if models_share_sample:
        random.shuffle(sample_keys)
    sample_keys = tuple(sample_keys)
    for gen in range(ngen):
        if models_share_sample:
            # One (shuffled) sample per generation, cycling when there
            # are more generations than samples.
            sample_keys_passed = (sample_keys[gen % len(sample_keys)],)
        else:
            sample_keys_passed = sample_keys
        logger.info('Ensemble generation {} of {} - ({} estimators) '.format(
            gen + 1, ngen, len(models)))
        msg = (len(models), len(sample_keys_passed), partial_fit_batches,
               method,
               len(models) * len(sample_keys_passed) * partial_fit_batches,
               gen + 1, ngen)
        logger.info(
            'Ensemble Generation {5} of {6}: ({0} members x {1} samples x {2} calls) = {4} {3} calls this gen'
            .format(*msg))
        dsk, model_keys, new_models_name = _one_generation_dask_graph(
            dsk, models, fit_score_kwargs, sample_keys_passed,
            partial_fit_batches, gen, method)
        if get_func is None:
            new_models = tuple(dask.get(dsk, new_models_name))
        else:
            new_models = tuple(get_func(dsk, new_models_name))
        models = tuple(zip(model_keys, new_models))
        logger.info('Trained {} estimators'.format(len(models)))
        if model_selection:
            models = _run_model_selection(models, model_selection,
                                          model_selection_kwargs,
                                          ngen, gen, scoring_kwargs)
        # Otherwise: just training all ensemble members without
        # replacing / re-initializing / editing the model params
    if saved_ensemble_size:
        return models[:saved_ensemble_size]
    return models
def make_pipeline_steps(config, pipeline):
    '''Turn the config's "pipeline" into a list of steps to
    pass to elm.pipeline.Pipeline

    Params:
        :config: validated config from elm.config.ConfigParser
        :pipeline: list of actions.  Each dict action is dispatched on
                   its key ("feature_selection", "transform",
                   "sklearn_preprocessing" or one of
                   CHANGE_COORDS_ACTIONS); a non-dict action is used
                   as the step object itself

    Returns:
        list of (step_name, step) tuples

    Raises:
        NotImplementedError: a dict action has no recognized key, or
                             no class in "steps" handles a
                             CHANGE_COORDS_ACTIONS key

    This is used by :func:``elm.pipeline.parse_run_config``
    '''
    actions = []
    for action_idx, action in enumerate(pipeline):
        if not isinstance(action, dict):
            # NOTE(review): the original never assigned a name on this
            # path (unbound / stale step_name); a positional name is
            # assumed to be acceptable - confirm with Pipeline's users.
            step_name = 'step_{}'.format(action_idx)
            step_cls = action
        elif 'feature_selection' in action:
            step_name = action['feature_selection']
            # Deep copy: "func" may be replaced below and the config
            # entry must stay untouched.
            _feature_selection = copy.deepcopy(
                config.feature_selection[step_name])
            method = _feature_selection['method']
            # Result unused; the call is kept so an unimportable method
            # still fails here as it did before.
            import_callable(getattr(skfeat, method, method))
            if 'func_kwargs' in _feature_selection:
                # The original also built an unused partial here from
                # the undefined name "feature_selection" (NameError).
                _feature_selection['func'] = import_callable(
                    _feature_selection['func'])
            kw = {k: v for k, v in _feature_selection.items()
                  if k not in ('func_kwargs', 'method')}
            step_cls = SKLEARN_PREPROCESSING[method](**kw)
        elif 'transform' in action:
            step_name = action['transform']
            trans = config.transform[step_name]
            cls = import_callable(trans['model_init_class'])
            init_kwargs = trans.get('model_init_kwargs') or {}
            # partial_fit_batches belongs to steps.Transform, not to
            # the wrapped model's constructor.
            kw_filter = {k: v for k, v in init_kwargs.items()
                         if k != 'partial_fit_batches'}
            t = cls(**kw_filter)
            pfb = trans.get('partial_fit_batches',
                            init_kwargs.get('partial_fit_batches'))
            step_cls = steps.Transform(t, partial_fit_batches=pfb)
        elif 'sklearn_preprocessing' in action:
            step_name = action['sklearn_preprocessing']
            _sklearn_preprocessing = config.sklearn_preprocessing[step_name]
            kw = {k: v for k, v in _sklearn_preprocessing.items()
                  if k not in ('method', 'func_kwargs')}
            if 'func' in _sklearn_preprocessing:
                kw['func'] = import_callable(_sklearn_preprocessing['func'])
            cls = SKLEARN_PREPROCESSING[_sklearn_preprocessing['method']]
            step_cls = cls(**kw)
        elif any(k in CHANGE_COORDS_ACTIONS for k in action):
            step_name = [k for k in action if k in CHANGE_COORDS_ACTIONS][0]
            # Find the class in "steps" whose _sp_step matches the key.
            for att in dir(steps):
                candidate = getattr(steps, att)
                if (isinstance(candidate, type)
                        and getattr(candidate, '_sp_step', None) == step_name):
                    step_cls = candidate.from_config_dict(**action)
                    break
            else:
                # No handler class: fail instead of appending an
                # unbound or stale step_cls.
                raise NotImplementedError(
                    'pipeline action {} not recognized.'.format(action))
        else:
            # add items to actions of the form:
            # (
            #   module_colon_func_name_as_string,        # string
            #   args_to_func,                            # tuple
            #   kwargs_to_func                           # dict
            # )
            # NOTE also add the key name, like 'transform' to the top of
            # elm.config.load_config global variable:
            # "SAMPLE_PIPELINE_ACTIONS"
            raise NotImplementedError(
                'pipeline action {} not recognized.'.format(action))
        actions.append((step_name, step_cls))
    return actions