def predict_many(data_source,
                 saved_model_tag=None,
                 ensemble=None,
                 client=None,
                 serialize=None,
                 to_raster=True,
                 elm_predict_path=None):
    '''See the elm.pipeline.Pipeline.predict_many method'''
    env = parse_env_vars()
    elm_predict_path = elm_predict_path or env.get('ELM_PREDICT_PATH')
    if serialize and elm_predict_path and not os.path.exists(elm_predict_path):
        os.mkdir(elm_predict_path)
    pipe_example = ensemble[0][1]
    # Split data_source into the sample-related arguments expected
    # by make_samples_dask; whatever remains is sampler keywords
    ds = data_source.copy()
    X = ds.pop('X', None)
    y = ds.pop('y', None)
    args_list = ds.pop('args_list', None)
    sampler = ds.pop('sampler', None)
    dsk = make_samples_dask(X, y, None, pipe_example, args_list, sampler, ds)
    sample_keys = tuple(dsk)
    # One prediction task per (sample, estimator) combination
    combos = tuple(itertools.product(sample_keys, ensemble))
    keys = []
    for sample_key, (estimator_tag, estimator) in combos:
        name = _next_name('predict_many')
        predict_tag = '{}-{}'.format(estimator_tag, sample_key)
        if saved_model_tag:
            predict_tag += '-' + saved_model_tag
        dsk[name] = (_predict_one_sample_one_arg,
                     estimator,
                     serialize,
                     to_raster,
                     predict_tag,
                     elm_predict_path,
                     sample_key)
        keys.append(name)
    logger.info('Predict {} estimator(s) and {} sample(s) '
                '({} combination[s])'.format(len(ensemble),
                                             len(sample_keys),
                                             len(combos)))
    if client is None:
        new = dask.get(dsk, keys)
    else:
        new = client.get(dsk, keys)
    return tuple(itertools.chain.from_iterable(new))
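# Usage sketch (illustrative only -- `example_sampler`, `example_args_list`
# and `trained_ensemble` are hypothetical caller-supplied names, not part of
# this module). It shows the expected shapes: `data_source` is the same dict
# of sample keywords used for fitting, and `ensemble` is a list of
# (tag, Pipeline) tuples such as the return value of ensemble() or
# evolve_train() below:
#
#     data_source = dict(sampler=example_sampler,
#                        args_list=example_args_list)
#     preds = predict_many(data_source,
#                          ensemble=trained_ensemble,
#                          to_raster=True)
#     # preds is a flat tuple with one prediction per
#     # (sample, estimator) combination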
def evolve_train(pipe,
                 evo_params,
                 X=None,
                 y=None,
                 sample_weight=None,
                 sampler=None,
                 args_list=None,
                 client=None,
                 init_ensemble_size=1,
                 saved_ensemble_size=None,
                 ensemble_init_func=None,
                 scoring_kwargs=None,
                 method='fit',
                 partial_fit_batches=1,
                 classes=None,
                 method_kwargs=None,
                 **data_source):
    '''evolve_train runs an evolutionary algorithm to find the most
    fit elm.pipeline.Pipeline instances

    Parameters:
        pipe: elm.pipeline.Pipeline instance
        evo_params: an EvoParams instance, typically from:

            from elm.model_selection import ea_setup
            evo_params = ea_setup(param_grid=param_grid,
                                  param_grid_name='param_grid_example',
                                  score_weights=[-1])  # minimization

    See also the docstring of elm.pipeline.ensemble (appended below),
    where most arguments are interpreted similarly.
    '''
    # evolve_train always fits each generation to a single shared sample
    models_share_sample = True
    method_kwargs = method_kwargs or {}
    scoring_kwargs = scoring_kwargs or {}
    get_func = _find_get_func_for_client(client)
    control = evo_params.deap_params['control']
    required_args, _, _ = get_args_kwargs_defaults(ea_general)
    evo_args = [evo_params]
    data_source = dict(X=X, y=y, sample_weight=sample_weight,
                       sampler=sampler, args_list=args_list,
                       **data_source)
    fit_one_generation = partial(_on_each_generation,
                                 pipe,
                                 data_source,
                                 evo_params.deap_params,
                                 get_func,
                                 partial_fit_batches,
                                 method,
                                 method_kwargs)
    dsk = make_samples_dask(X, y, sample_weight, pipe,
                            args_list, sampler, data_source)
    sample_keys = list(dsk)
    if models_share_sample:
        np.random.shuffle(sample_keys)
        gen_to_sample_key = lambda gen: [sample_keys[gen]]
    else:
        gen_to_sample_key = lambda gen: sample_keys
    sample_keys = tuple(sample_keys)
    try:
        param_history = []
        for a in required_args[1:]:
            if a not in control:
                raise ValueError('Expected {} in {} (control kwargs '
                                 'to evolutionary '
                                 'algorithm)'.format(a, control))
            evo_args.append(control[a])
        ea_gen = ea_general(*evo_args)
        pop, _, _ = next(ea_gen)
        sample_keys_passed = gen_to_sample_key(0)

        def log_once(len_models, sample_keys_passed, gen):
            total_calls = (len_models * len(sample_keys_passed) *
                           partial_fit_batches)
            msg = (len_models, len(sample_keys_passed),
                   partial_fit_batches, method, gen, total_calls)
            fmt = ('Evolve generation {4}: {0} models x {1} samples x '
                   '{2} {3} calls = {5} calls in total')
            logger.info(fmt.format(*msg))

        log_once(len(pop), sample_keys_passed, 0)
        pop_names = [ind.name for ind in pop]
        models, fitnesses = fit_one_generation(dsk, 0,
                                               sample_keys_passed, pop)
        assign_check_fitness(pop, fitnesses,
                             param_history,
                             evo_params.deap_params['choices'],
                             evo_params.score_weights)
        invalid_ind = True
        fitted_models = {n: m for n, (_, m) in zip(pop_names, models)}
        ngen = control.get('ngen') or None
        if not ngen and not evo_params.early_stop:
            raise ValueError('param_grids: pg_name: control: has '
                             'neither ngen nor early_stop keys')
        elif not ngen:
            ngen = 1000000
        for gen in range(ngen):
            # On the last generation invalid_ind becomes falsy
            # and breaks this loop
            if models_share_sample:
                sample_keys_passed = gen_to_sample_key(gen % len(sample_keys))
            else:
                sample_keys_passed = sample_keys
            if gen > 0:
                log_once(len(invalid_ind), sample_keys_passed, gen)
                names = [ind.name for ind in invalid_ind]
                models, fitnesses = fit_one_generation(dsk, gen,
                                                       sample_keys_passed,
                                                       invalid_ind)
                fitted_models.update({n: m
                                      for n, (_, m) in zip(names, models)})
            (pop, invalid_ind, param_history) = ea_gen.send(fitnesses)
            # Keep only the fitted models still present in the population
            pop_names = [ind.name for ind in pop]
            fitted_models = {k: v for k, v in fitted_models.items()
                             if k in pop_names}
            if not invalid_ind:
                break  # No new solutions to try
        pop = evo_params.toolbox.select(pop, saved_ensemble_size)
        pop_names = [ind.name for ind in pop]
        models = [(k, v) for k, v in fitted_models.items()
                  if k in pop_names]
    finally:
        # Write the parameter / objective history, even on error
        columns = list(evo_params.deap_params['param_order'])
        columns += ['objective_{}_{}'.format(idx, 'min' if sw == -1 else 'max')
                    for idx, sw in enumerate(evo_params.score_weights)]
        if param_history:
            assert len(columns) == len(param_history[0])
            param_history = pd.DataFrame(np.array(param_history),
                                         columns=columns)
            param_history.to_csv(evo_params.history_file,
                                 index_label='parameter_set')
    return models
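# Usage sketch (illustrative only -- the param_grid contents and the
# 'kmeans__n_clusters' key are assumptions about a caller's Pipeline steps,
# not requirements of this module; the control keys shown are also
# illustrative). It mirrors the ea_setup call quoted in the docstring above:
#
#     from elm.model_selection import ea_setup
#     param_grid = {'kmeans__n_clusters': list(range(3, 10)),
#                   'control': {'ngen': 3}}
#     evo_params = ea_setup(param_grid=param_grid,
#                           param_grid_name='param_grid_example',
#                           score_weights=[-1])  # minimization
#     best_models = evolve_train(pipe, evo_params, X=X, y=y,
#                                saved_ensemble_size=4)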
def ensemble(pipe,
             ngen,
             X=None,
             y=None,
             sample_weight=None,
             sampler=None,
             args_list=None,
             client=None,
             init_ensemble_size=1,
             saved_ensemble_size=None,
             ensemble_init_func=None,
             models_share_sample=True,
             model_selection=None,
             model_selection_kwargs=None,
             scoring_kwargs=None,
             method='fit',
             partial_fit_batches=1,
             classes=None,
             method_kwargs=None,
             **data_source):
    '''Fit or partial_fit an ensemble of models to a series of samples

    Call this function from an elm.pipeline.Pipeline instance's methods:

        "fit_ensemble"
        "fit_transform_ensemble"
        "transform_ensemble"

    Parameters:
        pipe: instance of elm.pipeline.Pipeline
        ngen: number of ensemble generations
        X: earthio.ElmStore, if not using "sampler" and "args_list"
        y: numpy array if not using "sampler" and "args_list", or None
            if not needed by Pipeline
        sample_weight: numpy array if not using "sampler" and
            "args_list", or None if not needed by Pipeline
        sampler: Callable - required if not giving X.  Called at least
            once on each element of args_list, where each element is
            unpacked with *one_element_of_args_list
        args_list: List of args - required if not giving X.  See
            sampler above
        client: dask-distributed or ThreadPool client
        init_ensemble_size: number of ensemble members, ignored if
            giving ensemble_init_func
        saved_ensemble_size: how many members to keep at the final
            generation
        ensemble_init_func: Callable returning the list of
            elm.pipeline.Pipeline instances that initialize the ensemble
        models_share_sample: If True, ensure that in each generation
            every member is fit to the same sample.  If False, fit
            every model to every sample
        model_selection: Callable run after each generation that takes
            a list of (tag, Pipeline) tuples and returns a list of new
            such tuples, or None to repeatedly train each model on each
            generation without replacement of model parameters
        model_selection_kwargs: kwargs passed to model_selection
        scoring_kwargs: kwargs that are passed to score_one_model.
            See also elm.model_selection.scoring
        method: the method of Pipeline that called this ensemble
            function, typically "fit"
        partial_fit_batches: calls to "method" per sample per
            generation; values greater than 1 force method to
            "partial_fit"
        classes: Unique sequence of class integers passed to supervised
            classifiers that need the known y classes
        method_kwargs: any other arguments to pass to method
        **data_source: keywords passed to "sampler" if given

    Returns:
        new_models: list of (tag, Pipeline instance) tuples on which
            "predict_many" can be called
    '''
    get_func = _find_get_func_for_client(client)
    fit_score_kwargs = method_kwargs or {}
    if 'classes' not in fit_score_kwargs and classes is not None:
        fit_score_kwargs['classes'] = classes
    model_selection_kwargs = model_selection_kwargs or {}
    ensemble_size = init_ensemble_size or 1
    partial_fit_batches = partial_fit_batches or 1
    if partial_fit_batches > 1:
        method = 'partial_fit'
    if not ensemble_init_func:
        models = tuple(copy.deepcopy(pipe) for _ in range(ensemble_size))
    else:
        ensemble_init_func = import_callable(ensemble_init_func)
        models = ensemble_init_func(pipe, ensemble_size=ensemble_size)
    logger.info('Init ensemble: {} members'.format(len(models)))
    if model_selection:
        model_selection = import_callable(model_selection)
    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list,
                            sampler, data_source)
    models = tuple(zip(('tag_{}'.format(idx)
                        for idx in range(len(models))), models))
    sample_keys = list(dsk)
    if models_share_sample:
        random.shuffle(sample_keys)
        gen_to_sample_key = {gen: s
                             for gen, s in enumerate(sample_keys[:ngen])}
    sample_keys = tuple(sample_keys)
    for gen in range(ngen):
        if models_share_sample:
            sample_keys_passed = (gen_to_sample_key[gen % len(sample_keys)],)
        else:
            sample_keys_passed = sample_keys
        msg = (len(models), len(sample_keys_passed), partial_fit_batches,
               method,
               len(models) * len(sample_keys_passed) * partial_fit_batches,
               gen + 1, ngen)
        logger.info('Ensemble generation {5} of {6}: ({0} members x {1} '
                    'samples x {2} calls) = {4} {3} calls '
                    'this gen'.format(*msg))
        dsk, model_keys, new_models_name = _one_generation_dask_graph(
            dsk, models, fit_score_kwargs, sample_keys_passed,
            partial_fit_batches, gen, method)
        if get_func is None:
            new_models = tuple(dask.get(dsk, new_models_name))
        else:
            new_models = tuple(get_func(dsk, new_models_name))
        models = tuple(zip(model_keys, new_models))
        logger.info('Trained {} estimators'.format(len(models)))
        if model_selection:
            models = _run_model_selection(models,
                                          model_selection,
                                          model_selection_kwargs,
                                          ngen,
                                          gen,
                                          scoring_kwargs)
        # else: just train all ensemble members each generation without
        # replacing / re-initializing / editing the model params
    if saved_ensemble_size:
        final_models = models[:saved_ensemble_size]
    else:
        final_models = models
    return final_models


# evolve_train's docstring refers readers to ensemble's; append it now
# that both functions are defined.
evolve_train.__doc__ += ensemble.__doc__
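# Usage sketch (illustrative only -- `pipe`, `X` and `y` are assumed to be
# a caller's elm.pipeline.Pipeline and pre-loaded sample data). With no
# model_selection callable, every member is simply refit each generation:
#
#     trained = ensemble(pipe, ngen=3, X=X, y=y,
#                        init_ensemble_size=8,
#                        saved_ensemble_size=4)
#     for tag, fitted_pipe in trained:
#         print(tag, fitted_pipe)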