Example #1
def wrap_select(method, individuals, k, **kwargs):
    '''wraps a selection method such as selNSGA2 from deap.tools

    Parameters:
        :method:       method such as selNSGA2 or selBest
        :individuals:  population of solutions
        :k:            how many to select
        :kwargs:       passed as args to method, e.g.:

            * :fitness_size:   with method selDoubleTournament
            * :parsimony_size: with method selDoubleTournament
            * :fitness_first:  with method selDoubleTournament
            * :tournsize:      with method selTournament

    '''
    sel = getattr(tools, method, None)
    if not sel:
        raise ValueError('Expected {} to be an attribute of deap.tools'.format(method))
    required_args, _, _ = get_args_kwargs_defaults(sel)
    args = [individuals, k]
    if len(required_args) > 2:
        for a in required_args[2:]:
            if a not in kwargs:
                raise ValueError('Expected control kwargs {} to have {} for method {}'.format(kwargs, a, method))
            args.append(kwargs[a])
    return sel(*args)
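
The helper get_args_kwargs_defaults appears in every example here. What follows is a minimal sketch of its likely behavior, assuming an inspect.signature-based implementation (the real helper ships with elm and may differ):

import inspect

def get_args_kwargs_defaults(func):
    # Split func's signature into: required positional names, a dict of
    # keyword defaults, and the name of a **kwargs parameter (or None).
    sig = inspect.signature(func)
    required, defaults, var_keyword = [], {}, None
    for name, param in sig.parameters.items():
        if param.kind is inspect.Parameter.VAR_KEYWORD:
            var_keyword = name
        elif param.kind is inspect.Parameter.VAR_POSITIONAL:
            continue  # *args is neither required-by-name nor a default
        elif param.default is inspect.Parameter.empty:
            required.append(name)
        else:
            defaults[name] = param.default
    return required, defaults, var_keyword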
Example #2
def select_from_file(*sampler_args,
                     band_specs=None,
                     metadata_filter=None,
                     filename_filter=None,
                     filename_search=None,
                     dry_run=False,
                     load_meta=None,
                     load_array=None,
                     **kwargs):

    '''select_from_file is the typical sampler used in the elm config
    file interface system via elm.pipeline.parse_run_config

    Parameters:
        :sampler_args: tuple of one element - a filename
        :band_specs: list of band_specs included in a data_source
        :metadata_filter: ignored
        :filename_search: a search token for filenames
        :filename_filter: a function that returns True/False to keep filename
        :dry_run:  if True, don't actually read file, just return True if accepted
        :load_meta: Function, typically from elm.readers, to load metadata
        :load_array: Function, typically from elm.readers, to load ElmStore
        :kwargs: may contain "reader" such as "hdf4", "tif", "hdf5", "netcdf"

    '''
    filename = sampler_args[0]
    keep_file = _filename_filter(filename,
                                 search=filename_search,
                                 func=filename_filter)
    logger.debug('Filename {} keep_file {}'.format(filename, keep_file))
    if dry_run:
        # "accepted" means the filename passed the search/filter checks above
        return keep_file
    sample = load_array(filename, band_specs=band_specs, reader=kwargs.get('reader', None))
    return sample
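
The _filename_filter helper is private to elm; here is a minimal sketch of the behavior implied above, assuming it combines a substring search with an optional boolean predicate:

def _filename_filter(filename, search=None, func=None):
    # Keep the file if it matches the search token (when given) and the
    # user-supplied predicate (when given).
    keep = True
    if search is not None:
        keep = search in filename
    if keep and callable(func):
        keep = bool(func(filename))
    return keep

# e.g. _filename_filter('/data/scene_001.tif', search='scene') -> True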
Example #3
def crossover(toolbox, method, ind1, ind2, **kwargs):
    '''Cross over two solutions using the crossover 'method'

    Parameters:
        :toolbox: deap toolbox
        :ind1:    individual
        :ind2:    individual
        :method:  method name from deap.tools, e.g. cxTwoPoint
        :kwargs:  passed as args where needed to method, e.g.:

            * :alpha: if using cxBlend, cxESBlend
            * :indpb: if using cxUniform or cxUniformPartialyMatched
            * :eta: if using cxSimulatedBinary or cxSimulatedBinaryBounded
            * :low: if using cxSimulatedBinaryBounded
            * :up:  if using cxSimulatedBinaryBounded

    '''
    child1, child2 = [toolbox.clone(ind) for ind in (ind1, ind2)]
    cx = getattr(tools, method, None)
    if not cx:
        raise ValueError('{} method (crossover) is not in deap.tools'.format(method))
    required_args, _, _ = get_args_kwargs_defaults(cx)
    args = [child1, child2]
    if len(required_args) > 2:
        for a in required_args[2:]:
            if a not in kwargs:
                raise ValueError('Expected {} to be in control for '
                                 'param_grid with method {}'.format(a, method))
            args.append(kwargs[a])
    child1, child2 = cx(*args)
    del child1.fitness.values
    del child2.fitness.values
    return (child1, child2)
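
A hedged usage sketch, assuming deap is installed and that crossover plus get_args_kwargs_defaults from these examples are in scope; cxTwoPoint needs no extra kwargs, while cxBlend would need alpha passed through kwargs:

from deap import base, creator, tools

creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()  # Toolbox registers clone=copy.deepcopy by default
ind1 = creator.Individual([1, 2, 3, 4])
ind2 = creator.Individual([5, 6, 7, 8])
ind1.fitness.values = (0.5,)
ind2.fitness.values = (0.7,)

# cxTwoPoint's only required args are the two individuals
child1, child2 = crossover(toolbox, 'cxTwoPoint', ind1, ind2)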
Example #4
def wrap_mutate(method, choices, max_param_retries, individual, **kwargs):
    '''Mutation for the method, choices and other config options

    Parameters:
        :method:  string - imported from deap.tools such as mutUniformInt
        :choices: list of lists choices for each parameter
        :max_param_retries: how many times to retry when getting invalid params
        :individual:        deap Individual parameter set
        :kwargs:            kwargs passed as args to method given

    Returns:
        :tuple: of one Individual parameter set

    '''
    kwargs = copy.deepcopy(kwargs)
    mut = getattr(tools, method, None)
    if not mut:
        raise ValueError('In wrap_mutate, method - {} is not in deap.tools'.format(method))
    required_args, _, _ = get_args_kwargs_defaults(mut)
    args = [individual]
    if len(required_args) > 1:
        for a in required_args[1:]:
            if a == 'low':
                args.append([0] * len(choices))
            elif a == 'up':
                args.append([len(choice) - 1 for choice in choices])
            else:
                args.append(kwargs[a])
    for _ in range(max_param_retries):
        params = mut(*args)
        if not out_of_bounds(params[0], choices):
            return params
    raise ValueError('wrap_mutate could not find a set of parameters that is within the given choices for the param_grid')
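
A hedged usage sketch, reusing the Individual class from the crossover example; mutUniformInt requires low, up, and indpb, and wrap_mutate fills low/up automatically from choices (out_of_bounds is another elm helper assumed in scope):

choices = [[0.1, 0.5, 1.0], ['rbf', 'linear'], [10, 100, 1000]]
ind = creator.Individual([0, 1, 2])  # genes are indices into choices
ind.fitness.values = (0.5,)
# mut returns a tuple of one Individual, matching the docstring above
(mutated,) = wrap_mutate('mutUniformInt', choices, 10, ind, indpb=0.5)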
Example #5
def import_scorer(scoring):
    '''Import a scoring function or find it in METRICS'''
    requires_y = False
    if not hasattr(scoring, 'fit'):
        if scoring in METRICS:
            scoring = import_callable(METRICS[scoring])
            requires_y = True
        else:
            scoring = import_callable(scoring)
            required_args, kwargs, has_var_kwargs = get_args_kwargs_defaults(
                scoring)
            requires_y = 'y_true' in required_args
    return (scoring, requires_y)
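
Hedged usage, assuming elm's METRICS and import_callable are in scope; METRICS presumably maps short names to 'module:callable' import strings, and a full import string also works, with requires_y inferred from whether the callable's signature has y_true:

scorer, requires_y = import_scorer('sklearn.metrics:mean_squared_error')
# requires_y is True: mean_squared_error(y_true, y_pred, ...) needs y_true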
Example #6
    def _validate_train_or_transform_funcs(self, name, t):
        '''Validate functions given in "train" section of config'''
        if not isinstance(t, dict):
            raise ElmConfigError('In train:{} expected a dict '
                                 'but found {}'.format(name, t))
        training_funcs = (
            ('model_init_class', True),
            ('get_y_func', False),
            ('get_weight_func', False),
        )

        has_fit_func = False
        has_funcs = {}
        sel = t.get('model_selection')
        no_selection = not sel
        for f, required in training_funcs:
            cls_or_func = self._validate_custom_callable(
                t.get(f), required, 'train:{} - {}'.format(name, f))
            has_funcs[f] = bool(cls_or_func)
            if f == 'model_init_class':
                model_init_class = cls_or_func
                fit_func = getattr(model_init_class, 'partial_fit',
                                   getattr(model_init_class, 'fit', None))
                if fit_func is None:
                    raise ElmConfigError(
                        'model_init_class {} '
                        'does not have a "fit" or "partial_fit" method'.format(
                            t.get('model_init_class')))
                has_fit_func = True
                fargs, fkwargs, var_keyword = get_args_kwargs_defaults(
                    fit_func)
        requires_y = any(x.lower() == 'y' for x in fargs)
        if 'sample_weight' not in fkwargs and has_funcs['get_weight_func']:
            raise ElmConfigError(
                'train:{} - {} does not support a '
                '"sample_weight" (sample weights were implied by '
                'giving a "get_weight_func" '
                'function {})'.format(name, model_init_class,
                                      t['get_weight_func']))
        return has_fit_func, requires_y, no_selection
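
A hypothetical train section that should pass this validation, assuming elm's config conventions (only model_init_class is required; any key names beyond those validated above are illustrative):

train_section = {
    'model_init_class': 'sklearn.cluster:MiniBatchKMeans',
    # optional: 'get_y_func' / 'get_weight_func'; supplying 'get_weight_func'
    # requires the model's fit/partial_fit to accept sample_weight
}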
Example #7
MODELS_WITH_PREDICT_DICT = {
    k: import_callable(k)
    for k in MODELS_WITH_PREDICT_STR
}

DECOMP_PARTIAL_FIT_MODEL_STR = ('sklearn.decomposition:IncrementalPCA', )
DECOMP_MODEL_STR = (
    'sklearn.decomposition:PCA',
    'sklearn.decomposition:ProjectedGradientNMF',
    'sklearn.decomposition:RandomizedPCA',
    'sklearn.decomposition:KernelPCA',
    'sklearn.decomposition:FactorAnalysis',
    'sklearn.decomposition:FastICA',
    'sklearn.decomposition:TruncatedSVD',
    'sklearn.decomposition:NMF',
    'sklearn.decomposition:SparsePCA',
    'sklearn.decomposition:MiniBatchSparsePCA',
    'sklearn.decomposition:DictionaryLearning',
    'sklearn.decomposition:LatentDirichletAllocation',
) + DECOMP_PARTIAL_FIT_MODEL_STR
UNSUPERVISED_MODEL_STR = [
    k for k, v in MODELS_WITH_PREDICT_DICT.items()
    if hasattr(v, 'fit') and 'y' in get_args_kwargs_defaults(v.fit)[1]
]

MODELS_WITH_PREDICT_ESTIMATOR_TYPES = {
    k: getattr(v, '_estimator_type', None)
    for k, v in MODELS_WITH_PREDICT_DICT.items()
}
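
The UNSUPERVISED_MODEL_STR test treats an estimator as unsupervised when 'y' shows up among fit's defaulted kwargs (i.e. fit(X, y=None)). A quick check of that heuristic with sklearn estimators, using the get_args_kwargs_defaults sketch from Example #1:

from sklearn.cluster import KMeans
from sklearn.svm import SVC

_, kmeans_defaults, _ = get_args_kwargs_defaults(KMeans.fit)
_, svc_defaults, _ = get_args_kwargs_defaults(SVC.fit)
print('y' in kmeans_defaults)  # True  -> fit(X, y=None): unsupervised
print('y' in svc_defaults)     # False -> fit(X, y, ...): supervised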
Example #8
def evolve_train(pipe,
                 evo_params,
                 X=None,
                 y=None,
                 sample_weight=None,
                 sampler=None,
                 args_list=None,
                 client=None,
                 init_ensemble_size=1,
                 saved_ensemble_size=None,
                 ensemble_init_func=None,
                 scoring_kwargs=None,
                 method='fit',
                 partial_fit_batches=1,
                 classes=None,
                 method_kwargs=None,
                 **data_source):
    '''evolve_train runs an evolutionary algorithm to
    find the most fit elm.pipeline.Pipeline instances

    Parameters:
        pipe: elm.pipeline.Pipeline instance
        evo_params: the EvoParams instance, typically built with
            elm.model_selection.ea_setup, e.g.:

            evo_params = ea_setup(param_grid=param_grid,
                                  param_grid_name='param_grid_example',
                                  score_weights=[-1])  # minimization

        See also the help from elm.pipeline.ensemble, where
        most arguments are interpreted similarly.

    ''' + ensemble.__doc__
    models_share_sample = True
    method_kwargs = method_kwargs or {}
    scoring_kwargs = scoring_kwargs or {}
    get_func = _find_get_func_for_client(client)
    control = evo_params.deap_params['control']
    required_args, _, _ = get_args_kwargs_defaults(ea_general)
    evo_args = [
        evo_params,
    ]
    data_source = dict(X=X,
                       y=y,
                       sample_weight=sample_weight,
                       sampler=sampler,
                       args_list=args_list,
                       **data_source)
    fit_one_generation = partial(_on_each_generation, pipe, data_source,
                                 evo_params.deap_params, get_func,
                                 partial_fit_batches, method, method_kwargs)

    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list, sampler,
                            data_source)
    sample_keys = list(dsk)
    if models_share_sample:
        np.random.shuffle(sample_keys)
        gen_to_sample_key = lambda gen: [sample_keys[gen]]
    else:
        gen_to_sample_key = lambda gen: sample_keys
    sample_keys = tuple(sample_keys)

    try:
        param_history = []
        for a in required_args[1:]:
            if a not in control:
                raise ValueError('Expected {} in {} (control kwargs '
                                 'to evolutionary '
                                 'algorithm)'.format(a, control))
            evo_args.append(control[a])
        ea_gen = ea_general(*evo_args)
        pop, _, _ = next(ea_gen)
        sample_keys_passed = gen_to_sample_key(0)

        def log_once(len_models, sample_keys_passed, gen):
            total_calls = len_models * len(
                sample_keys_passed) * partial_fit_batches
            msg = (len_models, len(sample_keys_passed), partial_fit_batches,
                   method, gen, total_calls)
            fmt = 'Evolve generation {4}: {0} models x {1} samples x {2} {3} calls = {5} calls in total'
            logger.info(fmt.format(*msg))

        log_once(len(pop), sample_keys_passed, 0)
        pop_names = [ind.name for ind in pop]
        models, fitnesses = fit_one_generation(dsk, 0, sample_keys_passed, pop)
        assign_check_fitness(pop, fitnesses, param_history,
                             evo_params.deap_params['choices'],
                             evo_params.score_weights)
        invalid_ind = True
        fitted_models = {n: m for n, (_, m) in zip(pop_names, models)}
        ngen = evo_params.deap_params['control'].get('ngen') or None
        if not ngen and not evo_params.early_stop:
            raise ValueError('param_grids: pg_name: control: has neither '
                             '"ngen" nor "early_stop" keys')
        elif not ngen:
            ngen = 1000000
        for gen in range(ngen):
            # on last generation invalid_ind becomes None
            # and breaks this loop
            if models_share_sample:
                sample_keys_passed = gen_to_sample_key(gen % len(sample_keys))
            else:
                sample_keys_passed = sample_keys

            if gen > 0:
                log_once(len(invalid_ind), sample_keys_passed, gen)
                names = [ind.name for ind in invalid_ind]
                models, fitnesses = fit_one_generation(dsk, gen,
                                                       sample_keys_passed,
                                                       invalid_ind)
                fitted_models.update(
                    {n: m
                     for n, (_, m) in zip(names, models)})
            (pop, invalid_ind, param_history) = ea_gen.send(fitnesses)
            pop_names = [ind.name for ind in pop]
            fitted_models = {
                k: v
                for k, v in fitted_models.items() if k in pop_names
            }
            if not invalid_ind:
                break  # If there are no new solutions to try, break
        pop = evo_params.toolbox.select(pop, saved_ensemble_size)
        pop_names = [ind.name for ind in pop]
        models = [(k, v) for k, v in fitted_models.items() if k in pop_names]

    finally:
        columns = list(evo_params.deap_params['param_order'])
        columns += [
            'objective_{}_{}'.format(idx, 'min' if sw == -1 else 'max')
            for idx, sw in enumerate(evo_params.score_weights)
        ]
        if param_history:
            assert len(columns) == len(param_history[0])
            param_history = pd.DataFrame(np.array(param_history),
                                         columns=columns)
            param_history.to_csv(evo_params.history_file,
                                 index_label='parameter_set')
    return models
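
The loop above drives ea_general as a coroutine-style generator: next() yields the initial population, then each .send(fitnesses) feeds scores back and yields the next (pop, invalid_ind, param_history). A minimal standalone sketch of that next/send protocol (a toy, not elm's actual algorithm):

def toy_ea(pop, ngen):
    for _ in range(ngen):
        fitnesses = yield pop        # pause until .send(fitnesses)
        ranked = sorted(zip(pop, fitnesses), key=lambda t: t[1])
        pop = [p for p, _ in ranked]  # reorder best-first (minimization)

gen = toy_ea(['a', 'b', 'c'], ngen=2)
pop = next(gen)                  # prime: yields the initial population
pop = gen.send([0.2, 0.9, 0.1])  # feed fitnesses, receive next population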
Example #9
def final_on_sample_step(fitter,
                         model,
                         X,
                         fit_kwargs,
                         y=None,
                         sample_weight=None,
                         require_flat=True,
                         prepare_for='train'):
    '''This is the final transformation before the last estimator
    in a Pipeline is called.  It extracts the numpy array the
    estimator needs from X, typically an ElmStore with a "flat" DataArray

    Parameters:
        :fitter: fit function object
        :model:  the final estimator in a Pipeline
        :X:      ElmStore with DataArray "flat"
        :fit_kwargs: kwargs to fitter
        :y:      numpy array y if needed
        :sample_weight: numpy array if needed
        :require_flat: raise an error if the ElmStore has no "flat" band
        :prepare_for:  determines whether y is included in fit_args

    Returns
        :args, kwargs: that fitter should use

    '''
    fit_kwargs = copy.deepcopy(fit_kwargs or {})
    if y is None:
        y = fit_kwargs.pop('y', None)
    else:
        fit_kwargs.pop('y', None)
    if sample_weight is None:
        sample_weight = fit_kwargs.pop('sample_weight', None)
    else:
        fit_kwargs.pop('sample_weight', None)
    if isinstance(X, np.ndarray):
        X_values = X  # numpy array 2-d
    elif isinstance(X, (ElmStore, xr.Dataset)):
        if hasattr(X, 'flat'):
            X_values = X.flat.values
        else:
            logger.info(
                "After running Pipeline, X is an ElmStore/Dataset without "
                "a DataArray called 'flat'.  Found {}".format(type(X)))
            logger.info(
                "Trying elm.readers.reshape:flatten on X. If this fails, "
                "try an elm.pipeline.steps:ModifySample step to create an "
                "ElmStore with a 'flat' DataArray")
            X = _flatten(X)
            X_values = X.flat.values
    else:
        X_values = X  # may not be okay for sklearn models, e.g. KMeans, but can be passed through a Pipeline
    if X_values.ndim == 1:
        X_values = X_values.reshape(-1, 1)
    args, kwargs, var_keyword = get_args_kwargs_defaults(fitter)

    has_y = _has_arg(y)
    has_sw = _has_arg(sample_weight)
    if has_sw:
        fit_kwargs['sample_weight'] = sample_weight
    if 'check_input' in kwargs:
        fit_kwargs['check_input'] = True
    if has_y:
        if prepare_for == 'train':
            fit_args = (X_values, y)
        else:
            fit_args = (X, )
        logger.debug('X (shape {}) and y (shape {})'.format(
            X_values.shape, y.shape))
    else:
        if prepare_for == 'train':
            fit_args = (X_values, )
        else:
            fit_args = (X, )
        logger.debug('X (shape {})'.format(X_values.shape))
    check_array(X_values, "final_on_sample_step - X")
    if has_y:
        if not y.size == X_values.shape[0]:
            raise ValueError(
                "Bad size for y ({}) - does not match X.shape[0] ({})".format(
                    y.size, X_values.shape[0]))
    if has_sw:
        if not sample_weight.size == X_values.shape[0]:
            raise ValueError(
                "Bad size for sample_weight ({}) - does not match X.shape[0] ({})"
                .format(sample_weight.size, X_values.shape[0]))
    if 'batch_size' in model.get_params():
        logger.debug('set batch_size {}'.format(X_values.shape[0]))
        model.set_params(batch_size=X_values.shape[0])
    return fit_args, fit_kwargs
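
A hedged usage sketch with a plain numpy X, assuming the elm helpers referenced above (_has_arg, check_array, logger) are in scope; the returned pair is meant to be splatted into the fitter:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.rand(20, 3)
model = MiniBatchKMeans(n_clusters=2)
fit_args, fit_kwargs = final_on_sample_step(model.fit, model, X, {},
                                            prepare_for='train')
model.fit(*fit_args, **fit_kwargs)  # batch_size was set to X.shape[0]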