Пример #1
0
def make_parameter_grid(models,
                        header: str = "model") -> Union[Dict, List[Dict]]:
    """Generates a sklearn-compatible parameter grid to feed into GridSearchCV.

    Parameters
    ----------
    models : list/tuple/dict
        models can be one of:
            list/tuple: model names, each model uses its default parameter
            dict: key (model name), value tuple/list (parameter names) /
                dict: key (parameter name), value (list of values)
        Accepts shorthand versions of model names
    header : str, default='model'
        Key prefix used in the grid: the estimator is stored under `header`
        and each of its parameters under `header + "__" + <parameter name>`.

    Returns
    -------
    grid : dict or list of dict
        The parameter grid. A list/tuple input always yields a list with one
        dict per model. A dict input yields a single dict when it holds
        exactly one model, otherwise a list with one dict per model.

    Raises
    ------
    TypeError
        If `models` is not a list, tuple or dict, or if a value inside a
        `models` dict is not a list, tuple or dict.
    """
    if isinstance(models, (list, tuple)):
        return [{
            header: [find_sklearn_model(model)[0]],
            header + "__" + _get_default_param_name(model):
            broadsort(_get_default_params(model)),
        } for model in models]

    if isinstance(models, dict):

        def _handle_single_model(name, _val):
            if isinstance(_val, (list, tuple)):
                # if the values are list/tuple, they are parameter names, use defaults
                _p = {
                    header + "__" + _v:
                    broadsort(_get_default_params(name, _v))
                    for _v in _val
                }
            elif isinstance(_val, dict):
                # explicit parameter name -> candidate values mapping
                _p = {
                    header + "__" + k: broadsort(list(v))
                    for k, v in _val.items()
                }
            else:
                # fail loudly instead of silently placing None into the grid
                raise TypeError(
                    "parameters for model '{}' of type {} are not recognized; "
                    "choose from [list, tuple, dict]".format(name, type(_val)))
            _p[header] = listify(find_sklearn_model(name)[0])
            return _p

        arg = [
            _handle_single_model(model_name, val)
            for model_name, val in models.items()
        ]
        # a single model collapses to a plain dict
        return arg[0] if len(arg) == 1 else arg

    # format the message itself; the exception instance has no .format method
    raise TypeError(
        "input type for 'models' {} is not recognized; choose from [list, tuple, dict]"
        .format(type(models)))
Пример #2
0
 def _handle_single_model(name, _val):
     """Build the parameter-grid dictionary for one model.

     `_val` is either a list/tuple of parameter names (default value ranges
     are looked up through `_get_default_params`) or a dict mapping parameter
     names to candidate values. Every key is prefixed with `header + "__"`,
     and the estimator itself is stored under `header`. Any other `_val`
     type results in None being returned.
     """
     if isinstance(_val, (list, tuple)):
         # parameter names only: fall back to the default value ranges
         grid = {}
         for param_name in _val:
             key = header + "__" + param_name
             grid[key] = broadsort(_get_default_params(name, param_name))
     elif isinstance(_val, dict):
         # explicit candidate values supplied per parameter
         grid = {}
         for param_name, candidates in _val.items():
             key = header + "__" + param_name
             grid[key] = broadsort(list(candidates))
     else:
         return None
     estimator = find_sklearn_model(name)[0]
     grid[header] = listify(estimator)
     return grid
Пример #3
0
def get_best_model(cv_results, minimize: bool = True):
    """Returns the best model (with correct params) given the cv_results from a `fit_grid` call.

    The idea behind this function is to fetch from the pool of models the best model
    which could be fed directly into `fit_basic` to get the detailed plots.

    Parameters
    ----------
    cv_results : MetaPanda
        A dataframe result from `.ml.fit.grid`. Its `df_` attribute must hold
        a `mean_test_score` column, a `model` column and the
        `param_model__*` parameter columns.
    minimize : bool
        Determines whether the scoring function is minimized or maximized

    Returns
    -------
    M : sklearn model
        A parameterized sklearn model (unfitted).

    Notes
    -----
    The returned model is not fitted, you will need to do this yourself.

    Float parameter values that hold whole numbers are cast to `int` before
    being set on the model.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching
    """
    scores = cv_results.df_["mean_test_score"]
    select = scores.idxmin() if minimize else scores.idxmax()

    M = cv_results.df_.loc[select, "model"]
    # instantiate a model from text M
    inst_M = find_sklearn_model(M)[0]
    # get dict params; only the parameters that are non-null on the selected row
    param_columns = pattern("param_model__",
                            cv_results.df_.loc[select].dropna().index, False)
    # preprocess dict params to eliminate the header for sklearn models
    _old_params = cv_results.df_.loc[select, param_columns]
    # `n` must be passed by keyword: it is keyword-only in pandas >= 2.0
    _old_params.index = _old_params.index.str.rsplit("__", n=1).str[-1]
    params = _old_params.to_dict()
    # iterate through parameters and cast down potential floats to ints
    for k, v in params.items():
        if isinstance(v, float) and v.is_integer():
            params[k] = int(v)

    # set parameters in to the model.
    inst_M.set_params(**params)
    return inst_M
Пример #4
0
def learning(
    df: "MetaPanda",
    y: str,
    x: Optional[SelectorType] = None,
    train_n: Optional[np.ndarray] = None,
    permute_n: int = 0,
    cv: Union[int, Tuple[int, int]] = (5, 15),
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Fits a basic model to generate cross-validated training/test scores for different training set sizes.

    A cross-validation generator splits the whole dataset `k` times in training and test data. Subsets of the training set with
    varying sizes will be used to train the estimator and a score for each training subset size and the test set will be computed.
    Afterwards, the scores will be averaged over all `k` runs for each training subset size.

    Parameters
    ----------
    df : DataFrame / MetaPanda (n_samples, n_features)
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str/selector, optional
        A list of selected column names for x or MetaPanda `selector`.
    train_n : int/array-like, with shape (n_ticks,) dtype float or int, optional
        Relative or absolute numbers of training examples that will be used to generate
        learning curve related data.
        If None: uses `linspace(.1, .9, 8)`
        If int: uses `linspace(.1, .9, n)`
    permute_n : int (default 0)
        The number of times to permute y, if > 0, then does full permutation analysis (making 4th plot)
    cv : int/tuple, default=(5, 15)
        If int: just reflects number of cross-validations (a single repeat)
        If Tuple: (cross_validation, n_repeats) `for RepeatedKFold`
    model : str/estimator sklearn model that implements `fit` and `predict` methods
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        TODO: Not Implemented yet. The argument is currently ignored.
    plot : bool, optional
        If True, produces `.plot.learning_curve` inplace.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    results : MetaPanda (n_ticks, 9)
        Mean and std of the train score, test score, fit time and score time
        per training set size, plus an `N` column with the training set sizes.
    perm_score_, perm_scorez_, pval : optional
        Only returned (after `results`, as a 4-tuple) when `permute_n` > 0:
        the three values produced by `permutation_test_score`.

    Notes
    -----
    Shorthand names for the models, i.e `lm` for LinearRegression or `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `learning` uses the root mean squared error (RMSE). There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
    will arrive in a later version.

    `permute_n` is set to 0 by default, if you want a permutation histogram, this value must be > 0.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching.
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    """
    # perform checks
    # accepted types are passed as one tuple, matching the other checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(y, str)
    instance_check(train_n, (type(None), int, list, tuple, np.ndarray))
    instance_check(permute_n, int)
    instance_check(cv, (int, tuple))
    # instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # parenthesised so that an int `cv` becomes (cv, 1) and a tuple is
    # unpacked into (n_splits, n_repeats)
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)

    if train_n is None:
        train_n = np.linspace(0.1, 0.9, 8)
    elif isinstance(train_n, int):
        train_n = np.linspace(0.1, 0.9, train_n)
    # ml ready
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    rep = RepeatedKFold(n_splits=k, n_repeats=repeats)
    # vars_ = (train sizes, train scores, test scores, fit times, score times)
    vars_ = learning_curve(
        lm,
        _x,
        _y,
        train_sizes=train_n,
        cv=rep,
        scoring="neg_root_mean_squared_error",
        n_jobs=-2,
        verbose=verbose,
        return_times=True,
    )
    # permutation analysis if permute_n > 0
    if permute_n > 0:
        perm_score_, perm_scorez_, pval = permutation_test_score(
            lm,
            _x,
            _y,
            cv=rep,
            n_permutations=permute_n,
            scoring="neg_root_mean_squared_error",
            n_jobs=-2,
            verbose=verbose,
        )

    # outputs
    output_labels_ = ["train_score", "test_score", "fit_time", "score_time"]
    # format as df
    results = pd.DataFrame(
        # stack them together: means first, then standard deviations
        np.hstack(
            (
                np.stack([np.mean(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                np.stack([np.std(vars_[i], axis=1) for i in range(1, 5)], axis=1),
            )
        ),
        columns=list(
            it.chain(
                map(lambda s: "mean_" + s, output_labels_),
                map(lambda s: "std_" + s, output_labels_),
            )
        ),
    )
    # add N column
    results["N"] = vars_[0]
    R = MetaPanda(results)
    if plot and permute_n > 0:
        lcurve(R, perm_scorez_)
    elif plot:
        lcurve(R)
    # return as MetaPanda
    if permute_n > 0:
        return R, perm_score_, perm_scorez_, pval
    else:
        return R
Пример #5
0
def optimize(df: "MetaPanda",
             x: SelectorType,
             y: str,
             models,
             cv: int = 5,
             verbose: int = 0):
    """Performs optimization grid analysis on the models selected.

    Each model's continuous parameters (for example `alpha` in a Lasso model)
    are minimized with `scipy.optimize.minimize`, started once from every
    initial point supplied by `optimize_grid_for_model`.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : str / list / tuple of str / pd.Index
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/list/dict
        tuple/list: model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict: key (parameter name), value (list of values)
    cv : int, default=5
        Cross-validation setting handed to the objective `_min_cross_val_scores`.
    verbose : int, default=0
        Verbosity level, must lie between 0 and 4.

    Returns
    -------
    None
        The function is unfinished: the minimization results are computed
        but neither stored nor returned.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    """
    # validate the inputs
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    # unwrap the underlying frame and build the ML-ready arrays
    frame = df if isinstance(df, pd.DataFrame) else df.df_
    feature_cols = select_xcols(frame, x, y)
    features, target = preprocess_continuous_X_y(frame, feature_cols, y)

    # one parameter set per model
    grids = make_optimize_grid(models)

    for model_name, grid in zip(models, grids):
        estimator = find_sklearn_model(model_name)[0]
        starts, limits = optimize_grid_for_model(grid)
        # run one bounded minimization per starting point
        solutions = []
        for start in starts:
            solutions.append(
                so.minimize(
                    _min_cross_val_scores,
                    x0=start,
                    args=(features, target, estimator, grid, cv),
                    bounds=limits,
                )
            )
Пример #6
0
def basic(
    df: Union[pd.DataFrame, "MetaPanda"],
    y: str,
    x: Optional[SelectorType] = None,
    cv: Union[int, Tuple[int, int]] = 5,
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Performs a rudimentary fit model with no parameter searching.

    This function helps to provide a broad overview of how successful a given model is on the
    inputs of x -> y. `cv` returns scoring and timing metrics, as well as coefficients if available, whereas
    `yp` provides predicted values for each given `y`.

    Parameters
    ----------
    df : DataFrame / MetaPanda
        The main dataset.
    y : str
        Target/dependent variable (as column)
    x : list / tuple of str, optional
        A list of selected column names for independent variables. If None uses all except `y` column
    cv : int / tuple, default=5
        If int: just reflects number of cross-validations
        If Tuple: (cross_validation, n_repeats) `for RepeatedKFold`
    model : str / sklearn model, default="LinearRegression"
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        If not None, stores the resulting model parts in JSON and reloads if present.
        Two files are derived from it, suffixed `_cv` and `_yp`.
    plot : bool, default=False
        If True, produces `overview_plot` inplace.
    verbose : int, default=0
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    cv : MetaPanda
        A dataframe result of cross-validated repeats. Can include `w__` coefficients.
    yp : pd.DataFrame
        A single-column frame named after `y` with the cross-validated predictions.

    Notes
    -----
    Shorthand names for the models, i.e `lm` for LinearRegression
        or `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `fit_basic` uses the root mean squared error (RMSE). There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
    will arrive in a later version.

    See Also
    --------
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)
    assert is_sklearn_model(model), \
        "model '{}' is not a valid sklearn model.".format(model)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)

    rep = _define_regression_kfold_object(cv)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)
    # make data set machine learning ready.
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    if verbose > 0:
        # `_df` is always a plain DataFrame here, so use `.shape` rather than
        # MetaPanda-only attributes. ML columns = x columns + the y column.
        print(
            "full dataset: {}/{} -> ML: {}/{}({},{})".format(
                _df.shape[0], _df.shape[1], _x.shape[0], _x.shape[1] + 1, _x.shape[1], 1
            )
        )

    # function 1: performing cross-validated fit.
    # The second parameter is named `_xcols` so that it matches the keyword
    # used by the cached call below.
    def _perform_cv_fit(
        _x: np.ndarray, _xcols: pd.Index, _y: np.ndarray, _rep, _lm, package_name: str
    ):
        # cv cross-validate and wrap.
        score_mat = pd.DataFrame(
            cross_validate(
                _lm,
                _x,
                _y,
                cv=_rep,
                scoring="neg_root_mean_squared_error",
                return_estimator=True,
                return_train_score=True,
                n_jobs=-2,
            )
        )
        # append results to cv
        # one row per fitted split; derive the fold count from the row count
        # because RepeatedKFold does not expose an `n_splits` attribute
        n_fits = len(score_mat)
        # if repeatedkfold, add n_repeats
        if isinstance(_rep, RepeatedKFold):
            n_splits = n_fits // _rep.n_repeats
            # NOTE(review): RepeatedKFold yields all folds of repeat 0, then
            # repeat 1, ...; if "k" is meant to be the fold index, np.tile
            # may be intended here - confirm before changing.
            score_mat["k"] = np.repeat(np.arange(n_splits), _rep.n_repeats)
        else:
            score_mat["k"] = np.arange(n_fits)
        # extract coefficients
        coef = _extract_coefficients_from_model(score_mat, _xcols, package_name)
        # integrate coefficients
        if not isinstance(coef, (list, tuple)):
            score_mat = score_mat.join(coef.add_prefix("w__"))
        # drop estimator
        score_mat.drop("estimator", axis=1, inplace=True)
        # wrap as metapanda and return
        return MetaPanda(score_mat)

    # function 2: performing cross-validated predictions.
    def _perform_prediction_fit(
        _x: np.ndarray, _y: np.ndarray, _ind: pd.Index, _yn: str, _rep, _lm
    ) -> pd.DataFrame:
        return pd.Series(cross_val_predict(_lm, _x, _y, cv=_rep), index=_ind).to_frame(
            _yn
        )

    if cache is not None:
        cache_cv = insert_suffix(cache, "_cv")
        cache_yp = insert_suffix(cache, "_yp")
        _cv = cache_function(
            cache_cv,
            _perform_cv_fit,
            _x=_x,
            _xcols=_xcols,
            _y=_y,
            _rep=rep,
            _lm=lm,
            package_name=pkg_name,
        )
        _yp = cache_function(
            cache_yp,
            _perform_prediction_fit,
            _x=_x,
            _y=_y,
            _ind=_df.index,
            _yn=y,
            _rep=rep,
            _lm=lm,
        )
    else:
        _cv = _perform_cv_fit(_x, _xcols, _y, rep, lm, pkg_name)
        # argument order follows the helper's signature (same as the cached call)
        _yp = _perform_prediction_fit(_x, _y, _df.index, y, rep, lm)

    if plot:
        overview(_df, x, y, _cv, _yp)
    # return both.
    return _cv, _yp