Example #1
File: utils.py Project: jonppe/pycaret
def save_config(file_name: str, globals_d: dict):
    """
    This function is used to save all environment variables to a file,
    allowing you to later resume modeling without rerunning setup().

    Example
    -------
    >>> save_config('myvars.pkl') 

    This will save all environment variables to 'myvars.pkl'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"])

    logger = get_logger()

    logger.info("Initializing save_config()")
    logger.info(f"save_config({function_params_str})")

    globals_to_dump = {
        k: v
        for k, v in globals_d.items() if k in globals_d["pycaret_globals"]
    }

    import joblib

    joblib.dump(globals_to_dump, file_name)

    logger.info(f"Global variables dumped to {file_name}")
    logger.info(
        "save_config() succesfully completed......................................"
    )
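A minimal usage sketch (not from the project): assuming save_config() and get_logger() are importable, and the calling module's globals carry a pycaret_globals set naming the variables to persist, the illustrative names below would be written to disk.

# Illustrative globals; only the names listed in pycaret_globals are dumped.
pycaret_globals = {"pycaret_globals", "seed", "n_jobs_param"}
seed = 123
n_jobs_param = -1

save_config("myvars.pkl", globals())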
Example #2
    def __init__(
        self,
        estimator,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        groups=None,
        **fit_kwargs,
    ):
        logger = get_logger()
        self.estimator = deepcopy(estimator)
        if not is_fitted(self.estimator):
            try:
                self.estimator._carry_over_final_estimator_fit_vars()
            except Exception:
                pass
            if not is_fitted(self.estimator):
                logger.info(
                    f"fit_if_not_fitted: {estimator} is not fitted, fitting")
                try:
                    self.estimator.fit(X_train, y_train, groups=groups, **fit_kwargs)
                except Exception:
                    # fall back for estimators whose fit() does not accept groups
                    self.estimator.fit(X_train, y_train, **fit_kwargs)
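The fit-only-if-needed guard above can be sketched with plain scikit-learn; check_is_fitted() raises NotFittedError for an unfitted estimator and serves here as a simplified stand-in for the is_fitted() helper.

from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

def fit_if_needed(estimator, X, y, **fit_kwargs):
    """Fit the estimator only if it has not been fitted yet (simplified sketch)."""
    try:
        check_is_fitted(estimator)  # raises NotFittedError when unfitted
    except NotFittedError:
        estimator.fit(X, y, **fit_kwargs)
    return estimator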
Example #3
def can_early_stop(
    estimator,
    consider_partial_fit,
    consider_warm_start,
    consider_xgboost,
    params,
):
    """
    From https://github.com/ray-project/tune-sklearn/blob/master/tune_sklearn/tune_basesearch.py.
    
    Helper method to determine if it is possible to do early stopping.
    Only sklearn estimators with ``partial_fit`` or ``warm_start`` can be early
    stopped. warm_start works by picking up training from the previous
    call to ``fit``.
    
    Returns
    -------
    bool
        True if the estimator can be early stopped.
    """

    logger = get_logger()

    from sklearn.tree import BaseDecisionTree
    from sklearn.ensemble import BaseEnsemble

    try:
        # when the estimator is a Pipeline, inspect its final step
        base_estimator = estimator.steps[-1][1]
    except Exception:
        base_estimator = estimator

    if consider_partial_fit:
        can_partial_fit = supports_partial_fit(base_estimator, params=params)
    else:
        can_partial_fit = False

    if consider_warm_start:
        is_not_tree_subclass = not issubclass(type(base_estimator),
                                              BaseDecisionTree)
        is_ensemble_subclass = issubclass(type(base_estimator), BaseEnsemble)
        can_warm_start = hasattr(base_estimator, "warm_start") and (
            (hasattr(base_estimator, "max_iter") and is_not_tree_subclass
             and not is_ensemble_subclass) or
            (is_ensemble_subclass and hasattr(base_estimator, "n_estimators")))
    else:
        can_warm_start = False

    if consider_xgboost:
        from xgboost.sklearn import XGBModel

        is_xgboost = isinstance(base_estimator, XGBModel)
    else:
        is_xgboost = False

    logger.info(
        f"can_partial_fit: {can_partial_fit}, can_warm_start: {can_warm_start}, is_xgboost: {is_xgboost}"
    )

    return can_partial_fit or can_warm_start or is_xgboost
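For intuition, the attribute checks this function relies on can be reproduced directly on scikit-learn estimators (the model choices below are illustrative): SGDClassifier exposes partial_fit, LogisticRegression supports warm_start together with max_iter, and a plain DecisionTreeClassifier supports neither.

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier

print(hasattr(SGDClassifier(), "partial_fit"))         # True  -> incremental fitting possible
print(hasattr(LogisticRegression(), "warm_start"),
      hasattr(LogisticRegression(), "max_iter"))        # True True -> warm-start path
print(hasattr(DecisionTreeClassifier(), "partial_fit"),
      hasattr(DecisionTreeClassifier(), "warm_start"))  # False False -> cannot early stop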
Example #4
def load_config(file_name: str, globals_d: dict):
    """
    This function is used to load environment variables from a file created with save_config(),
    allowing you to later resume modeling without rerunning setup().


    Example
    -------
    >>> load_config('myvars.pkl') 

    This will load all environment variables from 'myvars.pkl'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing load_config()")
    logger.info(f"load_config({function_params_str})")

    import joblib

    loaded_globals = joblib.load(file_name)

    logger.info(f"Global variables loaded from {file_name}")

    for k, v in loaded_globals.items():
        globals_d[k] = v

    globals_d["logger"] = get_logger()

    logger.info(f"Global variables set to match those in {file_name}")

    logger.info(
        "load_config() succesfully completed......................................"
    )
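A round-trip sketch pairing load_config() with save_config() above (the file name and variable names are illustrative, and both functions are assumed to be importable):

pycaret_globals = {"pycaret_globals", "seed"}
seed = 123

save_config("myvars.pkl", globals())   # persist the tracked globals
seed = 999                             # ... session state changes ...
load_config("myvars.pkl", globals())   # restore them
print(seed)                            # 123 again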
Example #5
def get_groups(groups: Union[str, pd.DataFrame], X_train: pd.DataFrame,
               default: pd.DataFrame):
    logger = get_logger()
    if groups is None:
        return default
    if isinstance(groups, str):
        if groups not in X_train.columns:
            raise ValueError(
                f"Column {groups} used for groups is not present in the dataset."
            )
        groups = X_train[groups]
    else:
        if groups.shape[0] != X_train.shape[0]:
            raise ValueError(
                f"groups has lenght {groups.shape[0]} which doesn't match X_train length of {len(X_train)}."
            )
    return groups
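A short sketch of how the resolved group labels are typically consumed; the 'store_id' column is hypothetical, and the returned Series can be passed straight to scikit-learn's GroupKFold.

import pandas as pd
from sklearn.model_selection import GroupKFold

X_train = pd.DataFrame({"feature": [1, 2, 3, 4], "store_id": [1, 1, 2, 2]})
y_train = pd.Series([0, 1, 0, 1])

groups = get_groups("store_id", X_train, default=None)  # column name resolved to a Series
cv = GroupKFold(n_splits=2)
for train_idx, test_idx in cv.split(X_train, y_train, groups=groups):
    print(train_idx, test_idx)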
Example #6
def set_config(variable: str, value, globals_d: dict):

    """
    This function is used to reset global environment variables.

    Example
    -------
    >>> set_config('seed', 123) 

    This will set the global seed to '123'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing set_config()")
    logger.info(f"set_config({function_params_str})")

    if variable.startswith("_"):
        raise ValueError(f"Variable {variable} is read only ('_' prefix).")

    if variable not in globals_d["pycaret_globals"] or variable == "pycaret_globals":
        raise ValueError(
            f"Variable {variable} not found. Possible variables are: {globals_d['pycaret_globals']}"
        )

    globals_d[variable] = value

    # special case
    if not globals_d["gpu_param"] and variable == "n_jobs_param":
        globals_d["_gpu_n_jobs_param"] = value

    logger.info(f"Global variable: {variable} updated to {value}")
    logger.info(
        "set_config() succesfully completed......................................"
    )
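A minimal sketch of the guard rails, assuming a calling module whose globals already carry the pycaret_globals whitelist (all names below are illustrative):

pycaret_globals = {"pycaret_globals", "seed", "gpu_param", "n_jobs_param", "_gpu_n_jobs_param"}
seed = 42
gpu_param = False
n_jobs_param = -1
_gpu_n_jobs_param = -1

set_config("seed", 123, globals())              # allowed: listed in pycaret_globals
# set_config("_gpu_n_jobs_param", 1, globals()) # would raise: '_' prefix marks it read only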
Example #7
def get_config(variable: str, globals_d: dict):

    """
    This function is used to access global environment variables.

    Example
    -------
    >>> X_train = get_config('X_train') 

    This will return the transformed X_train dataset.

    Returns
    -------
    variable

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing get_config()")
    logger.info(f"get_config({function_params_str})")

    if variable not in globals_d["pycaret_globals"]:
        raise ValueError(
            f"Variable {variable} not found. Possible variables are: {globals_d['pycaret_globals']}"
        )

    global_var = globals_d[variable]

    logger.info(f"Global variable: {variable} returned as {global_var}")
    logger.info(
        "get_config() succesfully completed......................................"
    )

    return global_var
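Continuing the illustrative globals used for set_config() above, reading a value back looks like this (a sketch, not project code):

pycaret_globals = {"pycaret_globals", "seed"}
seed = 123

current_seed = get_config("seed", globals())
print(current_seed)  # 123
# get_config("unknown", globals()) would raise ValueError listing pycaret_globals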
Example #8
    def __init__(
        self,
        verbose: bool = True,
        html_param: bool = True,
        progress_args: Optional[Dict[str, Any]] = None,
        master_display_columns: Optional[List[str]] = None,
        monitor_rows: Optional[List[List[str]]] = None,
        round: int = 4,
    ):
        self.logger = get_logger()
        self.verbose = verbose
        self.html_param = html_param
        self.round = round
        try:
            self.enviroment = str(get_ipython())
            self.enviroment = (
                "google.colab" if is_in_colab() else self.enviroment
            )
        except Exception:
            # not running inside an IPython/Jupyter shell
            self.enviroment = ""

        if not self.verbose:
            return

        self.logger.info("Preparing display monitor")

        # progress bar
        if progress_args and self.verbose and self.html_param:
            progress_args = {**self.default_progress_args, **progress_args}
            self.progress = ipw.IntProgress(**progress_args)

        if master_display_columns:
            self.master_display = pd.DataFrame(columns=master_display_columns)

        if monitor_rows and self.html_param:
            self.monitor = pd.DataFrame(
                monitor_rows,
                columns=[" " * i for i in range(len(monitor_rows[0]))],
            ).set_index("")
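The notebook widgets this constructor wires together can be sketched stand-alone with ipywidgets and pandas; the progress bounds and monitor rows below are illustrative only.

import ipywidgets as ipw
import pandas as pd
from IPython.display import display

# Progress bar analogous to self.progress above.
progress = ipw.IntProgress(value=0, min=0, max=4, description="Processing: ")
display(progress)
progress.value += 1  # advance one step

# Status table analogous to self.monitor: blank column names of increasing width,
# with the first (empty) name used as the index.
monitor_rows = [["Initiated", ". . . . .", "loading dependencies"],
                ["Status", ". . . . .", "initializing"]]
monitor = pd.DataFrame(
    monitor_rows,
    columns=[" " * i for i in range(len(monitor_rows[0]))],
).set_index("")
display(monitor)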
Example #9
def show_yellowbrick_plot(
    visualizer,
    X_train,
    y_train,
    X_test,
    y_test,
    name: str,
    handle_train: str = "fit",
    handle_test: str = "score",
    scale: float = 1,
    save: bool = False,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Any] = None,
    display: Optional[Display] = None,
    display_format: Optional[str] = None,
    **kwargs,
):
    """
    Generic method to handle yellowbrick plots.
    """
    logger = get_logger()
    visualizer.fig.set_dpi(visualizer.fig.dpi * scale)

    if not fit_kwargs:
        fit_kwargs = {}

    fit_kwargs_and_kwargs = {**fit_kwargs, **kwargs}

    if handle_train == "draw":
        logger.info("Drawing Model")
        visualizer.draw(X_train, y_train, **kwargs)
    elif handle_train == "fit":
        logger.info("Fitting Model")
        visualizer.fit(X_train, y_train, **fit_kwargs_and_kwargs)
    elif handle_train == "fit_transform":
        logger.info("Fitting & Transforming Model")
        visualizer.fit_transform(X_train, y_train, **fit_kwargs_and_kwargs)
    elif handle_train == "score":
        logger.info("Scoring train set")
        visualizer.score(X_train, y_train, **kwargs)

    display.move_progress()

    if handle_test == "draw":
        visualizer.draw(X_test, y_test)
    elif handle_test == "fit":
        visualizer.fit(X_test, y_test, **fit_kwargs)
    elif handle_test == "fit_transform":
        visualizer.fit_transform(X_test, y_test, **fit_kwargs)
    elif handle_test == "score":
        logger.info("Scoring test/hold-out set")
        visualizer.score(X_test, y_test)

    display.move_progress()
    display.clear_output()

    if save:
        logger.info(f"Saving '{name}.png' in current active directory")
        visualizer.show(outpath=f"{name}.png", clear_figure=True)
    else:
        if display_format == "streamlit":
            show_yellowbrick_in_streamlit(visualizer, clear_figure=True)
        else:
            visualizer.show(clear_figure=True)

    logger.info("Visual Rendered Successfully")
Example #10
    def __create_resplots(
        self,
        model,
        x: np.ndarray,
        y: np.ndarray,
        x_test: Optional[np.ndarray] = None,
        y_test: Optional[np.ndarray] = None,
    ) -> widgets.VBox:
        logger = get_logger()

        with fit_if_not_fitted(model, x, y) as fitted_model:
            fitted = fitted_model.predict(x)
            fitted_residuals = fitted - y

            if x_test is not None and y_test is not None:
                pred = fitted_model.predict(x_test)
                prediction_residuals = pred - y_test

                predictions = np.concatenate((fitted, pred))
                residuals = np.concatenate((fitted_residuals, prediction_residuals))
                split_origin = np.concatenate(
                    (np.repeat("train", fitted.shape[0]), np.repeat("test", pred.shape[0]))
                )

                x = np.concatenate((x, x_test))
                y = np.concatenate((y, y_test))

            else:
                predictions = fitted
                residuals = fitted_residuals
                split_origin = None

        logger.info("Calculated model residuals")
        self.display.move_progress()

        tukey_anscombe_widget = TukeyAnscombeWidget(
            predictions, residuals, split_origin=split_origin
        )
        logger.info("Calculated Tunkey-Anscombe Plot")
        self.figures.append(tukey_anscombe_widget)
        self.display.move_progress()

        qq_plot_widget = QQPlotWidget(
            predictions, y, split_origin=split_origin, featuresize=x.shape[1]
        )
        logger.info("Calculated Normal QQ Plot")
        self.figures.append(qq_plot_widget)
        self.display.move_progress()

        standardized_residuals = helper.calculate_standardized_residual(
            predictions, y, None
        )
        model_norm_residuals_abs_sqrt = np.sqrt(np.abs(standardized_residuals))
        scale_location_widget = ScaleLocationWidget(
            predictions, model_norm_residuals_abs_sqrt, split_origin=split_origin
        )
        logger.info("Calculated Scale-Location Plot")
        self.figures.append(scale_location_widget)
        self.display.move_progress()

        leverage = helper.leverage_statistic(np.array(x))

        n_model_params = len(model.get_params())
        distance = helper.cooks_distance(
            standardized_residuals, leverage, n_model_params=n_model_params
        )
        cooks_distance_widget = CooksDistanceWidget(
            leverage,
            distance,
            standardized_residuals,
            n_model_params,
            split_origin=split_origin,
        )
        logger.info("Calculated Residual vs Leverage Plot inc. Cook's distance")
        self.figures.append(cooks_distance_widget)
        self.display.move_progress()

        items_layout = Layout(width="1000px")
        h0 = widgets.HBox(self.figures[:2], layout=items_layout)
        h1 = widgets.HBox(self.figures[2:], layout=items_layout)
        return widgets.VBox([h0, h1])
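The quantities fed into these widgets can be sketched with plain numpy and scikit-learn; the leverage, standardized residuals, and Cook's distance below follow the usual textbook definitions rather than the project's helper module, which is not shown here.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=3, noise=5, random_state=0)
model = LinearRegression().fit(X, y)

predictions = model.predict(X)
residuals = predictions - y                     # same orientation as fitted - y above

# Leverage: diagonal of the hat matrix H = X (X'X)^-1 X' (with an intercept column).
X_design = np.column_stack([np.ones(len(X)), X])
hat = X_design @ np.linalg.inv(X_design.T @ X_design) @ X_design.T
leverage = np.diag(hat)

# Internally standardized residuals and a simple Cook's distance.
p = X_design.shape[1]
mse = np.sum(residuals**2) / (len(y) - p)
standardized = residuals / np.sqrt(mse * (1 - leverage))
cooks_d = (standardized**2 / p) * (leverage / (1 - leverage))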