Example #1
def test_expand_features_encoded():
    df_A = pd.DataFrame({
        'a': ["id1", "id2", "id3", "id4"],
        'b': [10.0, 13.0, 100.0, 13.0],
        'c': [0, 1, 100, 0],
        'd': [2, 1, 2, 0.5],
        'e': [0, 1, 0, 1]
    })

    df_B = pd.DataFrame({
        'fklearn_feat__a==1': ["id1", "id2", "id3", "id4"],
        'fklearn_feat__a==2': [10.0, 13.0, 100.0, 13.0],
        'fklearn_feat__a==nan': [0, 1, 100, 0],
        'b': [2, 1, 2, 0.5],
        'c': [0, 1, 0, 1]
    })

    df_C = pd.DataFrame({
        'fklearn_feat__a==1': ["id1", "id2", "id3", "id4"],
        'fklearn_feat__a==2': [10.0, 13.0, 100.0, 13.0],
        'fklearn_feat__a==nan': [0, 1, 100, 0],
        'b': [2, 1, 2, 0.5],
        'fklearn_feat__c==10': [0, 1, 0, 1]
    })

    features_all = ["a", "b", "c", "d", "e"]
    features_partial = ["a", "b", "c"]
    features_partialler = ["a", "b"]

    transformed_1 = expand_features_encoded(df_A, features_all)
    expected_1 = ["a", "b", "c", "d", "e"]

    transformed_2 = expand_features_encoded(df_A, features_partial)
    expected_2 = ["a", "b", "c"]

    transformed_3 = expand_features_encoded(df_B, features_partial)
    expected_3 = [
        "fklearn_feat__a==1", "fklearn_feat__a==2", "fklearn_feat__a==nan",
        "b", "c"
    ]

    transformed_4 = expand_features_encoded(df_C, features_partial)
    expected_4 = [
        "fklearn_feat__a==1", "fklearn_feat__a==2", "fklearn_feat__a==nan",
        "b", "fklearn_feat__c==10"
    ]

    transformed_5 = expand_features_encoded(df_C, features_partialler)
    expected_5 = [
        "fklearn_feat__a==1", "fklearn_feat__a==2", "fklearn_feat__a==nan", "b"
    ]

    assert Counter(transformed_1) == Counter(expected_1)
    assert Counter(transformed_2) == Counter(expected_2)
    assert Counter(transformed_3) == Counter(expected_3)
    assert Counter(transformed_4) == Counter(expected_4)
    assert Counter(transformed_5) == Counter(expected_5)
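
For reference, a minimal sketch of what `expand_features_encoded` could look like, consistent with the behavior asserted above (an illustration of the contract, not fklearn's actual implementation):

from typing import List

import pandas as pd


def expand_features_encoded_sketch(df: pd.DataFrame,
                                   features: List[str]) -> List[str]:
    # One-hot columns produced by fklearn's encoders follow the pattern
    # fklearn_feat__<col>==<value>.
    expanded: List[str] = []
    for feat in features:
        encoded = [col for col in df.columns
                   if col.startswith(f"fklearn_feat__{feat}==")]
        # Keep the encoded columns when they exist, otherwise the raw feature.
        expanded.extend(encoded if encoded else [feat])
    return expanded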
Example #2
def non_parametric_double_ml_learner(df: pd.DataFrame,
                                     feature_columns: List[str],
                                     treatment_column: str,
                                     outcome_column: str,
                                     debias_model: Union[RegressorMixin, None] = None,
                                     debias_feature_columns: List[str] = None,
                                     denoise_model: Union[RegressorMixin, None] = None,
                                     denoise_feature_columns: List[str] = None,
                                     final_model: Union[RegressorMixin, None] = None,
                                     final_model_feature_columns: List[str] = None,
                                     prediction_column: str = "prediction",
                                     cv_splits: int = 2,
                                     encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a Non-Parametric Double/ML Meta Learner for Conditional Average Treatment Effect Estimation. It implements
    the following steps:
    1) fits k instances of the debias model to predict the treatment from the features and get out-of-fold residuals
        t_res=t-t_hat;
    2) fits k instances of the denoise model to predict the outcome from the features and get out-of-fold residuals
        y_res=y-y_hat;
    3) fits a final ML model to predict y_res / t_res from the features using weighted regression with weights set to
        t_res^2. Trained like this, the final model will output treatment effect predictions.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features, treatment and target columns.
        The model will be trained to predict the target column
        from the features.

    feature_columns : list of str
        A list of column names that are used as features for the denoise, debias and final models in double-ml.
        All these names should be in `df`.

    treatment_column : str
        The name of the column in `df` that should be used as treatment for the double-ml model. It will learn the
        impact of this column with respect to the outcome column.

    outcome_column : str
        The name of the column in `df` that should be used as outcome for the double-ml model. It will learn the impact
        of the treatment column on this outcome column.

    debias_model : RegressorMixin (default None)
        The estimator for fitting the treatment from the features. Must implement fit and predict methods. It can
        be a scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    debias_feature_columns : list of str (default None)
        A list of column names to be used only for the debias model. If not None, it will replace feature_columns when
        fitting the debias model.

    denoise_model : RegressorMixin (default None)
        The estimator for fitting the outcome from the features. Must implement fit and predict methods. It can
        be a scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    denoise_feature_columns : list of str (default None)
        A list of column names to be used only for the denoise model. If not None, it will replace feature_columns when
        fitting the denoise model.

    final_model : RegressorMixin (default None)
        The estimator for fitting the outcome residuals from the treatment residuals. Must implement fit and predict
        methods. It can be an arbitrary scikit-learn regressor. The fit method must accept sample_weight as a keyword
        argument. When None, defaults to GradientBoostingRegressor.

    final_model_feature_columns : list of str (default None)
        A list of column names to be used only for the final model. If not None, it will replace feature_columns when
        fitting the final model.

    prediction_column : str (default "prediction")
        The name of the column with the treatment effect predictions from the final model.

    cv_splits : int (default 2)
        Number of folds to split the training data when fitting the debias and denoise models.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    features = feature_columns if not encode_extra_cols else expand_features_encoded(df, feature_columns)

    debias_model = GradientBoostingRegressor() if not debias_model else debias_model
    denoise_model = GradientBoostingRegressor() if not denoise_model else denoise_model
    final_model = GradientBoostingRegressor() if not final_model else final_model

    t_hat, mts = _cv_estimate(debias_model, df,
                              features if debias_feature_columns is None else debias_feature_columns,
                              treatment_column, cv_splits)
    y_hat, mys = _cv_estimate(denoise_model, df,
                              features if denoise_feature_columns is None else denoise_feature_columns,
                              outcome_column, cv_splits)

    y_res = df[outcome_column] - y_hat
    t_res = df[treatment_column] - t_hat

    final_target = y_res / t_res
    weights = t_res ** 2
    final_model_x = features if final_model_feature_columns is None else final_model_feature_columns

    model_final_fitted = final_model.fit(X=df[final_model_x],
                                         y=final_target,
                                         sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{prediction_column: model_final_fitted.predict(new_df[final_model_x].values)})

    p.__doc__ = learner_pred_fn_docstring("non_parametric_double_ml_learner")

    log = {
        'non_parametric_double_ml_learner': {
            'features': feature_columns,
            'debias_feature_columns': debias_feature_columns,
            'denoise_feature_columns': denoise_feature_columns,
            'final_model_feature_columns': final_model_feature_columns,
            'outcome_column': outcome_column,
            'treatment_column': treatment_column,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, model_final_fitted.feature_importances_)),
            'training_samples': len(df)
        },
        'debias_models': mts,
        'denoise_models': mys,
        'cv_splits': cv_splits,
        'object': model_final_fitted
    }

    return p, p(df), log
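
The `_cv_estimate` helper used in steps 1 and 2 is not shown above. A hedged sketch of the cross-fitting it performs, assuming scikit-learn's KFold (fklearn's actual helper may differ in details):

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold


def _cv_estimate_sketch(model, data: pd.DataFrame,
                        feature_cols, y_col: str, n_splits: int):
    # Cross-fitting: every row is predicted by a model that never saw it,
    # which is what makes the residuals t_res and y_res out-of-fold.
    cv = KFold(n_splits=n_splits)
    predictions = pd.Series(np.nan, index=data.index)
    models = []
    for train_idx, test_idx in cv.split(data):
        fold_model = clone(model).fit(data[feature_cols].iloc[train_idx],
                                      data[y_col].iloc[train_idx])
        predictions.iloc[test_idx] = fold_model.predict(
            data[feature_cols].iloc[test_idx])
        models.append(fold_model)
    return predictions, models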
Example #3
def lgbm_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an LGBM classifier to the dataset.

    It first generates a Dataset with the specified features and labels
    from `df`. Then, it fits an LGBM model to this Dataset. Returns the
    predict function for the model and the predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
       A pandas DataFrame with features and target columns.
       The model will be trained to predict the target column
       from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in updates to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta actually shrinks the
        feature weights to make the boosting process more conservative.
        See the learning_rate hyper-parameter in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst

    num_estimators : int
        Int in the range (0, inf)
        Number of boosted trees to fit.
        See the num_iterations hyper-parameter in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the LGBM model. See the list in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    import lightgbm as lgbm

    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(
        params, "objective", 'binary')

    weights = df[weight_column].values if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    dtrain = lgbm.Dataset(df[features].values,
                          label=df[target],
                          feature_name=list(map(str, features)),
                          weight=weights)

    bst = lgbm.train(params, dtrain, num_estimators)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        if params["objective"] == "multiclass":
            col_dict = {
                prediction_column + "_" + str(key): value
                for (key, value) in enumerate(bst.predict(new_df[features].values).T)
            }
        else:
            col_dict = {
                prediction_column: bst.predict(new_df[features].values)
            }

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(bst)
            shap_values = explainer.shap_values(new_df[features])
            shap_expected_value = explainer.expected_value

            if params["objective"] == "multiclass":
                shap_values_multiclass = {
                    f"shap_values_{class_index}": list(value)
                    for (class_index, value) in enumerate(shap_values)
                }
                shap_expected_value_multiclass = {
                    f"shap_expected_value_{class_index}":
                    np.repeat(expected_value, len(class_shap_values))
                    for (class_index, (expected_value, class_shap_values)
                         ) in enumerate(zip(shap_expected_value, shap_values))
                }
                shap_output = merge(shap_values_multiclass,
                                    shap_expected_value_multiclass)

            else:
                shap_values = list(shap_values[1])
                shap_output = {
                    "shap_values": shap_values,
                    "shap_expected_value": np.repeat(shap_expected_value[1],
                                                     len(shap_values))
                }

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("lgbm_classification_learner",
                                          shap=True)

    log = {
        'lgbm_classification_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "lightgbm",
            'package_version': lgbm.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': dict(zip(features, bst.feature_importance().tolist())),
            'training_samples': len(df)
        },
        'object': bst
    }

    return p, p(df), log
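
A minimal usage sketch (toy data, illustrative only; assumes lightgbm is installed). Like the other fklearn learners, it returns the prediction function, the scored training set, and the training log:

import pandas as pd

train = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                      "x2": [0.0, 1.0, 0.0, 1.0],
                      "y": [0, 0, 1, 1]})

predict_fn, train_scored, log = lgbm_classification_learner(
    train, features=["x1", "x2"], target="y", num_estimators=10)

# predict_fn scores any DataFrame with the same feature columns; passing
# apply_shap=True also attaches SHAP values (requires the shap package).
scored = predict_fn(train)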
Example #4
def catboost_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a CatBoost classifier to the dataset. It first generates a Pool
    with the specified features and labels from `df`. Then, it fits a CatBoost
    model to this Pool. Returns the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in updates to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta actually shrinks the
        feature weights to make the boosting process more conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in the range (0, inf)
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for i in range(0,n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """
    from catboost import Pool, CatBoostClassifier
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(
        params, "objective", 'Logloss')

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    cat_features = params["cat_features"] if "cat_features" in params else None

    dtrain = Pool(df[features].values,
                  df[target].values,
                  weight=weights,
                  feature_names=list(map(str, features)),
                  cat_features=cat_features)

    cat_boost_classifier = CatBoostClassifier(iterations=num_estimators,
                                              **params)
    cbr = cat_boost_classifier.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

        pred = cbr.predict_proba(new_df[features])
        if params["objective"] == "MultiClass":
            col_dict = {
                prediction_column + "_" + str(key): value
                for (key, value) in enumerate(pred.T)
            }
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            col_dict = {prediction_column: pred[:, 1]}

        if apply_shap:
            import shap
            if params["objective"] == "MultiClass":
                shap_values = _get_catboost_shap_values(
                    df, cbr, features, target, weights, cat_features)
                # catboost shap returns a list for each row, we reformat it to return
                # a list for each class
                shap_values = shap_values.transpose(1, 0, 2)
                shap_values_multiclass = {
                    f"shap_values_{class_index}": list(value[:, :-1])
                    for (class_index, value) in enumerate(shap_values)
                }
                shap_expected_value_multiclass = {
                    f"shap_expected_value_{class_index}": value[:, -1]
                    for (class_index, value) in enumerate(shap_values)
                }
                shap_output = merge(shap_values_multiclass,
                                    shap_expected_value_multiclass)

            else:
                explainer = shap.TreeExplainer(cbr)
                shap_values = explainer.shap_values(new_df[features])
                shap_expected_value = explainer.expected_value
                shap_values = list(shap_values)
                shap_output = {
                    "shap_values": shap_values,
                    "shap_expected_value": np.repeat(shap_expected_value,
                                                     len(shap_values))
                }

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner",
                                          shap=True)

    log = {
        'catboost_classification_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "catboost",
            'package_version': catboost.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': cbr.feature_importances_,
            'training_samples': len(df)
        },
        'object': cbr
    }

    return p, p(df), log
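
A usage sketch for a multiclass setup (toy data; assumes catboost is installed). With the "MultiClass" objective, the prediction function adds one prediction_i column per class plus an argmax prediction column:

import pandas as pd

train = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                      "y": [0, 1, 2, 0, 1, 2]})

predict_fn, scored, log = catboost_classification_learner(
    train, features=["x1"], target="y", num_estimators=10,
    extra_params={"objective": "MultiClass"})

# scored now has prediction_0, prediction_1, prediction_2 plus prediction.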
Example #5
def logistic_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a logistic regression classifier to the dataset. Returns the predict function
    for the model and the predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    params : dict
        The LogisticRegression parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for i in range(0,n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    def_params = {"C": 0.1, "multi_class": "ovr", "solver": "liblinear"}
    merged_params = def_params if not params else merge(def_params, params)

    weights = df[weight_column].values if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    clf = LogisticRegression(**merged_params)
    clf.fit(df[features].values, df[target].values, sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        pred = clf.predict_proba(new_df[features].values)
        if merged_params["multi_class"] == "multinomial":
            col_dict = {
                prediction_column + "_" + str(key): value
                for (key, value) in enumerate(pred.T)
            }
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            col_dict = {prediction_column: pred[:, 1]}

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner")

    log = {
        'logistic_classification_learner': {
            'features': features,
            'target': target,
            'parameters': merged_params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, clf.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': clf
    }

    return p, p(df), log
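
A usage sketch (toy data): with the default binary setup, the prediction column holds the probability of the positive class, and weight_column makes each row contribute proportionally to its weight during fitting:

import pandas as pd

train = pd.DataFrame({"x1": [0.1, 0.2, 0.8, 0.9],
                      "w": [1.0, 1.0, 2.0, 2.0],
                      "y": [0, 0, 1, 1]})

predict_fn, scored, log = logistic_classification_learner(
    train, features=["x1"], target="y", weight_column="w")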
Example #6
def xgb_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an XGBoost classifier to the dataset. It first generates a DMatrix
    with the specified features and labels from `df`. Then, it fits an XGBoost
    model to this DMatrix. Returns the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in updates to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta actually shrinks the
        feature weights to make the boosting process more conservative.
        See the eta hyper-parameter in:
        http://xgboost.readthedocs.io/en/latest/parameter.html

    num_estimators : int
        Int in the range (0, inf)
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        http://xgboost.readthedocs.io/en/latest/python/python_api.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the XGBoost model. See the list in:
        http://xgboost.readthedocs.io/en/latest/parameter.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for i in range(0,n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    import xgboost as xgb

    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(
        params, "objective", 'binary:logistic')

    weights = df[weight_column].values if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    dtrain = xgb.DMatrix(df[features].values,
                         label=df[target].values,
                         feature_names=list(map(str, features)),
                         weight=weights)

    bst = xgb.train(params, dtrain, num_estimators)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

        dtest = xgb.DMatrix(new_df[features].values,
                            feature_names=list(map(str, features)))

        pred = bst.predict(dtest)
        if params["objective"] == "multi:softprob":
            col_dict = {
                prediction_column + "_" + str(key): value
                for (key, value) in enumerate(pred.T)
            }
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            col_dict = {prediction_column: pred}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(bst)
            shap_values = explainer.shap_values(new_df[features])
            shap_expected_value = explainer.expected_value

            if params["objective"] == "multi:softprob":
                shap_values_multiclass = {
                    f"shap_values_{class_index}": list(value)
                    for (class_index, value) in enumerate(shap_values)
                }
                shap_expected_value_multiclass = {
                    f"shap_expected_value_{class_index}":
                    np.repeat(expected_value, len(class_shap_values))
                    for (class_index, (expected_value, class_shap_values)
                         ) in enumerate(zip(shap_expected_value, shap_values))
                }
                shap_output = merge(shap_values_multiclass,
                                    shap_expected_value_multiclass)

            else:
                shap_values = list(shap_values)
                shap_output = {
                    "shap_values": shap_values,
                    "shap_expected_value": np.repeat(shap_expected_value,
                                                     len(shap_values))
                }

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner",
                                          shap=True)

    log = {
        'xgb_classification_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "xgboost",
            'package_version': xgb.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': bst.get_score(),
            'training_samples': len(df)
        },
        'object': bst
    }

    return p, p(df), log
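
A usage sketch for a multiclass setup (toy data; assumes xgboost is installed). The "multi:softprob" objective also requires num_class, and switches the output to one prediction_i column per class plus an argmax prediction column:

import pandas as pd

train = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                      "y": [0, 1, 2, 0, 1, 2]})

predict_fn, scored, log = xgb_classification_learner(
    train, features=["x1"], target="y", num_estimators=10,
    extra_params={"objective": "multi:softprob", "num_class": 3})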
Example #7
def isolation_forest_learner(
        df: pd.DataFrame,
        features: List[str],
        params: Dict[str, Any] = None,
        prediction_column: str = "prediction",
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an anomaly detection algorithm (Isolation Forest) to the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with feature columns.
        The model is fit without a target: it learns to score how
        anomalous each row is using only the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    params : dict
        The IsolationForest parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

    prediction_column : str
        The name of the column with the predictions from the model.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    # note: the "behaviour" option was removed in scikit-learn 0.24,
    # so it is no longer set here
    default_params = {
        "n_jobs": -1,
        "random_state": 1729,
        "contamination": 0.1
    }
    params = default_params if not params else merge(default_params, params)

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {
            prediction_column: model.decision_function(new_df[features])
        }

        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {
        'isolation_forest_learner': {
            'features': features,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sklearn.__version__,
            'training_samples': len(df)
        }
    }

    return p, p(df), log
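
A usage sketch (toy data): the prediction column holds the raw decision_function score, where lower values indicate more anomalous rows:

import pandas as pd

train = pd.DataFrame({"x1": [1.0, 1.1, 0.9, 1.0, 10.0],
                      "x2": [2.0, 2.1, 1.9, 2.0, -5.0]})

predict_fn, scored, log = isolation_forest_learner(
    train, features=["x1", "x2"], params={"contamination": 0.2})

# The obvious outlier (10.0, -5.0) receives the lowest prediction score.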
Example #8
def gp_regression_learner(df: pd.DataFrame,
                          features: List[str],
                          target: str,
                          kernel: kernels.Kernel = None,
                          alpha: float = 0.1,
                          extra_variance: Union[str, float] = "fit",
                          return_std: bool = False,
                          extra_params: Dict[str, Any] = None,
                          prediction_column: str = "prediction",
                          encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a Gaussian process regressor to the dataset.

    Parameters
    ----------

    df: pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features: list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target: str
        The name of the column in `df` that should be used as target for the model.
        This column should be numerical and continuous, since this is a regression model.

    kernel: sklearn.gaussian_process.kernels
        The kernel specifying the covariance function of the GP. If None is passed,
        the kernel "1.0 * RBF(1.0)" is used as default. Note that the kernel's hyperparameters
        are optimized during fitting.

    alpha: float
        Value added to the diagonal of the kernel matrix during fitting. Larger values correspond to increased
        noise level in the observations. This can also prevent a potential numerical issue during fitting,
        by ensuring that the calculated values form a positive definite matrix.

    extra_variance: str or float
        A multiplier applied to the predictive standard deviations. If left as the default "fit",
        uses the standard deviation of the target.

    return_std: bool
        If True, the standard-deviation of the predictive distribution at the query points is returned
        along with the mean.

    extra_params: dict {"hyperparameter_name" : hyperparameter_value}, optional
        Other parameters for the GaussianProcessRegressor model. See the list in:
        http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    params = extra_params if extra_params else {}

    params['alpha'] = alpha
    params['kernel'] = kernel

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    gp = GaussianProcessRegressor(**params)
    gp.fit(df[features], df[target])

    extra_variance = (df[target].std() if extra_variance == "fit"
                      else extra_variance if extra_variance else 1)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        # predict on new_df, not the training df captured by the closure
        if return_std:
            pred_mean, pred_std = gp.predict(new_df[features], return_std=True)
            pred_std *= extra_variance
            return new_df.assign(**{
                prediction_column: pred_mean,
                prediction_column + "_std": pred_std
            })
        else:
            return new_df.assign(
                **{prediction_column: gp.predict(new_df[features])})

    p.__doc__ = learner_pred_fn_docstring("gp_regression_learner")

    log = {
        'gp_regression_learner': {
            'features': features,
            'target': target,
            'parameters': merge(params, {
                'extra_variance': extra_variance,
                'return_std': return_std
            }),
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'training_samples': len(df)
        },
        'object': gp
    }

    return p, p(df), log
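
A usage sketch (toy data): with return_std=True, the scored DataFrame carries both the mean prediction and a prediction_std column, scaled by extra_variance (here the standard deviation of the target):

import pandas as pd

train = pd.DataFrame({"x1": [0.0, 1.0, 2.0, 3.0],
                      "y": [0.0, 0.8, 0.9, 0.1]})

predict_fn, scored, log = gp_regression_learner(
    train, features=["x1"], target="y", return_std=True)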
Example #9
def linear_regression_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        params: Dict[str, Any] = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a linear regression model to the dataset. Returns the predict function
    for the model and the predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be continuous, since this is a regression model.

    params : dict
        The LinearRegression parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    def_params = {"fit_intercept": True}
    params = def_params if not params else merge(def_params, params)

    weights = df[weight_column].values if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    regr = LinearRegression(**params)
    regr.fit(df[features].values, df[target].values, sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{prediction_column: regr.predict(new_df[features].values)})

    p.__doc__ = learner_pred_fn_docstring("linear_regression_learner")

    log = {
        'linear_regression_learner': {
            'features': features,
            'target': target,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, regr.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': regr
    }

    return p, p(df), log
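
A usage sketch (toy data): besides the prediction function and the scored DataFrame, the log exposes the fitted coefficients under feature_importance:

import pandas as pd

train = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                      "y": [2.1, 3.9, 6.2, 7.8]})

predict_fn, scored, log = linear_regression_learner(
    train, features=["x1"], target="y")

coefs = log["linear_regression_learner"]["feature_importance"]  # {"x1": ...}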