    model_final_fitted = final_model.fit(X=df[final_model_x],
                                         y=final_target,
                                         sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{prediction_column: model_final_fitted.predict(new_df[final_model_x].values)})

    p.__doc__ = learner_pred_fn_docstring("non_parametric_double_ml_learner")

    log = {'non_parametric_double_ml_learner': {
        'features': feature_columns,
        'debias_feature_columns': debias_feature_columns,
        'denoise_feature_columns': denoise_feature_columns,
        'final_model_feature_columns': final_model_feature_columns,
        'outcome_column': outcome_column,
        'treatment_column': treatment_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        # final_model_x is the feature list the final model was actually fitted on
        'feature_importance': dict(zip(final_model_x, model_final_fitted.feature_importances_)),
        'training_samples': len(df)},
        'debias_models': mts,
        'denoise_models': mys,
        'cv_splits': cv_splits,
        'object': model_final_fitted}

    return p, p(df), log


non_parametric_double_ml_learner.__doc__ += learner_return_docstring("Non Parametric Double/ML")
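
# Hedged usage sketch (illustrative, not part of the original module): fklearn
# learners are curried, so they can be configured first and handed a DataFrame
# later. All column names below are assumptions about the caller's data.
def _example_double_ml_usage(train_df: pd.DataFrame,
                             holdout_df: pd.DataFrame) -> pd.DataFrame:
    learner = non_parametric_double_ml_learner(feature_columns=["x1", "x2"],
                                               treatment_column="treatment",
                                               outcome_column="outcome")
    predict_fn, scored_train, logs = learner(train_df)
    # predict_fn is a pure function and can score any new, schema-compatible frame
    return predict_fn(holdout_df)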
    log = {
        'selector': {
            'training_columns': training_columns,
            'predict_columns': predict_columns,
            'transformed_column': list(set(training_columns).union(predict_columns))
        }
    }

    return p, df[training_columns], log


selector.__doc__ += learner_return_docstring("Selector")


@curry
@log_learner_time(learner_name='capper')
def capper(df: pd.DataFrame,
           columns_to_cap: List[str],
           precomputed_caps: Dict[str, float] = None) -> LearnerReturnType:
    """
    Learns the maximum value for each of the `columns_to_cap` and uses that as
    the cap for those columns. If precomputed caps are passed, the function
    uses them as the cap values instead of computing the maximum.

    Parameters
    ----------
            'features': features,
            'target': target,
            'parameters': merged_params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, clf.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': clf
    }

    return p, p(df), log


logistic_classification_learner.__doc__ += learner_return_docstring("Logistic Regression")


@curry
@log_learner_time(learner_name='xgb_classification_learner')
def xgb_classification_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: LogType = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None,
                               encode_extra_cols: bool = True) -> LearnerReturnType:
    """
            'features': features,
            'target': target,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, regr.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': regr
    }

    return p, p(df), log


linear_regression_learner.__doc__ += learner_return_docstring("Linear Regression")


@curry
@log_learner_time(learner_name='xgb_regression_learner')
def xgb_regression_learner(df: pd.DataFrame,
                           features: List[str],
                           target: str,
                           learning_rate: float = 0.1,
                           num_estimators: int = 100,
                           extra_params: Dict[str, Any] = None,
                           prediction_column: str = "prediction",
                           weight_column: str = None,
                           encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {prediction_column: model.decision_function(new_df[features])}
        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {
        'isolation_forest_learner': {
            'features': features,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sklearn.__version__,
            'training_samples': len(df)
        }
    }

    return p, p(df), log


isolation_forest_learner.__doc__ += learner_return_docstring("Isolation Forest")
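
# Hedged usage sketch (illustrative, not part of the original module). `params`
# is forwarded to sklearn's IsolationForest via set_params, and the prediction
# column holds decision_function scores: lower values are more anomalous.
# The feature names, hyperparameters, and the -0.1 cutoff are all assumptions.
def _example_isolation_forest_usage(train_df: pd.DataFrame) -> pd.DataFrame:
    predict_fn, scored_df, logs = isolation_forest_learner(
        train_df, features=["x1", "x2"], params={"n_estimators": 100})
    return scored_df.assign(is_outlier=scored_df["prediction"] < -0.1)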
        pred_fn = compose(*pred_fns.values())

        return (pred_fn(df)
                .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))
                .assign(prediction=lambda d: d.lookup(d.index.values, d.pred_bin.values.squeeze()))
                .rename(index=str, columns={"prediction": prediction_column})
                .drop("pred_bin", axis=1))

    p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner")

    log = {
        'xgb_octopus_classification_learner': {
            'features': features_by_bin,
            'target': target_column,
            'prediction_column': prediction_column,
            'package': "xgboost",
            'train_logs': train_logs,
            'parameters': extra_params_by_bin,
            'training_samples': len(train_set)
        }
    }

    return p, p(train_set), log


xgb_octopus_classification_learner.__doc__ += learner_return_docstring("Octopus XGB Classifier")
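
# Note: the chained assign above relies on DataFrame.lookup, which was
# deprecated in pandas 1.2 and removed in pandas 2.0. A hedged equivalent of
# that per-row column pick, using plain numpy indexing, might look like this
# (the helper name and its arguments are illustrative, not fklearn API):
import numpy as np


def _example_rowwise_lookup(d: pd.DataFrame, pred_bin: pd.Series) -> np.ndarray:
    # For each row i, fetch the value in the column named by pred_bin[i].
    col_idx = d.columns.get_indexer(pred_bin)
    return d.to_numpy()[np.arange(len(d)), col_idx]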
    output_column : str
        The name of the column with the calibrated predictions from the model.
    """

    clf = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
    clf.fit(df[prediction_column], df[target_column])

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{output_column: clf.predict(new_df[prediction_column])})

    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    log = {
        'isotonic_calibration_learner': {
            'output_column': output_column,
            'target_column': target_column,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sklearn.__version__,
            'training_samples': len(df)
        }
    }

    return p, p(df), log


isotonic_calibration_learner.__doc__ += learner_return_docstring("Isotonic Calibration")
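
# Hedged usage sketch (illustrative, not part of the original module): fit the
# calibrator on a frame that already carries raw model scores and targets, then
# reuse the returned function on new scores. Column names are assumptions.
def _example_isotonic_usage(scored_df: pd.DataFrame,
                            new_scores: pd.DataFrame) -> pd.DataFrame:
    calibrate_fn, calibrated_df, logs = isotonic_calibration_learner(
        scored_df,
        target_column="target",
        prediction_column="prediction",
        output_column="calibrated_prediction")
    return calibrate_fn(new_scores)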
    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    log = {'isotonic_calibration_learner': {
        'output_column': output_column,
        'target_column': target_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)},
        'object': clf}

    return p, p(df), log


isotonic_calibration_learner.__doc__ += learner_return_docstring("Isotonic Calibration")


@curry
@log_learner_time(learner_name='find_thresholds_with_same_risk')
def find_thresholds_with_same_risk(df: pd.DataFrame,
                                   sensitive_factor: str,
                                   unfair_band_column: str,
                                   model_prediction_output: str,
                                   target_column: str = "target",
                                   output_column_name: str = "fair_band") -> LearnerReturnType:
    """
    Calculates a fair calibration, such that within each band every sensitive
    factor group has the same target mean.

    Parameters
    ----------
        'linear_regression_learner': {
            'features': features,
            'target': target,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, regr.coef_.flatten())),
            'training_samples': len(df)
        }
    }

    return p, p(df), log


linear_regression_learner.__doc__ += learner_return_docstring("Linear Regression")


@curry
@log_learner_time(learner_name='xgb_regression_learner')
def xgb_regression_learner(df: pd.DataFrame,
                           features: List[str],
                           target: str,
                           learning_rate: float = 0.1,
                           num_estimators: int = 100,
                           extra_params: Dict[str, Any] = None,
                           prediction_column: str = "prediction",
                           weight_column: str = None) -> LearnerReturnType:
    """
    Fits an XGBoost regressor to the dataset. It first generates a DMatrix
    with the specified features and labels from `df`. Then it fits an XGBoost
            columns_imputable,
        'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
        'statistics': imp.statistics_,
        'placeholder_imputer_fn': fill_fn,
        'placeholder_imputer_logs': fill_logs,
    }}

    return p, p(df), log


imputer.__doc__ += learner_return_docstring("SimpleImputer")


@curry
@log_learner_time(learner_name='placeholder_imputer')
def placeholder_imputer(df: pd.DataFrame,
                        columns_to_impute: List[str],
                        placeholder_value: Any = -999) -> LearnerReturnType:
    """
    Fills missing values with a fixed value.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to fill missing values.