def test_transform_target_regressor_2d_transformer(X, y):
    """Check consistency of TransformedTargetRegressor with a transformer
    that only accepts 2D arrays, for both a 1D and a 2D ``y``."""
    scaler = StandardScaler()
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=scaler)
    predictions = model.fit(X, y).predict(X)
    assert y.shape == predictions.shape
    # forward transform consistency: a 1D target must be fed as a column
    if y.ndim == 1:
        transformed = model.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        transformed = model.transformer_.transform(y)
    _check_standard_scaled(y, transformed)
    assert y.shape == predictions.shape
    # inverse transform must round-trip back to y
    assert_allclose(
        y, model.transformer_.inverse_transform(transformed).squeeze())
    # the wrapped regressor must match a manually transformed fit
    reference = LinearRegression()
    scaler_copy = clone(scaler)
    if y.ndim == 1:
        reference.fit(X, scaler_copy.fit_transform(y.reshape(-1, 1)).squeeze())
    else:
        reference.fit(X, scaler_copy.fit_transform(y))
    raw_pred = reference.predict(X)
    assert_allclose(predictions, scaler_copy.inverse_transform(raw_pred))
    assert_allclose(model.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_1d_transformer(X, y):
    """FunctionTransformer with validate=False lifts the 2D-input
    constraint; check data-shape consistency with 1D and 2D ``y``."""
    shift = FunctionTransformer(func=lambda x: x + 1,
                                inverse_func=lambda x: x - 1,
                                validate=False)
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=shift)
    predictions = model.fit(X, y).predict(X)
    assert y.shape == predictions.shape
    # forward transform consistency
    transformed = model.transformer_.transform(y)
    _check_shifted_by_one(y, transformed)
    assert y.shape == predictions.shape
    # inverse transform must round-trip back to y
    assert_allclose(
        y, model.transformer_.inverse_transform(transformed).squeeze())
    # regressor consistency against a manual pipeline
    reference = LinearRegression()
    shift_copy = clone(shift)
    reference.fit(X, shift_copy.fit_transform(y))
    assert_allclose(predictions,
                    shift_copy.inverse_transform(reference.predict(X)))
    assert_allclose(model.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_count_fit(check_inverse):
    """Regression test for gh-11618: the transformer is fit exactly once."""
    X, y = friedman
    model = TransformedTargetRegressor(
        transformer=DummyTransformer(), check_inverse=check_inverse)
    model.fit(X, y)
    assert model.transformer_.fit_counter == 1
def test_transform_target_regressor_ensure_y_array():
    """The target ``y`` passed to the transformer must always be a numpy
    array, while a list ``X`` must reach the regressor as-is."""
    X, y = friedman
    model = TransformedTargetRegressor(
        transformer=DummyCheckerArrayTransformer(),
        regressor=DummyCheckerListRegressor(),
        check_inverse=False)
    model.fit(X.tolist(), y.tolist())
    model.predict(X.tolist())
    # arrays where the dummies expect lists must trip their assertions
    assert_raises(AssertionError, model.fit, X, y.tolist())
    assert_raises(AssertionError, model.predict, X)
def test_transform_target_regressor_invertible():
    """check_inverse=True warns when func/inverse_func are not inverses;
    check_inverse=False silences the warning."""
    X, y = friedman
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.sqrt, inverse_func=np.log,
                                       check_inverse=True)
    assert_warns_message(UserWarning,
                         "The provided functions or transformer are not"
                         " strictly inverse of each other.",
                         model.fit, X, y)
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.sqrt, inverse_func=np.log)
    model.set_params(check_inverse=False)
    assert_no_warnings(model.fit, X, y)
def test_transform_target_regressor_multi_to_single():
    """A func collapsing a 2D target to one column must yield predictions
    of shape (n_samples, 1), whether func returns a 1D or a 2D array."""
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def norm_2d(y):
        # explicit column vector
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)[:, np.newaxis]

    def identity(y):
        return y

    model = TransformedTargetRegressor(func=norm_2d, inverse_func=identity,
                                       check_inverse=False)
    model.fit(X, y)
    pred_from_2d = model.predict(X)
    assert pred_from_2d.shape == (100, 1)

    def norm_1d(y):
        # same computation, but returning a flat array
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    model = TransformedTargetRegressor(func=norm_1d, inverse_func=identity,
                                       check_inverse=False)
    model.fit(X, y)
    pred_from_1d = model.predict(X)
    assert pred_from_1d.shape == (100, 1)
    assert_allclose(pred_from_1d, pred_from_2d)
def test_transform_target_regressor_functions():
    """A func/inverse_func pair must round-trip and match a manual fit."""
    X, y = friedman
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log, inverse_func=np.exp)
    predictions = model.fit(X, y).predict(X)
    # transformer output round-trips through log/exp
    transformed = model.transformer_.transform(y.reshape(-1, 1)).squeeze()
    assert_allclose(np.log(y), transformed)
    assert_allclose(
        y,
        model.transformer_.inverse_transform(
            transformed.reshape(-1, 1)).squeeze())
    assert y.shape == predictions.shape
    assert_allclose(predictions,
                    model.inverse_func(model.regressor_.predict(X)))
    # coefficients match a plain fit on the transformed target
    reference = LinearRegression().fit(X, model.func(y))
    assert_allclose(model.regressor_.coef_.ravel(), reference.coef_.ravel())
def test_transform_target_regressor_functions_multioutput():
    """Same as the single-output functions test, but with a 2D target."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log, inverse_func=np.exp)
    predictions = model.fit(X, y).predict(X)
    # transformer output round-trips through log/exp
    transformed = model.transformer_.transform(y)
    assert_allclose(np.log(y), transformed)
    assert_allclose(y, model.transformer_.inverse_transform(transformed))
    assert y.shape == predictions.shape
    assert_allclose(predictions,
                    model.inverse_func(model.regressor_.predict(X)))
    # coefficients match a plain fit on the transformed target
    reference = LinearRegression().fit(X, model.func(y))
    assert_allclose(model.regressor_.coef_.ravel(), reference.coef_.ravel())
def test_rfe_importance_getter_validation(importance_getter, err_type,
                                          Selector):
    """An invalid importance_getter must raise err_type at fit time."""
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=42)
    wrapped = TransformedTargetRegressor(
        regressor=LinearSVR(), func=np.log, inverse_func=np.exp)
    with pytest.raises(err_type):
        Selector(wrapped, importance_getter=importance_getter).fit(X, y)
def test_transform_target_regressor_error():
    """Invalid parameter combinations must raise informative errors."""
    X, y = friedman
    # transformer and func/inverse_func are mutually exclusive
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=StandardScaler(),
                                       func=np.exp, inverse_func=np.log)
    assert_raises_regex(ValueError,
                        "'transformer' and functions 'func'/'inverse_func'"
                        " cannot both be set.", model.fit, X, y)
    # sample_weight with a regressor that does not support it
    sample_weight = np.ones((y.shape[0],))
    model = TransformedTargetRegressor(regressor=Lasso(),
                                       transformer=StandardScaler())
    assert_raises_regex(TypeError,
                        r"fit\(\) got an unexpected keyword argument"
                        " 'sample_weight'",
                        model.fit, X, y, sample_weight=sample_weight)
    # func given but inverse_func missing
    model = TransformedTargetRegressor(func=np.exp)
    assert_raises_regex(ValueError,
                        "When 'func' is provided, 'inverse_func' must also"
                        " be provided", model.fit, X, y)
def build_model(CAT_COL, CON_COL, func_name=None):
    '''
    build model by:
    1. encoding the categorical columns with OneHotEncoder
    2. scaling of the numerical columns with RobustScaler
       (limit outliers effects)
    3. create pipeline testing Ridge regressor
       3.1 normalization of the target variable (None, sqrt, or log10)
    4. create parameters to test through GridSearch
    5. create Grid search with cross-validation
    ---
    INPUTS
        CAT_COL - list of categorical columns
        CON_COL - list of continuous columns
        func_name - normalization function (str) 'sqrt' or 'log10'
    OUTPUT
        model

    Raises
    ------
    ValueError
        If ``func_name`` is neither None, 'sqrt' nor 'log10'.
    '''
    # Map accepted names directly to callables instead of eval()-ing built
    # strings: avoids arbitrary code execution, and an unknown name now
    # raises immediately instead of printing and later crashing with a
    # NameError on the undefined inverse-function string.
    func_map = {
        'sqrt': (np.sqrt, power_2),
        'log10': (np.log10, sp.special.exp10),
    }
    if func_name is None:
        func, inv_func = None, None
    else:
        try:
            func, inv_func = func_map[func_name]
        except KeyError:
            raise ValueError(
                f'{func_name} is not an option; '
                f'expected one of {sorted(func_map)}')
    preprocessor = make_column_transformer(
        (OneHotEncoder(drop='if_binary'), CAT_COL),
        (RobustScaler(), CON_COL),
        remainder='passthrough')
    pipe = make_pipeline(
        preprocessor,
        TransformedTargetRegressor(
            regressor=Ridge(alpha=1e-10, fit_intercept=True, solver='auto'),
            func=func,
            inverse_func=inv_func))
    # grid over the nested Ridge hyper-parameters
    params = {
        'transformedtargetregressor__regressor__alpha': [1e-10, 1e-5, 0.1],
        'transformedtargetregressor__regressor__max_iter': [None, 100, 200],
        'transformedtargetregressor__regressor__tol': [0.001, 0.01, 0.1]
    }
    model = GridSearchCV(pipe, params, cv=5)
    return model
def test_transform_target_regressor_route_pipeline():
    """Extra fit params routed through a Pipeline must reach the regressor
    while the transformer is still fit exactly once."""
    X, y = friedman
    ttr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraFitParams(),
        transformer=DummyTransformer())
    pipeline = Pipeline([('normalize', StandardScaler()), ('est', ttr)])
    pipeline.fit(X, y, **{'est__check_input': False})
    assert ttr.transformer_.fit_counter == 1
def pipeline_trans_reg():
    '''
    Application of Transformed Linear Regression
    #n_quantiles needs to be smaller than the number of samples
    (standard is 1000)

    PRIMARY_MERCHANT_NAME
    #accuracy negative; model totally off
    ---
    AMOUNT_MEAN_LAG7
    q-t R2-score: 0.896
    unprocessed R2-score: 0.926
    '''
    # NOTE(review): reads X_train/X_test/y_train/y_test from module scope —
    # presumably set by an earlier split step; confirm against callers.
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    regr.fit(X_train, y_train)
    # The original evaluated a bare ``TransformedTargetRegressor(...)``
    # expression here; it built a throwaway object with no effect and has
    # been removed.
    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))
    # raw LinearRegression for comparison against the transformed target
    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))
    return regr, raw_target_regr
def build_model_of(core_estimator, no_target_transform: bool = False):
    """Assemble the preprocessing + estimator pipeline, optionally wrapping
    the estimator so the target is modeled on a log scale."""
    # tf = HomeTransformer()
    mapping = load_categorical_mapping()
    encoder = CategoricalTransformer(feature_names=None, mapping=mapping)
    imputer = SimpleImputer(strategy='most_frequent')
    estimator = core_estimator
    if not no_target_transform:
        # fit on log(target), map predictions back with exp
        estimator = TransformedTargetRegressor(regressor=core_estimator,
                                               func=np.log,
                                               inverse_func=np.exp)
    return Pipeline(steps=[('prep', encoder),
                           ('remove_zeros', imputer),
                           ('estimator', estimator)])
def test_rfe_wrapped_estimator(importance_getter, selector,
                               expected_n_features):
    """Non-regression test for scikit-learn issue #15312: feature selection
    must work on a wrapped (TransformedTargetRegressor) estimator."""
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    wrapped = TransformedTargetRegressor(
        regressor=LinearSVR(random_state=0),
        func=np.log, inverse_func=np.exp)
    fitted = selector(wrapped, importance_getter=importance_getter).fit(X, y)
    assert fitted.support_.sum() == expected_n_features
def base_pls_cv(x, y, n_comps, return_model=False):
    """Cross-validated PLS regression on a neg-log-transformed target.

    Parameters
    ----------
    x : feature matrix; its ``.index`` is used for CV grouping
        (presumably sample group labels — confirm against callers).
    y : target values.
    n_comps : number of PLS components.
    return_model : if True, also return the (unfitted) wrapped estimator.

    Returns
    -------
    (y_cv, score, mae_cv) or (y_cv, score, mae_cv, ttr_pls)

    Note: the third value was historically named ``rmsecv`` but is
    computed with mean_absolute_error, i.e. it is an MAE, not an RMSE.
    """
    pls_base = PLSRegression(n_components=n_comps)
    ttr_pls = TransformedTargetRegressor(regressor=pls_base,
                                         func=neg_log,
                                         inverse_func=neg_exp)
    y_cv = cross_val_predict(ttr_pls, x, y, cv=cv, groups=x.index)
    # y_cv = cross_val_predict(ttr_pls, x, y, cv=cv)
    score = r2_score(y, y_cv)
    mae_cv = mean_absolute_error(y, y_cv)  # was misleadingly named rmsecv
    if not return_model:
        return (y_cv, score, mae_cv)
    return (y_cv, score, mae_cv, ttr_pls)
def main():
    """Fit a polynomial and a k-NN regressor on samples of f(x), with both
    feature scaling and target scaling, and plot the fits."""
    # Random data: dense grid, shuffled then re-sorted.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)
    plt.plot(x, y, 'o', color='black', markersize=2, label='random data')
    # Augment x with its powers so a linear model can fit a degree-5
    # polynomial.
    x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T
    regressors = [(linear_model.LinearRegression(), 'polynomial reg'),
                  (neighbors.KNeighborsRegressor(15), '15-NN reg')]
    for model, lbl in regressors:
        # Scale features to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline(
            [('scale', preprocessing.StandardScaler()), ('model', model)]
        )
        # Target scaled before fit and inverse-scaled on predict.
        treg = TransformedTargetRegressor(
            regressor=pipe, transformer=preprocessing.MinMaxScaler()
        )
        treg.fit(x_augmented, y)
        plt.plot(x_augmented[:, 0], treg.predict(x_augmented), '-', label=lbl)
    plt.axis('off')
    plt.legend()
    plt.show()
def test_transform_target_regressor_invertible():
    """check_inverse=True warns when func/inverse_func are not inverses;
    check_inverse=False silences the warning."""
    X, y = friedman
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.sqrt, inverse_func=np.log,
                                       check_inverse=True)
    warning_msg = ("The provided functions or transformer are not strictly"
                   " inverse of each other.")
    with pytest.warns(UserWarning, match=warning_msg):
        model.fit(X, y)
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.sqrt, inverse_func=np.log)
    model.set_params(check_inverse=False)
    assert_no_warnings(model.fit, X, y)
def test_transform_target_regressor_2d_transformer_multioutput():
    """Consistency with a 2D-only transformer and a 2D target array."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    scaler = StandardScaler()
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=scaler)
    predictions = model.fit(X, y).predict(X)
    assert y.shape == predictions.shape
    # forward transform consistency
    transformed = model.transformer_.transform(y)
    _check_standard_scaled(y, transformed)
    assert y.shape == predictions.shape
    # inverse transform must round-trip back to y
    assert_allclose(
        y, model.transformer_.inverse_transform(transformed).squeeze())
    # regressor consistency against a manual pipeline
    reference = LinearRegression()
    scaler_copy = clone(scaler)
    reference.fit(X, scaler_copy.fit_transform(y))
    assert_allclose(predictions,
                    scaler_copy.inverse_transform(reference.predict(X)))
    assert_allclose(model.regressor_.coef_, reference.coef_)
def cv(args):
    """Run nested cross-validation for the time or power model.

    ``args.r`` selects the target kind ('time' or 'power') and the scores
    output path; ``args.i`` is the pickled dataset; ``args.o`` is the model
    output path; ``args.t``/``args.s``/``args.k`` size the nested CV.

    Raises
    ------
    ValueError
        If ``args.r`` names neither 'time' nor 'power' (the original fell
        through with ``estimator``/``param_grid`` unbound and crashed later
        with an opaque NameError).
    """
    gmu.must_not_exist(args.o)
    gmu.must_not_exist(args.r)
    target = str.lower(args.r)
    if 'time' in target:
        print("Performing cross-validation for time...")
        # time is modeled on a log scale
        estimator = TransformedTargetRegressor(
            regressor=ExtraTreesRegressor(n_jobs=24),
            func=np.log,
            inverse_func=np.exp)
        # params are nested under the TransformedTargetRegressor
        param_grid = {
            'regressor__bootstrap': [False],
            'regressor__max_features': [None, 'log2', 'sqrt'],
            'regressor__criterion': ['mse', 'mae'],
            'regressor__n_estimators': [128, 256, 512, 1024]
        }
    elif 'power' in target:
        print("Performing cross-validation for power...")
        estimator = ExtraTreesRegressor(n_jobs=24)
        param_grid = {
            'bootstrap': [False],
            'max_features': [None, 'log2', 'sqrt'],
            'criterion': ['mse', 'mae'],
            'n_estimators': [128, 256, 512, 1024]
        }
    else:
        raise ValueError(
            "args.r must contain 'time' or 'power', got %r" % args.r)
    scorer = gmu.neg_mape
    # TODO load yml param grid file
    with open(args.i, "rb") as fh:  # close file handles deterministically
        dataset = pickle.load(fh)
    X, y = gmu.get_xy(dataset)
    model, cv_scores = gmu.nested_cv(X, y, estimator, scorer, param_grid,
                                     num_trials=int(args.t),
                                     n_splits=int(args.s),
                                     n_high=int(args.k))
    # for item in cv_scores:
    #     print(pd.DataFrame(item["gs_scores"]))
    if args.o is not None:
        with open(args.o, "wb") as fh:
            pickle.dump(model, fh)
    if args.r is not None:
        with open(args.r, "wb") as fh:
            pickle.dump(cv_scores, fh)
def test_model_finder_predict_X_test_regression(model_finder_regression_fitted,
                                                split_dataset_numerical,
                                                limit, seed):
    """Testing if predictions of X_test split from found models are correct
    (in regression)."""
    X_train, X_test, y_train, y_test = split_dataset_numerical
    target_transformer = QuantileTransformer(output_distribution="normal",
                                             random_state=seed)
    models = [
        SVR(C=0.1, tol=1.0),
        Ridge(alpha=0.0001, random_state=seed),
        DecisionTreeRegressor(max_depth=10, criterion="mae",
                              random_state=seed),
    ]
    # recompute the expected predictions with identically wrapped models
    expected_results = []
    for model in models:
        wrapped = TransformedTargetRegressor(regressor=model,
                                             transformer=target_transformer)
        wrapped.fit(X_train, y_train)
        expected_results.append((model, wrapped.predict(X_test)))
    expected_results = expected_results[:limit]
    actual_results = model_finder_regression_fitted.predictions_X_test(limit)
    for actual, expected in zip(actual_results, expected_results):
        assert str(actual[0]) == str(expected[0])
        assert np.array_equal(actual[1], expected[1])
def calculate_effort(X, Y, project, task, model_type, transformer, regressor,
                     i_records, t_records):
    """Fit a target-transformed pipeline on (X, Y), evaluate it with k-fold
    cross-validated predictions, and return a one-row summary DataFrame.

    NOTE(review): assumes X and Y are pandas objects (uses .copy()/.fillna),
    and that ``utils``, ``extractPerfMeasures`` and ``createDF`` are
    module-level helpers — confirm against the rest of the file.
    NOTE(review): X and Y are imputed with 0 **in place**, mutating the
    caller's data.
    """
    # dummy_df is built but never used below — presumably debugging residue.
    dummy_df = X.copy()
    dummy_df["Y"] = Y
    # record the missing-data ratio before zero-imputing in place
    p_na = utils.percentage_nan(X)
    X.fillna(0, inplace=True)
    Y.fillna(0, inplace=True)
    # Let's create multiple regression
    print("\n{0} - {1} - {2} model performance: \n".format(
        project, task, model_type))
    # cap the number of CV folds at the number of records
    splits = 10
    num_records = len(X)
    if num_records <= splits:
        splits = num_records
    pipeline = Pipeline(steps=[('scaler', transformer),
                               ('predictor', regressor)])
    # the same transformer type scales both the features and the target
    model = TransformedTargetRegressor(regressor=pipeline,
                                       transformer=transformer)
    model.fit(X, Y)
    kfold = model_selection.KFold(n_splits=splits)
    predictions = cross_val_predict(model, X, Y, cv=kfold)
    results = utils.create_percent_error_df(Y, predictions)
    r_squared, r_squared_adj, mae, mse, rmse, pred25, pred50 = extractPerfMeasures(
        model, Y, predictions, results, X)
    row = createDF(project, model_type, task, r_squared, r_squared_adj, mae,
                   mse, rmse, pred25, pred50, t_records,
                   i_records - t_records, p_na)
    return row
def tlr_reg(X_train, X_test, y_train, y_test):
    '''
    Transformed Linear Regression

    n_quantiles needs to be smaller than the number of samples
    (standard is 1000)
    '''
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression(n_jobs=-1)
    # Initialize the transformed target regressor
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    regr.fit(X_train, y_train)
    # raw LinearRegression for comparison against the transformed target
    raw_target_regr = LinearRegression(n_jobs=-1).fit(X_train, y_train)
    # Print the best value combination
    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))
    return regr, raw_target_regr
    # NOTE(review): the original ended with a dangling unmatched ``'''``
    # after the return; it would have turned any following code into a
    # string literal and has been removed.
def test_transform_target_regressor_1d_transformer(X, y):
    """FunctionTransformer lifts the 2D-input constraint without checking
    the input; verify shape consistency for 1D and 2D ``y`` arrays."""
    shift = FunctionTransformer(func=lambda x: x + 1,
                                inverse_func=lambda x: x - 1)
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=shift)
    predictions = model.fit(X, y).predict(X)
    assert y.shape == predictions.shape
    # forward transform consistency
    transformed = model.transformer_.transform(y)
    _check_shifted_by_one(y, transformed)
    assert y.shape == predictions.shape
    # inverse transform must round-trip back to y
    assert_allclose(
        y, model.transformer_.inverse_transform(transformed).squeeze())
    # regressor consistency against a manual pipeline
    reference = LinearRegression()
    shift_copy = clone(shift)
    reference.fit(X, shift_copy.fit_transform(y))
    assert_allclose(predictions,
                    shift_copy.inverse_transform(reference.predict(X)))
    assert_allclose(model.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_2d_transformer_multioutput():
    """Check a 2D-only transformer against a two-column target."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    target_scaler = StandardScaler()
    ttr = TransformedTargetRegressor(regressor=LinearRegression(),
                                     transformer=target_scaler)
    y_hat = ttr.fit(X, y).predict(X)
    assert y.shape == y_hat.shape
    # forward transform matches standard scaling of y
    scaled = ttr.transformer_.transform(y)
    _check_standard_scaled(y, scaled)
    assert y.shape == y_hat.shape
    # round-trip through the inverse transform recovers y
    assert_allclose(y, ttr.transformer_.inverse_transform(scaled).squeeze())
    # a manual pipeline must agree with the wrapped one
    manual = LinearRegression()
    scaler_clone = clone(target_scaler)
    manual.fit(X, scaler_clone.fit_transform(y))
    assert_allclose(y_hat,
                    scaler_clone.inverse_transform(manual.predict(X)))
    assert_allclose(ttr.regressor_.coef_, manual.coef_)
def transformed_xgb():
    """XGBoost regressor wrapped so the target is Yeo-Johnson transformed
    (and standardized) at fit time and inverted on predict."""
    booster = XGBRegressor(n_estimators=500,
                           learning_rate=0.1,
                           max_depth=5,
                           subsample=0.8,
                           booster='gbtree',
                           objective='reg:linear',
                           min_samples_leaf=5,
                           n_jobs=4,
                           random_state=42)
    target_transform = PowerTransformer(method='yeo-johnson',
                                        standardize=True)
    return TransformedTargetRegressor(regressor=booster,
                                      transformer=target_transform)
def __init__(self, confidence_intervals=True):
    """Build the diagnosis classifier and the ADAS/ventricles regressors.

    Note to self, to get parameters out:
    model.diagnosis_model.named_steps['scaler'].mean_
    """
    self.diagnosis_model = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', svm.SVC(kernel='rbf', C=0.5, gamma='auto',
                               class_weight='balanced', probability=True)),
    ])

    # Both regression targets share the same recipe: scaled features into
    # an RBF SVR, with the target itself standardized as well.
    def _make_svr_model():
        pipe = Pipeline(steps=[('scaler', StandardScaler()),
                               ('classifier', svm.SVR(kernel='rbf', C=0.5,
                                                      gamma='auto'))])
        return TransformedTargetRegressor(regressor=pipe,
                                          transformer=StandardScaler())

    self.adas_model = _make_svr_model()
    self.ventricles_model = _make_svr_model()
    # targets and working frames, populated later during training
    self.y_diagnosis = None
    self.y_adas = None
    self.y_ventricles = None
    self.train_df_diagnosis = None
    self.train_df_adas = None
    self.train_df_ventricles = None
    self.confidence_intervals = confidence_intervals
    self.train_df_processed = None
    self.test_df_processed = None
def test_transform_target_regressor_pass_extra_predict_parameters():
    """Predict kwargs must be forwarded to the underlying regressor."""
    X, y = friedman
    model = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraPredictParams(),
        transformer=DummyTransformer())
    model.fit(X, y)
    model.predict(X, check_input=False)
    assert model.regressor_.predict_called
def get_pipe(self, ):
    """Assemble the full estimator: feature shrinking + polynomial
    interactions + linear regression, wrapped in a
    TransformedTargetRegressor whose y-transform is grid-searched.

    NOTE(review): none_T/log_T/logp1_T, shrinkBigKTransformer, dropConst,
    columnBestTransformer and missingValHandler are project-local — confirm
    their contracts elsewhere in the package.
    """
    # default inner CV when none was supplied at construction
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv
    # gridpoints=self.gridpoints
    transformer_list = [
        none_T(), log_T(), logp1_T()
    ]  # Using 3 of many options here: none_T,logp1_T(),log_T()
    steps = [
        ('shrink_k1',
         shrinkBigKTransformer(
             selector=LassoLarsCV(cv=inner_cv, max_iter=32))
         ),  # retain a subset of the best original variables
        ('polyfeat',
         PolynomialFeatures(interaction_only=0,
                            degree=2)),  # create interactions among them
        ('drop_constant', dropConst()),
        ('shrink_k2',
         shrinkBigKTransformer(
             selector=LassoLarsCV(cv=inner_cv, max_iter=64))
         ),  # pick from all of those options
        ('reg', LinearRegression())
    ]
    if self.bestT:
        # optional per-column transform search, prepended to the pipeline
        steps.insert(0,
                     ('xtransform',
                      columnBestTransformer(float_k=len(self.float_idx))))
    X_T_pipe = Pipeline(steps=steps)
    # develop a new pipeline that allows transformation of y in addition to
    # X, which other scikit learn transformers don't
    Y_T_X_T_pipe = Pipeline(
        steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
    Y_T__param_grid = {
        'ttr__transformer': transformer_list,
        'ttr__regressor__polyfeat__degree':
        [2],  # could use other degrees here if desired
    }
    outerpipe = GridSearchCV(Y_T_X_T_pipe,
                             param_grid=Y_T__param_grid,
                             cv=inner_cv)
    if self.do_prep:
        # prepend missing-value handling when requested
        steps = [('prep', missingValHandler(prep_dict=self.prep_dict)),
                 ('post', outerpipe)]
        outerpipe = Pipeline(steps=steps)
    return outerpipe
def wrap_target_scaler(self, target_scaler, model_obj, model_params):
    """Wrap a regression model in a TransformedTargetRegressor that scales
    the target, and rename its hyper-parameters to match the new nesting.

    Parameters
    ----------
    target_scaler : user spec for the target scaler, or None to skip.
    model_obj : (name, estimator) tuple.
    model_params : dict of search params keyed by ``<model_name>__...``;
        mutated in place.

    Returns
    -------
    (wrapped_name, wrapped_estimator), updated model_params

    NOTE(review): mutates ``model_params`` in place while also returning
    it — callers holding a reference see the renamed keys.
    """
    # Only regression problems with an actual scaler need wrapping.
    if self.spec['problem_type'] != 'regression':
        return model_obj, model_params
    if target_scaler is None:
        return model_obj, model_params

    # Process and get the base scaler_obj + params
    base_scaler_obj = TargetScalerConstructor(self.user_passed_objs,
                                              self.dataset, self.spec)
    scaler_objs, scaler_params =\
        base_scaler_obj.process(target_scaler)
    scaler_obj = scaler_objs[0]

    # Unwrap into name + base
    model_name, base_model = model_obj[0], model_obj[1]
    scaler_name, base_scaler = scaler_obj[0], scaler_obj[1]

    # Now, wrap the model + scaler in the transformed target regressor
    base_wrapper_model =\
        TransformedTargetRegressor(regressor=base_model,
                                   transformer=base_scaler)
    wrapped_name = 'scale_target_' + model_name
    wrapper_model_obj = (wrapped_name, base_wrapper_model)

    # Need to update model params with new nested model name
    # (e.g. "ridge__alpha" -> "scale_target_ridge__regressor__alpha");
    # iterate over a snapshot of the keys since the dict is mutated.
    model_param_names = list(model_params)
    for param_name in model_param_names:
        if param_name.startswith(model_name + '__'):
            new_base = wrapped_name + '__regressor__'
            new_param_name =\
                param_name.replace(model_name + '__', new_base, 1)
            model_params[new_param_name] = model_params.pop(param_name)

    # Need to also update / add any scaler params, nested under
    # "__transformer__"
    for param_name in scaler_params:
        if param_name.startswith(scaler_name + '__'):
            new_base = wrapped_name + '__transformer__'
            new_param_name =\
                param_name.replace(scaler_name + '__', new_base, 1)
            model_params[new_param_name] = scaler_params[param_name]

    return wrapper_model_obj, model_params
def get_log_spiral_pipeline():
    """Degree-1 polynomial features feeding a Bayesian ridge fit on
    log-transformed targets (predictions mapped back through exp)."""
    features = PolynomialFeatures(
        degree=1,
        include_bias=False,
    )
    # NOTE: clf_kwargs comes from the enclosing module scope.
    regressor = TransformedTargetRegressor(
        regressor=BayesianRidge(compute_score=True,
                                fit_intercept=True,
                                copy_X=True,
                                normalize=True,
                                **clf_kwargs),
        func=np.log,
        inverse_func=np.exp)
    named_steps = [('polynomialfeatures', features),
                   ('bayesianridge', regressor)]
    return Pipeline(memory=None, steps=named_steps)
def test_transform_target_regressor_ensure_y_array():
    """``y`` must reach the transformer as a numpy array; a list ``X``
    must reach the regressor untouched."""
    X, y = friedman
    ttr = TransformedTargetRegressor(
        transformer=DummyCheckerArrayTransformer(),
        regressor=DummyCheckerListRegressor(),
        check_inverse=False)
    ttr.fit(X.tolist(), y.tolist())
    ttr.predict(X.tolist())
    # passing arrays where the dummies expect lists fails their checks
    assert_raises(AssertionError, ttr.fit, X, y.tolist())
    assert_raises(AssertionError, ttr.predict, X)
def get_pipe(self, ):
    """Assemble the full estimator (variant with RepeatedKFold n_repeats=3
    and CamelCase project transformers): feature shrinking + polynomial
    interactions + linear regression, wrapped in a
    TransformedTargetRegressor whose y-transform is grid-searched.

    NOTE(review): None_T/Log_T/LogP1_T, ShrinkBigKTransformer, DropConst,
    ColumnBestTransformer and MissingValHandler are project-local — confirm
    their contracts elsewhere in the package.
    """
    # default inner CV when none was supplied at construction
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
    else:
        inner_cv = self.inner_cv
    # NOTE(review): gridpoints is read here but never used below
    gridpoints = self.gridpoints
    transformer_list = [None_T(), Log_T(), LogP1_T()]  # ,logp1_T()] # log_T()]#
    steps = [
        ('shrink_k1',
         ShrinkBigKTransformer(
             selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
        # retain a subset of the best original variables
        ('polyfeat', PolynomialFeatures(interaction_only=0, degree=2)
         ),  # create interactions among them
        ('drop_constant', DropConst()),
        ('shrink_k2',
         ShrinkBigKTransformer(
             selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
        # pick from all of those options
        ('reg', LinearRegression())
    ]
    if self.bestT:
        # optional per-column transform search, prepended to the pipeline
        steps.insert(0, ('xtransform',
                         ColumnBestTransformer(float_k=len(self.float_idx))))
    X_T_pipe = Pipeline(steps=steps)
    # wrap in a TransformedTargetRegressor so the y-transform can be
    # grid-searched alongside the X pipeline
    Y_T_X_T_pipe = Pipeline(
        steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
    Y_T__param_grid = {
        'ttr__transformer': transformer_list,
        'ttr__regressor__polyfeat__degree': [2],
    }
    outerpipe = GridSearchCV(Y_T_X_T_pipe,
                             param_grid=Y_T__param_grid,
                             cv=inner_cv)
    if self.do_prep:
        # prepend missing-value handling when requested
        steps = [('prep', MissingValHandler(prep_dict=self.prep_dict)),
                 ('post', outerpipe)]
        outerpipe = Pipeline(steps=steps)
    return outerpipe
def test():
    """End-to-end example: fetch the OpenML wage survey, one-hot encode the
    categorical columns, and fit Ridge on a log10-transformed target."""
    import numpy as np
    import scipy as sp
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.datasets import fetch_openml
    from sklearn.model_selection import train_test_split
    from sklearn.compose import make_column_transformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Ridge
    from sklearn.compose import TransformedTargetRegressor

    # data (NOTE: ``rng`` comes from the enclosing module scope)
    survey = fetch_openml(data_id=534, as_frame=True)
    X = survey.data[survey.feature_names]
    y = survey.target.values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=rng)
    train_dataset = X_train.copy()
    train_dataset.insert(0, "WAGE", y_train)
    # _ = sns.pairplot(train_dataset, kind = "reg", diag_kind = "kde")

    # machine learning pipeline
    categorical_columns = [
        "RACE", "OCCUPATION", "SECTOR", "MARR", "UNION", "SEX", "SOUTH"
    ]
    numerical_columns = ["EDUCATION", "EXPERIENCE", "AGE"]
    column_prep = make_column_transformer(
        (OneHotEncoder(drop="if_binary"), categorical_columns),
        remainder="passthrough",
        verbose_feature_names_out=False)
    wage_model = make_pipeline(
        column_prep,
        TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                                   func=np.log10,
                                   inverse_func=sp.special.exp10),
    )

    # processing the data
    _ = wage_model.fit(X_train, y_train)
def ols_prediction(self):
    """
    uses linear regression after standardising to normal dist
    prints out accuracy metrics and then saves the design matrix with y and
    predicted y as a csv file
    also creates another column to calculate relative percentage difference
    between y and predicted y
    :return:
    """
    logger.info("running Linear Regression model")
    crab_df_woo = self.pre_process_data()
    transformer = QuantileTransformer(output_distribution='normal')
    # since I observed that the data was skewed, I decided to transform the
    # continuous variables to normal dist
    reg = linear_model.LinearRegression()
    t_reg = TransformedTargetRegressor(regressor=reg,
                                       transformer=transformer)
    # one-hot encode categoricals (category_encoders), dropping invariant
    # columns
    ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True,
                           drop_invariant=True)
    crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
    X = crab_df_woo_enc.drop("age", axis=1)
    y = crab_df_woo_enc[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    t_reg.fit(X_train, y_train)
    s = t_reg.score(X_test, y_test)
    logger.info("R-squared from Linear Regression is: {0}".format(s))
    # metrics computed on the FULL design matrix, not the held-out split
    y_pred = t_reg.predict(X)
    # NOTE(review): sqrt of MSE, i.e. this variable actually holds RMSE
    mse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("Linear Regression MAE: {0}".format(mae))
    logger.debug("Linear Regression RMSE: {0}".format(mse))
    logger.debug("Linear Regression R-squared: {0}".format(s))
    # rebuild the original frame with predictions attached
    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_ols"] = pd.Series(y_pred.ravel())
    # collapse the one-hot sex columns back into a single label
    crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row), axis=1)
    crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
    crab_df["percentage_difference"] = np.abs(
        np.divide(
            (crab_df["age"] - crab_df["age_ols"]), crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_ols.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_ols.csv"))
    logger.info("Linear Regression execution finished")
def rf_prediction(self):
    """
    uses ensemble (Random Forest) method to predict crab age
    :return:
    """
    logger.info("running Random Forest model")
    X = self.crab_data.drop("age", axis=1)
    y = self.crab_data[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    # NOTE(review): the original source had a stray '#' before this
    # assignment; reconstructed as active code since categorical_features
    # depends on it — confirm against the upstream file.
    numerical_features = X_train.dtypes == 'float'
    categorical_features = ~numerical_features
    # I used pipelining so that the predicted values were automatically
    # transformed/scaled back
    preprocess = make_column_transformer(
        (RobustScaler(), numerical_features),
        (OneHotEncoder(sparse=False), categorical_features))
    forest = RandomForestRegressor(n_estimators=5000,
                                   max_depth=20,
                                   min_samples_leaf=2,
                                   min_samples_split=4,
                                   random_state=100)
    f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
    # no transformer/func given: TransformedTargetRegressor applies the
    # identity to the target
    f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
    f_reg_ttr.fit(X_train, y_train)
    s = f_reg_ttr.score(X_test, y_test)
    logger.info("R-squared from Random Forest is: {0}".format(s))
    # metrics computed on the FULL design matrix, not the held-out split
    y_pred = f_reg_ttr.predict(X)
    # NOTE(review): sqrt of MSE, i.e. this variable actually holds RMSE
    mse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("RandomForest MAE: {0}".format(mae))
    logger.debug("RandomForest RMSE: {0}".format(mse))
    logger.debug("RandomForest R-squared: {0}".format(s))
    # recreate the original dataset
    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_forest"] = pd.Series(y_pred.ravel())
    crab_df["percentage_difference"] = np.abs(
        np.divide(
            (crab_df["age"] - crab_df["age_forest"]), crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_forest.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_forest.csv"))
    logger.info("Random Forest execution finished")
def train_gpr(l=None):
    """Train a Gaussian-process regressor on copula-transformed features
    with a normal-quantile-transformed target.

    Basic, no tuning — sklearn gp is not great for this.
    """
    if l is None:
        l = get_data()
    gpr = GaussianProcessRegressor(
        alpha=1.8,
        copy_X_train=True,
        # kernel=kernels.RBF(4.85 * np.array([4, 3000])),
        # kernel=kernels.RBF([1, 1]),
        n_restarts_optimizer=10,
        normalize_y=True,
        optimizer='fmin_l_bfgs_b',
        random_state=None)
    # target mapped to a normal distribution before fitting
    wrapped = TransformedTargetRegressor(
        regressor=gpr,
        transformer=QuantileTransformer(output_distribution='normal'))
    # features mapped to uniform marginals (copula trick)
    model = Pipeline([
        ('copulize_x', QuantileTransformer(output_distribution='uniform')),
        ('gpr', wrapped),
    ])
    model.fit(l.X_train.values, l.y_train.values.squeeze())
    return attributedict_from_locals('model')
def train_svm(l=None):
    """Train an RBF SVR with a normal-quantile-transformed target.
    Basic, no tuning."""
    if l is None:
        l = get_data()
    svr = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
              gamma='auto', kernel='rbf', max_iter=-1, shrinking=True,
              tol=0.001, verbose=False)
    model = TransformedTargetRegressor(
        regressor=svr,
        transformer=QuantileTransformer(output_distribution='normal'))
    # model = LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
    #     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
    #     random_state=None, tol=0.0001, verbose=0)
    model.fit(l.X_train.values, l.y_train.values.squeeze())
    return attributedict_from_locals('model')
def set_pipeline(self):
    """Build, grid-search and fit the linear-regression pipeline.

    Inner pipeline: scale -> shrink features -> polynomial interactions ->
    shrink again -> linear regression, grid-searched over shrink_k1 sizes;
    the whole thing is wrapped in a TransformedTargetRegressor whose
    y-transform (identity vs log1p) is grid-searched in an outer CV.
    Fits on self.x_train/self.y_train and stores the estimator on
    self.lr_estimator and the CV results on self.attr.
    """
    warnings.simplefilter('ignore')
    transformer_list = [None_T(), LogP1_T()]
    steps = [('scaler', StandardScaler()),
             ('shrink_k1', ShrinkBigKTransformer()),
             ('polyfeat', PolynomialFeatures(interaction_only=1)),
             ('shrink_k2', ShrinkBigKTransformer(selector='elastic-net')),
             ('reg', make_pipeline(StandardScaler(),
                                   LinearRegression(fit_intercept=1)))]
    inner_params = {'polyfeat__degree': [2]}
    if self.k > 4:
        # NOTE(review): the original computed ``interv = -(-self.k // 3)``
        # and a throwaway ``np.arange(2, self.k + interv, interv)``; both
        # results were discarded, so the dead code has been removed.
        inner_params['shrink_k1__max_k'] = np.arange(4, self.k, 4)
    inner_cv = RepeatedKFold(n_splits=5, n_repeats=1,
                             random_state=self.seed)
    X_T_pipe = GridSearchCV(Pipeline(steps=steps),
                            param_grid=inner_params,
                            cv=inner_cv)
    # wrap so the y-transform can be grid-searched alongside the X pipeline
    Y_T_X_T_pipe = Pipeline(
        steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
    Y_T__param_grid = {'ttr__transformer': transformer_list}
    lin_reg_Xy_transform = GridSearchCV(Y_T_X_T_pipe,
                                        param_grid=Y_T__param_grid,
                                        cv=inner_cv)
    self.lr_estimator = lin_reg_Xy_transform
    self.lr_estimator.fit(self.x_train, self.y_train)
    # generates the model that is saved
    self.attr = pd.DataFrame(self.lr_estimator.cv_results_)
    logger.info("Total execution time: {} sec".format(
        round(time.time() - self.start_time, 3)))
# --- Left panel: ridge regression fit on the raw target ---------------------
regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')  # identity line = perfect prediction
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
# NOTE(review): the annotation says MAE but median_absolute_error is
# plotted — i.e. this is the MedAE.
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])
# --- Right panel: same model, fit on a log1p-transformed target -------------
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)
ax1.scatter(y_test, y_pred)
ax1.plot([0, 2000], [0, 2000], '--k')  # identity line = perfect prediction
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 2000])
ax1.set_ylim([0, 2000])
f.suptitle("Synthetic data", y=0.035)