def test_transform_target_regressor_multi_to_single():
    """A func collapsing a multi-output target to a single output must
    yield 2D predictions of shape (n_samples, 1), whether the forward
    function returns a 1D or a 2D array."""
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def norm_as_2d(y):
        # Euclidean norm of the two targets, kept as a column vector.
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def identity(y):
        return y

    model = TransformedTargetRegressor(func=norm_as_2d,
                                       inverse_func=identity,
                                       check_inverse=False)
    model.fit(X, y)
    pred_from_2d = model.predict(X)
    assert pred_from_2d.shape == (100, 1)

    # force that the function only return a 1D array
    def norm_as_1d(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    model = TransformedTargetRegressor(func=norm_as_1d,
                                       inverse_func=identity,
                                       check_inverse=False)
    model.fit(X, y)
    pred_from_1d = model.predict(X)
    assert pred_from_1d.shape == (100, 1)
    assert_allclose(pred_from_1d, pred_from_2d)
def test_transform_target_regressor_count_fit(check_inverse):
    """Non-regression test for gh-issue #11618.

    The wrapped transformer must be fitted exactly once, regardless of
    whether the inverse check is enabled.
    """
    X, y = friedman
    regr = TransformedTargetRegressor(
        transformer=DummyTransformer(), check_inverse=check_inverse
    )
    regr.fit(X, y)
    assert regr.transformer_.fit_counter == 1
def test_transform_target_regressor_pass_fit_parameters():
    """Extra ``fit`` kwargs must be forwarded to the wrapped regressor."""
    X, y = friedman
    model = TransformedTargetRegressor(
        transformer=DummyTransformer(),
        regressor=DummyRegressorWithExtraFitParams(),
    )
    model.fit(X, y, check_input=False)
    assert model.transformer_.fit_counter == 1
def test_transform_target_regressor_count_fit(check_inverse):
    # Non-regression test for gh-issue #11618: the transformer's ``fit``
    # must be invoked a single time during
    # ``TransformedTargetRegressor.fit``.
    X, y = friedman
    estimator = TransformedTargetRegressor(
        transformer=DummyTransformer(),
        check_inverse=check_inverse,
    )
    estimator.fit(X, y)
    assert estimator.transformer_.fit_counter == 1
def predict_by_pos(pos, year):
    """Predict NBA mean PER for final-year college players at a position.

    Steps: impute missing per-game stats for all final-year players at
    ``pos`` (writing imputed values back into the module-level ``df``),
    min-max scale qualified players (>15 min/game, >25 games), predict
    which players make the NBA for each draft class, then train an SGD
    regressor on every other year to predict standardized mean PER for
    the players from ``year``.

    NOTE(review): relies on module-level ``df`` and the helpers
    ``predict_make_nba`` / ``transform_train_data`` defined elsewhere;
    ``df`` is mutated in place with the imputed values.

    Returns a DataFrame of predictions merged with player metadata.
    """
    features_list = ['g', 'gs', 'mp_per_g', 'fg_per_g', 'fga_per_g',
                     'fg_pct', 'fg2_per_g', 'fg2a_per_g', 'fg2_pct',
                     'fg3_per_g', 'fg3a_per_g', 'fg3_pct', 'ft_per_g',
                     'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g',
                     'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g',
                     'tov_per_g', 'pf_per_g', 'pts_per_g', 'tenure',
                     'height', 'weight', 'sos', 'srs', 'ows', 'dws', 'ws',
                     'ts_pct', 'usg_pct', 'bpm', 'pprod']
    # Impute missing values for all final-year players at this position
    # and write the imputed stats back into the global frame.
    X = df[(df['pos'] == pos) & (df['is_final_year'])]
    X = X[features_list]
    X_imp = IterativeImputer(max_iter=10).fit_transform(X)
    X = pd.DataFrame(X_imp, index=X.index, columns=X.columns)
    df.loc[X.index, X.columns] = X
    # Keep only players with meaningful playing time.
    X = df[(df['is_final_year']) & (df['pos'] == pos) &
           (df['mp_per_g'] > 15) & (df['g'] > 25)][features_list]
    # Min-max scale every feature to [0, 1].
    X = (X - X.min()) / (X.max() - X.min())
    # Collect, per draft class, the players predicted to make the NBA.
    # BUG FIX: ``DataFrame.append`` was removed in pandas 2.0 — build
    # the frame with a single concat instead of appending in a loop.
    predicted_to_nba = pd.concat(
        [predict_make_nba(yr, X) for yr in range(1996, 2020)]
    )
    ##################################################
    ##PER Regression##
    # Train the algorithm on players NOT in the given year.
    clf1 = SGDRegressor(alpha=.01, penalty='elasticnet')
    features_list = X.columns.tolist()
    # Dataframe of NCAA players predicted to make the NBA.
    df2 = predicted_to_nba
    X2 = transform_train_data(df2[features_list])
    y2 = df2[['mean_per']].loc[X2.index]
    # Drop near-constant features plus games-started.
    to_drop = list(X2.columns[X2.var() < .1])
    to_drop += ['gs']
    X2.drop(to_drop, axis=1, inplace=True)
    # Standardize the features.
    X2 = (X2 - X2.mean()) / X2.std()
    # Split out the rows for the year we want to predict; train on the
    # remaining 1996-2017 classes.
    X_new_pred = X2[df2.loc[X2.index]['year'] == year]
    X2 = X2[(df2.loc[X2.index]['year'] != year) &
            (df2.loc[X2.index]['year'] < 2018) &
            (df2.loc[X2.index]['year'] > 1995)]
    y2 = y2.loc[X2.index]
    y_new_pred = df2[['mean_per']].loc[X_new_pred.index]
    # Standardize targets with the training-set statistics.
    y_new_pred = (y_new_pred - y2.mean()) / y2.std()
    y2 = (y2 - y2.mean()) / y2.std()
    X2_train, X2_test, y2_train, y2_test = train_test_split(
        X2, y2, test_size=0.25, stratify=df2.loc[y2.index]['tier'])
    clf2 = TransformedTargetRegressor(clf1)
    clf2.fit(X2_train, y2_train)
    # Predict PER for players in the given year.
    X_new_pred = X_new_pred[X2.columns.tolist()]
    new_pred = clf2.predict(X_new_pred)
    new_pred_curr_year = pd.DataFrame(
        new_pred, index=X_new_pred.index
    ).merge(df.iloc[:, :-8], left_index=True, right_index=True)
    return new_pred_curr_year
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):
    """Nested-CV ridge-PCA regression with a power-transformed target.

    For each outer split, an inner grid search over ridge alphas is run
    on a VarianceThreshold -> PCA -> Ridge pipeline; the pipeline is
    wrapped in a TransformedTargetRegressor so ``y`` is transformed with
    ``PowerTransformer(method=transform)`` during fitting and mapped
    back at prediction time.

    Returns ``(y_pred, y_true, list_models)`` concatenated over the
    outer folds.
    """
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from joblib import Memory
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import PowerTransformer

    # Find alpha range
    alphas = find_alpha_range(X, y, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []
    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        list_y_true.append(y_test)
        # Inner CV for the grid search (project-defined stratified K-fold
        # for regression targets).
        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True,
                                      random_state=0)
        # Cache fitted pipeline steps on disk so the grid search avoids
        # refitting the shared VarianceThreshold/PCA for every alpha.
        tmpfolder = mkdtemp()
        memory = Memory(location=tmpfolder)
        pip = make_pipeline(VarianceThreshold(), PCA(),
                            Ridge(max_iter=1e6), memory=memory)
        grid = GridSearchCV(pip, param_grid={'ridge__alpha': alphas},
                            cv=cv_inner, n_jobs=-1,
                            scoring="neg_mean_squared_error")
        regr_trans = TransformedTargetRegressor(
            regressor=grid, transformer=PowerTransformer(method=transform))
        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)
        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)
        # Drop the on-disk cache for this fold.
        memory.clear(warn=False)
        shutil.rmtree(tmpfolder)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)
    return y_pred, y_true, list_models
def test_transform_target_regressor_pass_extra_predict_parameters():
    """Extra ``predict`` kwargs must reach the wrapped regressor."""
    X, y = friedman
    model = TransformedTargetRegressor(
        transformer=DummyTransformer(),
        regressor=DummyRegressorWithExtraPredictParams(),
    )
    model.fit(X, y)
    model.predict(X, check_input=False)
    assert model.regressor_.predict_called
def main():
    """Fit AdaBoost/GBM models on degree-5 polynomial features of random
    data, with a min-max scaled target, and plot each boosting stage.

    NOTE(review): assumes a module-level ``f`` producing the target —
    confirm its definition elsewhere in the file.
    """
    # Generate random data : create random points, and, keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)
    # Create bagging and random forest models.
    fig, axes = plt.subplots(2, 2, figsize=(20, 10))
    models = [AdaBoostRegressor(n_estimators=5, base_estimator=KNeighborsRegressor()),
              AdaBoostRegressor(n_estimators=5, base_estimator=SVR()),
              AdaBoostRegressor(n_estimators=5, base_estimator=KernelRidge(kernel='rbf')),
              GradientBoostingRegressor()]
    for axis, model in zip(axes.ravel(), models):
        # Set title.
        title = model.__class__.__name__
        reg_params = model.get_params()
        if 'base_estimator' in reg_params:
            # GradientBoostingRegressor has no 'base_estimator'.
            title += ', estimator: '+reg_params['base_estimator'].__class__.__name__
        axis.set_title(title)
        # Plot random data.
        axis.plot(x, y, 'o', color='black', markersize=2, label='random data')
        # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
        x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T
        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        # Data scaling applied before / after any operator applied to the model.
        pipe = Pipeline([('scale', preprocessing.StandardScaler()), ('model', model)])
        y_transformer = preprocessing.MinMaxScaler().fit(y.reshape(-1, 1))
        # Target scaling applied before / after any operator applied to the model.
        treg = TransformedTargetRegressor(regressor=pipe, transformer=y_transformer)
        # Train model.
        treg.fit(x_augmented, y)
        # Plot intermediate regression estimations.
        if isinstance(model, AdaBoostRegressor):
            for i, tree in enumerate(treg.regressor_['model'].estimators_):
                # x input after scaling (as tree does not use Pipeline).
                x_augmented_scaled = treg.regressor_['scale'].transform(x_augmented)
                # y outcome before scaling (as tree does not use TransformedTargetRegressor).
                y_hat = tree.predict(x_augmented_scaled)
                y_pred = y_transformer.inverse_transform(y_hat.reshape(-1, 1))
                axis.plot(x, y_pred, '--', label='tree '+str(i))
                axis.axis('off')
                axis.legend()
        # Plot final regression.
        axis.plot(x, treg.predict(x_augmented), '-', color='black', label=model.__class__.__name__)
        axis.axis('off')
        axis.legend()
    plt.show()
def test_transform_target_regressor_ensure_y_array():
    # The target ``y`` given to the transformer must always be a numpy
    # array, while ``X`` passed as a list must reach the regressor as-is.
    X, y = friedman
    model = TransformedTargetRegressor(
        transformer=DummyCheckerArrayTransformer(),
        regressor=DummyCheckerListRegressor(),
        check_inverse=False,
    )
    model.fit(X.tolist(), y.tolist())
    model.predict(X.tolist())
    # Passing arrays where lists are expected must trip the dummy
    # checkers' assertions.
    assert_raises(AssertionError, model.fit, X, y.tolist())
    assert_raises(AssertionError, model.predict, X)
class SemiSup_RandomizedSearchCV(BaseEstimator):
    """Randomized hyper-parameter search with semi-supervised CV folds.

    Mimics sklearn's RandomizedSearchCV, but marks the test-fold targets
    as unlabeled (-1) through a TransformedTargetRegressor ``func`` so
    an (optionally pseudo-labeling) estimator can train semi-supervised.

    NOTE(review): relies on a module-level ``pseudo_fit`` function that
    is grafted onto the regressor — confirm its definition elsewhere.
    """

    def __init__(self, estimator, param_distributions, n_iter=100, cv=5,
                 scoring=metrics.accuracy_score, pseudo=True):
        # We initialize our class similar to sklearn randomized search.
        self.estimator = estimator
        # BUG FIX: ``self.scoring`` was assigned twice with the same
        # value; keep a single assignment.
        self.scoring = scoring
        self.pseudo = pseudo
        # Placeholder wrapper; rebuilt per fold inside ``fit`` with the
        # real train/test mask. ``check_inverse=False`` because the
        # unlabeling transform is deliberately not invertible.
        self.transformedtargetestimator = TransformedTargetRegressor(
            regressor=estimator,
            func=lambda x: x if np.random.rand() > 1/cv else -1,
            inverse_func=lambda x: x,
            check_inverse=False)
        self.sampler = ParameterSampler(param_distributions, n_iter)
        self.cv_results_ = pd.DataFrame(
            {'mean_test_score': np.empty(shape=[0]),
             'std_test_score': np.empty(shape=[0]),
             'mean_score_time': np.empty(shape=[0]),
             'std_score_time': np.empty(shape=[0]),
             'params': None})
        self.folds = KFold(n_splits=cv)

    def fit(self, X, y, sample_weight=None):
        """Evaluate every sampled parameter set with K-fold CV.

        Results accumulate into ``cv_results_``, sorted by mean test
        score (best first). Returns ``self``.
        """
        for params in self.sampler:
            # Update Parameters
            self.estimator.set_params(**params)
            # Reset Scores
            scores = []
            times = []
            for train_index, test_index in self.folds.split(X):
                # Create semi-supervised sampler: targets outside the
                # train fold are replaced by -1 (unlabeled).
                self.transformedtargetestimator = TransformedTargetRegressor(
                    regressor=self.estimator,
                    func=lambda x: np.where(
                        np.in1d(x.index, train_index), x, -1),
                    inverse_func=lambda x: x,
                    check_inverse=False)
                # Fit
                if self.pseudo:
                    # Graft the pseudo-labeling fit onto the regressor.
                    self.transformedtargetestimator.regressor.pseudo_fit = \
                        pseudo_fit.__get__(
                            self.transformedtargetestimator.regressor)
                    self.transformedtargetestimator = \
                        self.transformedtargetestimator.regressor.pseudo_fit(
                            X, self.transformedtargetestimator.func(y))
                else:
                    self.transformedtargetestimator.fit(X, y, sample_weight)
                # Score on the held-out fold only.
                score_index = np.in1d(y.index, test_index)
                start = time()
                scores.append(self.scoring(
                    y[score_index],
                    self.transformedtargetestimator.predict(
                        X=X[score_index])))
                times.append(time() - start)
            # BUG FIX: ``DataFrame.append`` was removed in pandas 2.0;
            # concatenate the one-row summary frame instead.
            self.cv_results_ = pd.concat([
                self.cv_results_,
                pd.DataFrame({'mean_test_score': np.mean(scores),
                              'std_test_score': np.std(scores),
                              'mean_score_time': np.mean(times),
                              'std_score_time': np.std(times),
                              'params': [params]})])
        self.cv_results_ = self.cv_results_.sort_values(
            'mean_test_score', ascending=False).reset_index(drop=True)
        return self
class TransformedTargetRegressorImpl:
    """Thin adapter exposing ``Op`` behind a fit/predict interface."""

    def __init__(self, **hyperparams):
        # Keep the raw hyper-parameters and build the wrapped model.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)
def test_transform_target_regressor_invertible():
    # func/inverse_func pairs that are not true inverses must warn when
    # check_inverse is on, and stay silent when it is switched off.
    X, y = friedman
    expected_msg = ("The provided functions or transformer are not "
                    "strictly inverse of each other.")
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt, inverse_func=np.log,
                                      check_inverse=True)
    with pytest.warns(UserWarning, match=expected_msg):
        regr.fit(X, y)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt, inverse_func=np.log)
    regr.set_params(check_inverse=False)
    assert_no_warnings(regr.fit, X, y)
def test_transform_target_regressor_2d_transformer(X, y):
    """Consistency of a 2D-only transformer (StandardScaler) wrapped by
    TransformedTargetRegressor, for both 1D and 2D ``y``."""
    # Check consistency with transformer accepting only 2D array and a 1D/2D y
    # array.
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    if y.ndim == 1:
        # create a 2D array and squeeze results
        y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor: fitting a bare LinearRegression on
    # the independently transformed target must reproduce the wrapped
    # model's predictions and coefficients.
    lr = LinearRegression()
    transformer2 = clone(transformer)
    if y.ndim == 1:
        # create a 2D array and squeeze results
        lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze())
    else:
        lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_transform_target_regressor_2d_transformer(X, y):
    """Same contract as the sibling test above: a transformer that only
    accepts 2D arrays must work with both 1D and 2D targets.

    NOTE(review): duplicate definition — an earlier function of the same
    name exists in this file and is shadowed by this one.
    """
    # Check consistency with transformer accepting only 2D array and a 1D/2D y
    # array.
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    if y.ndim == 1:
        # create a 2D array and squeeze results
        y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    if y.ndim == 1:
        # create a 2D array and squeeze results
        lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze())
    else:
        lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
    # validate=False lift this constraint without checking that the input is a
    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
    # array.
    transformer = FunctionTransformer(
        func=lambda x: x + 1, inverse_func=lambda x: x - 1
    )
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=transformer
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor: fitting directly on the transformed
    # target must reproduce the wrapped model's predictions and coefs.
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
    # validate=False lift this constraint without checking that the input is a
    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
    # array.
    # NOTE(review): duplicate definition — an earlier function of the
    # same name exists in this file and is shadowed by this one.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1,
                                      validate=False)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def rf_prediction(self):
    """
    uses ensemble (Random Forest) method to predict crab age
    :return:
    """
    logger.info("running Random Forest model")
    X = self.crab_data.drop("age", axis=1)
    y = self.crab_data[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    #
    # Column-wise dtype masks: float columns get robust scaling, the
    # rest are one-hot encoded.
    numerical_features = X_train.dtypes == 'float'
    categorical_features = ~numerical_features
    # I used pipelining so that the predicted values were automatically
    # transformed/scaled back
    preprocess = make_column_transformer(
        (RobustScaler(), numerical_features),
        (OneHotEncoder(sparse=False), categorical_features))
    forest = RandomForestRegressor(n_estimators=5000, max_depth=20,
                                   min_samples_leaf=2, min_samples_split=4,
                                   random_state=100)
    f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
    # No transformer argument: the TransformedTargetRegressor is an
    # identity wrapper around the pipeline here.
    f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
    f_reg_ttr.fit(X_train, y_train)
    s = f_reg_ttr.score(X_test, y_test)
    logger.info("R-squared from Random Forest is: {0}".format(s))
    y_pred = f_reg_ttr.predict(X)
    # NOTE(review): metrics below are computed on the FULL dataset
    # (train + test), so they are optimistic — confirm intent. Also note
    # ``mse`` actually holds the RMSE (sqrt applied).
    mse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("RandomForest MAE: {0}".format(mae))
    logger.debug("RandomForest RMSE: {0}".format(mse))
    logger.debug("RandomForest R-squared: {0}".format(s))
    # recreate the original dataset
    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_forest"] = pd.Series(y_pred.ravel())
    crab_df["percentage_difference"] = np.abs(
        np.divide(
            (crab_df["age"] - crab_df["age_forest"]), crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_forest.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_forest.csv"))
    logger.info("Random Forest execution finished")
def ols_prediction(self):
    """
    uses linear regression after standardising to normal dist
    prints out accuracy metrics and then saves the design matrix with y and
    predicted y as a csv file
    also creates another column to calculate relative percentage difference
    between y and predicted y
    :return:
    """
    logger.info("running Linear Regression model")
    crab_df_woo = self.pre_process_data()
    transformer = QuantileTransformer(output_distribution='normal')
    # since I observed that the data was skewed, I decided to transform the
    # continuous variables to normal dist
    reg = linear_model.LinearRegression()
    t_reg = TransformedTargetRegressor(regressor=reg,
                                       transformer=transformer)
    # One-hot encode the categorical columns (category_encoders).
    ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True,
                           drop_invariant=True)
    crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
    X = crab_df_woo_enc.drop("age", axis=1)
    y = crab_df_woo_enc[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    t_reg.fit(X_train, y_train)
    s = t_reg.score(X_test, y_test)
    logger.info("R-squared from Linear Regression is: {0}".format(s))
    y_pred = t_reg.predict(X)
    # NOTE(review): metrics below are computed on the FULL dataset
    # (train + test) — optimistic; also ``mse`` actually holds the RMSE.
    mse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("Linear Regression MAE: {0}".format(mae))
    logger.debug("Linear Regression RMSE: {0}".format(mse))
    logger.debug("Linear Regression R-squared: {0}".format(s))
    # Rebuild the original frame with actuals, predictions, and the sex
    # column reconstructed from its one-hot encoding.
    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_ols"] = pd.Series(y_pred.ravel())
    crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row), axis=1)
    crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
    crab_df["percentage_difference"] = np.abs(
        np.divide(
            (crab_df["age"] - crab_df["age_ols"]), crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_ols.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_ols.csv"))
    logger.info("Linear Regression execution finished")
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):
    """Nested-CV Lasso-PCR with a power-transformed target.

    The alpha grid is derived from a power-transformed copy of ``y``;
    each outer fold fits a LassoPCR (with inner CV) wrapped in a
    TransformedTargetRegressor so the target transform is learned on
    training data only.

    Returns ``(y_pred, y_true, list_models)`` concatenated over the
    outer folds.
    """
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Lasso
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import PowerTransformer

    # Compute the alpha range on the transformed target so it matches
    # the scale the Lasso will actually see during fitting.
    y_trans = PowerTransformer(method=transform).fit_transform(
        y[:, None]).flatten()
    alphas = find_alpha_range(X, y_trans, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []
    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        list_y_true.append(y_test)
        # Inner CV (project-defined stratified K-fold for regression).
        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True,
                                      random_state=0)
        lasso_pcr = LassoPCR(scale=False, cv=cv_inner, n_jobs=-1,
                             alphas=alphas, lasso_kws={'max_iter': 1e6},
                             scoring="neg_mean_squared_error")
        regr_trans = TransformedTargetRegressor(
            regressor=lasso_pcr,
            transformer=PowerTransformer(method=transform))
        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)
        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)
    return y_pred, y_true, list_models
def create_scatter_df(profile, threshold):
    """
    INPUT:
    profile: profile dataframe
    threshold: Threshold to remove data that can be identified as outliers

    DESCRIPTION:
    Function to remove outliers from profile dataset and predict the
    spendings of the customers.

    OUTPUT:
    result: DataFrame with predictions and actual values
    mse: The mean squared error of the predictions
    """
    scaler = MinMaxScaler()
    # BUG FIX: work on an explicit copy — scaling a slice of ``profile``
    # in place triggers pandas' SettingWithCopyWarning and may silently
    # mutate (or fail to mutate) the caller's frame.
    prediction_df = profile[[
        "age", "income", "memberdays", "gender_F", "gender_M",
        "overall_spendings"
    ]].copy()
    prediction_df[["age", "income", "memberdays"]] = scaler.fit_transform(
        prediction_df[["age", "income", "memberdays"]])
    # Drop outlier spenders above the threshold.
    prediction_df = prediction_df[
        prediction_df["overall_spendings"] < threshold]
    X = prediction_df.drop("overall_spendings", axis=1)
    y = prediction_df["overall_spendings"]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=42)
    # log1p/expm1 keep the skewed spendings target well-behaved for
    # the ridge model; predictions are mapped back automatically.
    regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                            func=np.log1p,
                                            inverse_func=np.expm1)
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)
    y_test.reset_index(drop=True, inplace=True)
    result = pd.concat([y_test, pd.Series(y_pred)], axis=1)
    result.rename(columns={
        0: "prediction",
        "overall_spendings": "actual_value"
    }, inplace=True)
    mse = mean_squared_error(y_test, y_pred)
    return result, mse
def run_experiment(
        exp_name,
        models,
        folds,
        train_seasons,
        test_seasons,
        X,
        y,
        preprocessor=None,
        #print_exp_progress=None,
        calculate_metrics_func=calculate_clf_metrics,
        algorithm_type='clf'):
    """Cross-validate each model over the pre-built season folds.

    For ``algorithm_type='reg'`` the preprocessing pipeline is wrapped
    in a TransformedTargetRegressor with a standard-scaled target;
    per-fold metrics come from ``calculate_metrics_func``.

    Returns ``(names, results)``: model names and per-fold metric dicts.

    NOTE(review): appends aggregate rows to module-level
    ``reg_exp_results`` / ``exp_results`` lists — confirm they are
    defined in the enclosing module.
    """
    results = []
    names = []
    print("Running experiment", exp_name)
    for name, current_model in models:
        cv_results = defaultdict(list)
        for train_idx, test_idx in folds:
            X_train, X_test = X.loc[train_idx], X.loc[test_idx]
            #X_train, X_test = utils.scale_X(X, y, train_idx, test_idx)
            y_train, y_test = y.loc[train_idx], y.loc[test_idx]
            y_true = y_test
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('model', current_model)])
            if algorithm_type == 'reg':
                # Regression targets are standardized around the model.
                model = TransformedTargetRegressor(
                    regressor=pipeline,
                    transformer=StandardScaler())
            else:
                model = pipeline
            fit_info = model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            fold_metric_results = calculate_metrics_func(y_true, y_pred)
            for key, value in fold_metric_results.items():
                cv_results[key].append(value)
        exp_result = {
            "exp_name": exp_name,
            "model": name,
            **agg_metrics(cv_results.keys(), cv_results)
        }
        if algorithm_type == 'reg':
            reg_exp_results.append(exp_result)
        else:
            exp_results.append(exp_result)
        cv_results["model"] = [name] * len(folds)
        cv_results["season_train"] = train_seasons
        cv_results["season_test"] = test_seasons
        results.append(cv_results)
        names.append(name)
    print("Done")
    return names, results
def regression(data):
    """Train and evaluate regressors on the pre-engineered grid dataset.

    Only the random forest variant is currently active; the other model
    families are kept commented out for reference. Returns a dict of
    model name -> evaluation result from ``train_eval_model``.
    """
    # Create the training and test sets
    train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
    # Build the feature matrices and label vectors
    X_train = train_set.loc[:, :'pressure_max^3']
    y_train = train_set['grid']
    X_test = test_set.loc[:, :'pressure_max^3']
    y_test = test_set['grid']
    # print("regresja liniowa")
    # # Linear regression model
    # lin_reg = TransformedTargetRegressor(regressor=LinearRegression(), transformer=MinMaxScaler())
    # lin_reg.fit(X_train, y_train)
    # lin_reg_result = train_eval_model(lin_reg, "Transformed Linear Regressor", X_train, y_train, X_test, y_test)
    # print("SGD")
    # # SGD
    # sgd_reg = TransformedTargetRegressor(regressor=SGDRegressor(), transformer=MinMaxScaler())
    # sgd_reg.fit(X_train, y_train)
    # sgd_reg_result = train_eval_model(sgd_reg, "Transformed SGD Regressor", X_train, y_train, X_test, y_test)
    print("las losowy n_estimators = 500")
    # Random forest model (the only active estimator)
    forest_reg = TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=42, n_jobs=-1,
                                        n_estimators=500),
        transformer=MinMaxScaler())
    forest_reg.fit(X_train, y_train)
    forest_result = train_eval_model(forest_reg, "Random Forest Regressor", X_train, y_train, X_test, y_test)
    # print('svr')
    # # SVR
    # svm_reg = TransformedTargetRegressor(regressor=LinearSVR(random_state=42, dual=False), transformer=MinMaxScaler())
    # svm_reg.fit(X_train, y_train)
    # svm_results = train_eval_model(svm_reg, 'SVM-rbf', X_train, y_train, X_test, y_test)
    # print('nn')
    # # NN
    # nn_reg = TransformedTargetRegressor(regressor=MLPRegressor(random_state=42), transformer=MinMaxScaler())
    # nn_reg.fit(X_train, y_train)
    # nn_results = train_eval_model(nn_reg, 'NN', X_train, y_train, X_test, y_test)
    final_results = {
        "RandomForestRegressor": forest_result,
        # "SvmRbfRegressor": svm_results,
        # "MLPRegressor": nn_results
    }
    return final_results
def run():
    """Train a quantile-transformed linear regression on JabRef click data.

    Loads the training CSV, label-encodes every column, keeps features
    whose absolute correlation with ``set_clicked`` exceeds 0.01, fits a
    TransformedTargetRegressor, reports RMSE/F1, and saves predictions
    to ``david_results.csv``.
    """
    # -- LOAD FILE -- #
    filename = "Data/JabRef_train.csv"
    df = pd.read_csv(filename, dtype=object)
    df.dropna(inplace=True)
    # # - Set target - #
    y = df["set_clicked"]
    # # --- Label Encoder --- #
    encoder = preprocessing.LabelEncoder()
    df = df.apply(encoder.fit_transform)
    # # -- Get relevant columns -- #
    cor = df.corr()
    cor_target = abs(cor["set_clicked"])
    relevant_features = cor_target[cor_target > 0.01]
    print(relevant_features.index)
    # # -- Normal Distribution, Reduce impact of outliers -- #
    transformer = QuantileTransformer(output_distribution="normal")
    X = df[relevant_features.index]
    # BUG FIX: the axis was passed positionally (deprecated since pandas
    # 1.0, removed in 2.0); name the columns explicitly instead.
    X = X.drop(columns=["set_clicked"])
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.4,
                                                        random_state=0)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    print(df)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    y = y_test.to_numpy(dtype=int)
    # NOTE(review): ``y_pred`` is continuous while f1_score expects
    # discrete labels — confirm predictions are binarised upstream.
    print(f"F1 Score: {f1_score(y, y_pred)}")
    df[["Actual", "Predicted"]].to_csv("david_results.csv")
def test_transform_target_regressor_error(): X, y = friedman # provide a transformer and functions at the same time regr = TransformedTargetRegressor(regressor=LinearRegression(), transformer=StandardScaler(), func=np.exp, inverse_func=np.log) with pytest.raises(ValueError, match="'transformer' and functions" " 'func'/'inverse_func' cannot both be set."): regr.fit(X, y) # fit with sample_weight with a regressor which does not support it sample_weight = np.ones((y.shape[0], )) regr = TransformedTargetRegressor(regressor=OrthogonalMatchingPursuit(), transformer=StandardScaler()) with pytest.raises(TypeError, match=r"fit\(\) got an unexpected " "keyword argument 'sample_weight'"): regr.fit(X, y, sample_weight=sample_weight) # func is given but inverse_func is not regr = TransformedTargetRegressor(func=np.exp) with pytest.raises(ValueError, match="When 'func' is provided, " "'inverse_func' must also be provided"): regr.fit(X, y)
def main():
    """Compare polynomial (linear) regression and k-NN on random data.

    Both models run on degree-5 polynomial features with standardized
    inputs and a min-max scaled target, then plot their fits.

    NOTE(review): assumes a module-level ``f`` producing the target —
    confirm its definition elsewhere in the file.
    """
    # Generate random data : create random points, and, keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)
    # Plot random data.
    plt.plot(x, y, 'o', color='black', markersize=2, label='random data')
    # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
    x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T
    # Polynomial regression : regression on augmented data.
    regrs = []
    regrs.append((linear_model.LinearRegression(), 'polynomial reg'))
    regrs.append((neighbors.KNeighborsRegressor(15), '15-NN reg'))
    for regr in regrs:
        model, lbl = regr[0], regr[1]
        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        # Data scaling applied before / after any operator applied to the model.
        pipe = Pipeline(
            [('scale', preprocessing.StandardScaler()), ('model', model)]
        )
        # Target scaling applied before / after any operator applied to the model.
        treg = TransformedTargetRegressor(
            regressor=pipe, transformer=preprocessing.MinMaxScaler()
        )
        # Train model.
        treg.fit(x_augmented, y)
        # Plot regression.
        plt.plot(x_augmented[:, 0], treg.predict(x_augmented), '-', label=lbl)
    plt.axis('off')
    plt.legend()
    plt.show()
class Regressor(BaseEstimator):
    """Random-forest regressor trained on a log10-compressed target.

    Targets are clipped to a minimum of 1 before taking log10, so the
    forward transform is always defined; predictions are mapped back
    with ``10 ** u``. The inverse check is disabled because clipping
    makes the pair non-invertible below 1.
    """

    def __init__(self):
        self.MYReg = TransformedTargetRegressor(
            regressor=RandomForestRegressor(n_estimators=30, max_depth=12),
            func=lambda v: np.log10(np.clip(v, a_min=1, a_max=None)),
            inverse_func=lambda v: np.power(10, v),
            check_inverse=False,
        )

    def fit(self, X, y):
        """Fit the wrapped estimator and return it."""
        return self.MYReg.fit(X, y)

    def predict(self, X):
        """Predict on the original (un-logged) scale."""
        return self.MYReg.predict(X)
def calculate_effort(X, Y, project, task, model_type, transformer,
                     regressor, i_records, t_records):
    """Fit a scaled regression pipeline and cross-validate effort
    predictions for one project/task/model combination.

    Missing values are zero-filled (their percentage is recorded first),
    the features and target are both passed through ``transformer`` via
    a TransformedTargetRegressor, and performance is measured with
    k-fold cross-validated predictions.

    Returns a one-row summary DataFrame built by ``createDF``.
    """
    # Record the share of missing values before zero-filling.
    p_na = utils.percentage_nan(X)
    X.fillna(0, inplace=True)
    Y.fillna(0, inplace=True)
    # Let's create multiple regression
    print("\n{0} - {1} - {2} model performance: \n".format(
        project, task, model_type))
    # Cap the number of CV splits at the number of records.
    splits = 10
    num_records = len(X)
    if num_records <= splits:
        splits = num_records
    pipeline = Pipeline(steps=[('scaler', transformer),
                               ('predictor', regressor)])
    model = TransformedTargetRegressor(regressor=pipeline,
                                       transformer=transformer)
    model.fit(X, Y)
    kfold = model_selection.KFold(n_splits=splits)
    predictions = cross_val_predict(model, X, Y, cv=kfold)
    results = utils.create_percent_error_df(Y, predictions)
    r_squared, r_squared_adj, mae, mse, rmse, pred25, pred50 = extractPerfMeasures(
        model, Y, predictions, results, X)
    row = createDF(project, model_type, task, r_squared, r_squared_adj,
                   mae, mse, rmse, pred25, pred50, t_records,
                   i_records - t_records, p_na)
    return row
def test_model_finder_predict_X_test_regression(model_finder_regression_fitted,
                                                split_dataset_numerical,
                                                limit, seed):
    """Testing if predictions of X_test split from found models are correct (in regression)."""
    # Rebuild, independently of the model finder, the estimators it is
    # expected to have produced (same hyper-parameters and seed).
    models = [
        SVR(**{
            "C": 0.1,
            "tol": 1.0
        }),
        Ridge(**{
            "alpha": 0.0001,
            "random_state": seed
        }),
        DecisionTreeRegressor(**{
            "max_depth": 10,
            "criterion": "mae",
            "random_state": seed
        }),
    ]
    results = []
    X_train, X_test, y_train, y_test = split_dataset_numerical
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=seed)
    # Each model is wrapped the same way the finder wraps them, then the
    # X_test predictions are collected as the expected reference.
    for model in models:
        new_model = TransformedTargetRegressor(regressor=model,
                                               transformer=transformer)
        new_model.fit(X_train, y_train)
        results.append((model, new_model.predict(X_test)))
    expected_results = results[:limit]
    actual_results = model_finder_regression_fitted.predictions_X_test(limit)
    # Compare model identity (via repr) and exact prediction arrays.
    for actual_result, expected_result in zip(actual_results,
                                              expected_results):
        assert str(actual_result[0]) == str(expected_result[0])
        assert np.array_equal(actual_result[1], expected_result[1])
def test_transform_target_regressor_functions_multioutput():
    # A log/exp func pair with a two-column target: transformer output,
    # inverse transform, prediction shape and regressor coefficients
    # must all agree with fitting a bare LinearRegression on log(y).
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log, inverse_func=np.exp)
    y_pred = model.fit(X, y).predict(X)
    # check the transformer output
    y_transformed = model.transformer_.transform(y)
    assert_allclose(np.log(y), y_transformed)
    assert_allclose(y, model.transformer_.inverse_transform(y_transformed))
    assert y.shape == y_pred.shape
    assert_allclose(y_pred,
                    model.inverse_func(model.regressor_.predict(X)))
    # check the regressor output
    reference = LinearRegression().fit(X, model.func(y))
    assert_allclose(model.regressor_.coef_.ravel(),
                    reference.coef_.ravel())
def test_transform_target_regressor_functions():
    # func/inverse_func (log/exp) must round-trip a 1D target and the wrapped
    # regressor must be equivalent to fitting directly on func(y).
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.log, inverse_func=np.exp
    )
    y_pred = regr.fit(X, y).predict(X)

    # The internal transformer works on 2D columns, hence reshape/squeeze.
    transformed = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    assert_allclose(np.log(y), transformed)
    restored = regr.transformer_.inverse_transform(transformed.reshape(-1, 1))
    assert_allclose(y, restored.squeeze())

    # Predictions keep the target's shape and equal inverse_func(inner output).
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))

    # The inner regressor matches a LinearRegression fitted on func(y).
    reference = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), reference.coef_.ravel())
def tlr_reg(X_train, X_test, y_train, y_test): ''' Transformed Linear Regression #n_quantiles needs to be smaller than the number of samples (standard is 1000) ''' transformer = QuantileTransformer(n_quantiles=750, output_distribution='normal') regressor = LinearRegression(n_jobs=-1) #Initialize the transformed target regressor regr = TransformedTargetRegressor(regressor=regressor, transformer=transformer) regr.fit(X_train, y_train) # raw LinearRegressor for comparison raw_target_regr = LinearRegression(n_jobs=-1).fit(X_train, y_train) #Print the best value combination print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test))) print('unprocessed R2-score: {0:.3f}'.format( raw_target_regr.score(X_test, y_test))) return regr, raw_target_regr '''
def test_transform_target_regressor_3d_target():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/18866
    # A transformer that flattens/unflattens a 3D target must leave the
    # prediction shape equal to the target shape.
    X = friedman[0]
    y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2])

    def to_2d(data):
        # Collapse everything after the sample axis.
        return data.reshape(data.shape[0], -1)

    def to_3d(data):
        # Restore the trailing (..., 2) axis dropped by to_2d.
        return data.reshape(data.shape[0], -1, 2)

    reshaper = FunctionTransformer(func=to_2d, inverse_func=to_3d)
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=reshaper
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
def test_transform_target_regressor_2d_transformer_multioutput():
    # A transformer that only accepts 2D arrays must work with a 2D y array.
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T
    scaler = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=scaler)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # Forward transform standardizes the target...
    y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # ...and the inverse transform restores it.
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())

    # The wrapped regressor is equivalent to fitting on the scaled target
    # and inverting the scaled predictions.
    reference = LinearRegression()
    scaler_copy = clone(scaler)
    reference.fit(X, scaler_copy.fit_transform(y))
    assert_allclose(y_pred,
                    scaler_copy.inverse_transform(reference.predict(X)))
    assert_allclose(regr.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_2d_transformer_multioutput():
    # NOTE(review): this re-defines the test of the same name directly above;
    # the later definition shadows the earlier one at import time, so only
    # this copy ever runs — consider deleting one of the two.
    # Check consistency with transformer accepting only 2D array and a 2D y
    # array.
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    transformer = StandardScaler()
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       transformer=transformer)
    y_pred = model.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # Consistency of the forward transform.
    standardized = model.transformer_.transform(y)
    _check_standard_scaled(y, standardized)
    assert y.shape == y_pred.shape
    # Consistency of the inverse transform.
    recovered = model.transformer_.inverse_transform(standardized)
    assert_allclose(y, recovered.squeeze())

    # Consistency of the regressor itself.
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(model.regressor_.coef_, lr.coef_)
# Left panel: predictions of the ridge model fitted on the raw target
# (regr, X_test, y_test, ax0/ax1 and figure f are defined earlier in the file).
y_pred = regr.predict(X_test)
ax0.scatter(y_test, y_pred)
# Diagonal y = x reference line for a perfect prediction.
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
# NOTE(review): the label says "MAE" but the value shown is the MEDIAN
# absolute error — confirm the label is intended.
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
# Axes clipped to [0, 2000]; assumes the target lies in that range — verify.
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

# Right panel: same kind of model, but the target is log1p-transformed
# before fitting and expm1-inverted at prediction time.
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)
ax1.scatter(y_test, y_pred)
ax1.plot([0, 2000], [0, 2000], '--k')
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 2000])
ax1.set_ylim([0, 2000])

# Figure-level title and layout tuning.
f.suptitle("Synthetic data", y=0.035)
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])
class WrappedModelRegression:
    """Wrapper for Models in Regression problems.

    Models get wrapped with TransformedTargetRegressor to transform y target
    before predictions on X features take place. Wrapper additionally
    customizes __name__, __class__ and __str__ methods/attributes to return
    those values from the main Model (not TransformedTargetRegressor).

    Attributes:
        clf (sklearn.compose.TransformedTargetRegressor): Wrapped model for
            regression problems
    """

    def __init__(self, regressor, transformer):
        """Create WrappedModelRegression object.

        Override __name__ and __class__ attributes with appropriate attributes
        from regressor.

        Args:
            regressor (sklearn.Model): Model used to predict regression target
            transformer (sklearn.Transformer): Transformer used to transform y (target)
        """
        self.clf = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
        self.__name__ = self.clf.regressor.__class__.__name__
        # NOTE(review): because this class also defines __class__ as a plain
        # method below (a non-data descriptor), this assignment appears to
        # land in the instance __dict__ rather than changing the actual type:
        # type(self) stays WrappedModelRegression while attribute access to
        # self.__class__ reports the regressor's class. Confirm this
        # impersonation is intended — it affects checks that read __class__.
        self.__class__ = self.clf.regressor.__class__

    def fit(self, *args, **kwargs):
        """Fit Model in clf attribute with provided arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            self
        """
        self.clf.fit(*args, **kwargs)
        return self

    def predict(self, *args, **kwargs):
        """Predict provided arguments with Model in clf attribute.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            numpy.ndarray: predictions
        """
        return self.clf.predict(*args, **kwargs)

    def get_params(self, *args, **kwargs):
        """Return params of regressor inside wrapped clf Model.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            dict: params of regressor
        """
        return self.clf.regressor.get_params(*args, **kwargs)

    def __str__(self):
        """Return __str__ method of regressor inside wrapped clf Model.

        Returns:
            str: __str__ method of regressor
        """
        return self.clf.regressor.__str__()

    def __class__(self, *args, **kwargs):
        """Return new object of regressor class instantiated with *args and **kwargs arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            regressor: new regressor object
        """
        # Shadowed on instances by the __init__ assignment above; calling
        # instance.__class__(...) therefore constructs a new regressor.
        return self.clf.regressor.__class__(*args, **kwargs)