def test_transform_target_regressor_multi_to_single():
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def func(y):
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def inverse_func(y):
        return y

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_2d_func = tt.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # force the function to return only a 1D array
    def func(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_1d_func = tt.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
def predict_deaths(self, s):
    s_train = s.loc[:self.last_training_year]
    num_predict = self.pred_year - self.last_training_year
    n_train = len(s_train)
    n = n_train + num_predict  # predict into the future
    y = s_train.values
    x = np.arange(n)
    X = x.reshape(-1, 1)
    X2 = np.column_stack((x, x**2, x**3))
    X_train = X[:n_train]
    X2_train = X2[:n_train]

    # train model using ridge regression on box-cox transformed values
    ttr = TransformedTargetRegressor(
        regressor=Ridge(alpha=10),
        transformer=PowerTransformer(method='box-cox'))
    ttr.fit(X_train, y)
    yp1 = ttr.predict(X)

    if s.name.left < 90:
        ttr.fit(X2_train, y)
        yp2 = ttr.predict(X2)
        # average predictions
        yp = yp1 * .9 + yp2 * .1
    else:
        yp = yp1

    index = range(s.index[0], self.pred_year + 1)
    sp = pd.Series(yp, index=index)
    return sp
def test_transform_target_regressor_pass_extra_predict_parameters():
    # Checks that predict kwargs are passed to the regressor.
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraPredictParams(),
        transformer=DummyTransformer()
    )

    regr.fit(X, y)
    regr.predict(X, check_input=False)
    assert regr.regressor_.predict_called
def test_transform_target_regressor_ensure_y_array():
    # check that the target ``y`` passed to the transformer will always be a
    # numpy array. Similarly, if ``X`` is passed as a list, we check that the
    # predictor receives it as is.
    X, y = friedman
    tt = TransformedTargetRegressor(transformer=DummyCheckerArrayTransformer(),
                                    regressor=DummyCheckerListRegressor(),
                                    check_inverse=False)
    tt.fit(X.tolist(), y.tolist())
    tt.predict(X.tolist())
    assert_raises(AssertionError, tt.fit, X, y.tolist())
    assert_raises(AssertionError, tt.predict, X)
def test_model_finder_search_and_fit_regression(model_finder_regression, mode,
                                                expected_model, expected_scores,
                                                seed):
    """Test that search_and_fit() correctly searches for, sets, and fits the
    chosen model (regression). Additionally checks that the model is correctly
    wrapped in TransformedTargetRegressor."""
    prediction_array = np.array([
        1.34, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1
    ]).reshape(1, -1)

    model_finder_regression._quicksearch_limit = 1
    model_finder_regression.scoring_functions = [mean_squared_error, r2_score]
    actual_model = model_finder_regression.search_and_fit(
        models=None, scoring=mean_squared_error, mode=mode)

    expected_model.random_state = seed
    t_X = model_finder_regression.X
    t_y = model_finder_regression.y
    m = TransformedTargetRegressor(
        regressor=expected_model,
        transformer=QuantileTransformer(output_distribution="normal",
                                        random_state=seed))
    m.fit(t_X, t_y)
    expected_array = m.predict(prediction_array)

    assert str(actual_model) == str(expected_model)
    assert str(model_finder_regression._chosen_model) == str(expected_model)
    assert model_finder_regression._chosen_model_params == expected_model.get_params()
    assert model_finder_regression._chosen_model_scores == expected_scores
    assert type(actual_model) == WrappedModelRegression
    assert str(actual_model.clf.regressor) == str(expected_model)
    assert np.array_equal(model_finder_regression.predict(prediction_array),
                          expected_array)
def run_experiment(
        exp_name,
        models,
        folds,
        train_seasons,
        test_seasons,
        X,
        y,
        preprocessor=None,
        # print_exp_progress=None,
        calculate_metrics_func=calculate_clf_metrics,
        algorithm_type='clf'):
    results = []
    names = []
    print("Running experiment", exp_name)
    for name, current_model in models:
        cv_results = defaultdict(list)
        for train_idx, test_idx in folds:
            X_train, X_test = X.loc[train_idx], X.loc[test_idx]
            # X_train, X_test = utils.scale_X(X, y, train_idx, test_idx)
            y_train, y_test = y.loc[train_idx], y.loc[test_idx]
            y_true = y_test
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('model', current_model)])
            if algorithm_type == 'reg':
                model = TransformedTargetRegressor(
                    regressor=pipeline, transformer=StandardScaler())
            else:
                model = pipeline
            fit_info = model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            fold_metric_results = calculate_metrics_func(y_true, y_pred)
            for key, value in fold_metric_results.items():
                cv_results[key].append(value)
        exp_result = {
            "exp_name": exp_name,
            "model": name,
            **agg_metrics(cv_results.keys(), cv_results)
        }
        if algorithm_type == 'reg':
            reg_exp_results.append(exp_result)
        else:
            exp_results.append(exp_result)
        cv_results["model"] = [name] * len(folds)
        cv_results["season_train"] = train_seasons
        cv_results["season_test"] = test_seasons
        results.append(cv_results)
        names.append(name)
    print("Done")
    return names, results
def predict_by_pos(pos, year):
    features_list = ['g', 'gs', 'mp_per_g', 'fg_per_g', 'fga_per_g', 'fg_pct',
                     'fg2_per_g', 'fg2a_per_g', 'fg2_pct', 'fg3_per_g',
                     'fg3a_per_g', 'fg3_pct', 'ft_per_g', 'fta_per_g', 'ft_pct',
                     'orb_per_g', 'drb_per_g', 'trb_per_g', 'ast_per_g',
                     'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g',
                     'pts_per_g', 'tenure', 'height', 'weight', 'sos', 'srs',
                     'ows', 'dws', 'ws', 'ts_pct', 'usg_pct', 'bpm', 'pprod']
    X = df[(df['pos'] == pos) & (df['is_final_year'])]
    X = X[features_list]
    X_imp = IterativeImputer(max_iter=10).fit_transform(X)
    X = pd.DataFrame(X_imp, index=X.index, columns=X.columns)
    df.loc[X.index, X.columns] = X
    X = df[(df['is_final_year']) & (df['pos'] == pos) &
           (df['mp_per_g'] > 15) & (df['g'] > 25)][features_list]
    #X['per'] = (1/X['mp_per_g']) * ((X['fg_per_g'] * 85.91) + (X['stl_per_g'] * 53.897) + (X['fg3_per_g'] * 51.757) + (X['ft_per_g'] * 46.845) + (X['blk_per_g'] * 39.19) + (X['orb_per_g'] * 39.19) + (X['ast_per_g'] * 34.677) + (X['drb_per_g'] * 14.707) - (X['pf_per_g'] * 17.174) - (X['fta_per_g'] - (X['ft_per_g'])*20.091) - ((X['fga_per_g'] - X['fg_per_g'])*39.19) - (X['tov_per_g']*53.897))
    X = (X - X.min()) / (X.max() - X.min())

    predicted_to_nba = pd.DataFrame()
    for yr in range(1996, 2020):
        a = predict_make_nba(yr, X)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
        predicted_to_nba = pd.concat([predicted_to_nba, a])

    ##################################################
    ## PER Regression ##
    # train algorithm on players not in given year
    clf1 = SGDRegressor(alpha=.01, penalty='elasticnet')
    features_list = X.columns.tolist()

    # create dataframe of NCAA players that made NBA
    df2 = predicted_to_nba
    X2 = transform_train_data(df2[features_list])
    y2 = df2[['mean_per']].loc[X2.index]
    to_drop = list(X2.columns[X2.var() < .1])
    to_drop += ['gs']
    X2.drop(to_drop, axis=1, inplace=True)
    X2 = (X2 - X2.mean()) / X2.std()
    X_new_pred = X2[df2.loc[X2.index]['year'] == year]
    X2 = X2[(df2.loc[X2.index]['year'] != year) &
            (df2.loc[X2.index]['year'] < 2018) &
            (df2.loc[X2.index]['year'] > 1995)]
    y2 = y2.loc[X2.index]
    y_new_pred = df2[['mean_per']].loc[X_new_pred.index]
    y_new_pred = (y_new_pred - y2.mean()) / y2.std()
    y2 = (y2 - y2.mean()) / y2.std()

    X2_train, X2_test, y2_train, y2_test = train_test_split(
        X2, y2, test_size=0.25, stratify=df2.loc[y2.index]['tier'])

    clf2 = TransformedTargetRegressor(clf1)
    clf2.fit(X2_train, y2_train)

    # predict per for players in given year
    X_new_pred = X_new_pred[X2.columns.tolist()]
    new_pred = clf2.predict(X_new_pred)
    new_pred_curr_year = pd.DataFrame(new_pred, index=X_new_pred.index).merge(
        df.iloc[:, :-8], left_index=True, right_index=True)
    return new_pred_curr_year
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from joblib import Memory
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import PowerTransformer

    # Find alpha range
    alphas = find_alpha_range(X, y, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []
    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        list_y_true.append(y_test)

        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True, random_state=0)
        tmpfolder = mkdtemp()
        memory = Memory(location=tmpfolder)
        # max_iter must be an int in recent scikit-learn versions
        pip = make_pipeline(VarianceThreshold(), PCA(),
                            Ridge(max_iter=int(1e6)), memory=memory)
        grid = GridSearchCV(pip, param_grid={'ridge__alpha': alphas},
                            cv=cv_inner, n_jobs=-1,
                            scoring="neg_mean_squared_error")
        regr_trans = TransformedTargetRegressor(
            regressor=grid, transformer=PowerTransformer(method=transform))
        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)

        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)

        memory.clear(warn=False)
        shutil.rmtree(tmpfolder)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)
    return y_pred, y_true, list_models
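# Usage note (an addition, not from the original source): PowerTransformer's
# 'box-cox' method requires strictly positive targets, while 'yeo-johnson'
# also handles zeros and negatives. A call might look like the sketch below,
# with `X` and `y` numpy arrays and the helpers above in scope:
#
#     y_pred, y_true, models = run_transform(X, y, transform="yeo-johnson")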
def main():
    # Generate random data : create random points, and keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)

    # Create bagging and random forest models.
    fig, axes = plt.subplots(2, 2, figsize=(20, 10))
    models = [AdaBoostRegressor(n_estimators=5, base_estimator=KNeighborsRegressor()),
              AdaBoostRegressor(n_estimators=5, base_estimator=SVR()),
              AdaBoostRegressor(n_estimators=5, base_estimator=KernelRidge(kernel='rbf')),
              GradientBoostingRegressor()]
    for axis, model in zip(axes.ravel(), models):
        # Set title.
        title = model.__class__.__name__
        reg_params = model.get_params()
        if 'base_estimator' in reg_params:  # GradientBoostingRegressor has no 'base_estimator'.
            title += ', estimator: ' + reg_params['base_estimator'].__class__.__name__
        axis.set_title(title)

        # Plot random data.
        axis.plot(x, y, 'o', color='black', markersize=2, label='random data')

        # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
        x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline([('scale', preprocessing.StandardScaler()),
                         ('model', model)])  # Data scaling applied before / after any operator applied to the model.
        y_transformer = preprocessing.MinMaxScaler().fit(y.reshape(-1, 1))
        treg = TransformedTargetRegressor(
            regressor=pipe,
            transformer=y_transformer)  # Target scaling applied before / after any operator applied to the model.

        # Train model.
        treg.fit(x_augmented, y)

        # Plot intermediate regression estimations.
        if isinstance(model, AdaBoostRegressor):
            for i, tree in enumerate(treg.regressor_['model'].estimators_):
                x_augmented_scaled = treg.regressor_['scale'].transform(x_augmented)  # x input after scaling (as tree does not use Pipeline).
                y_hat = tree.predict(x_augmented_scaled)  # y outcome before scaling (as tree does not use TransformedTargetRegressor).
                y_pred = y_transformer.inverse_transform(y_hat.reshape(-1, 1))
                axis.plot(x, y_pred, '--', label='tree ' + str(i))
                axis.axis('off')
                axis.legend()

        # Plot final regression.
        axis.plot(x, treg.predict(x_augmented), '-', color='black',
                  label=model.__class__.__name__)
        axis.axis('off')
        axis.legend()
    plt.show()
class SemiSup_RandomizedSearchCV(BaseEstimator):
    def __init__(self, estimator, param_distributions, n_iter=100, cv=5,
                 scoring=metrics.accuracy_score, pseudo=True):
        # We initialize our class similar to sklearn randomized search
        self.estimator = estimator
        self.scoring = scoring
        self.pseudo = pseudo
        self.transformedtargetestimator = TransformedTargetRegressor(
            regressor=estimator,
            func=lambda x: x if np.random.rand() > 1 / cv else -1,
            inverse_func=lambda x: x,
            check_inverse=False)
        self.sampler = ParameterSampler(param_distributions, n_iter)
        self.cv_results_ = pd.DataFrame({'mean_test_score': np.empty(shape=[0]),
                                         'std_test_score': np.empty(shape=[0]),
                                         'mean_score_time': np.empty(shape=[0]),
                                         'std_score_time': np.empty(shape=[0]),
                                         'params': None})
        self.folds = KFold(n_splits=cv)

    def fit(self, X, y, sample_weight=None):
        for params in self.sampler:
            # Update parameters
            self.estimator.set_params(**params)
            # Reset scores
            scores = []
            times = []
            for train_index, test_index in self.folds.split(X):
                # Create semi-supervised sampler
                self.transformedtargetestimator = TransformedTargetRegressor(
                    regressor=self.estimator,
                    func=lambda x: np.where(np.in1d(x.index, train_index), x, -1),
                    inverse_func=lambda x: x,
                    check_inverse=False)
                # Fit
                if self.pseudo:
                    self.transformedtargetestimator.regressor.pseudo_fit = \
                        pseudo_fit.__get__(self.transformedtargetestimator.regressor)
                    self.transformedtargetestimator = \
                        self.transformedtargetestimator.regressor.pseudo_fit(
                            X, self.transformedtargetestimator.func(y))
                else:
                    self.transformedtargetestimator.fit(X, y, sample_weight)
                # Score
                score_index = np.in1d(y.index, test_index)
                start = time()
                scores.append(self.scoring(
                    y[score_index],
                    self.transformedtargetestimator.predict(X=X[score_index])))
                times.append(time() - start)
            # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
            self.cv_results_ = pd.concat([
                self.cv_results_,
                pd.DataFrame({'mean_test_score': np.mean(scores),
                              'std_test_score': np.std(scores),
                              'mean_score_time': np.mean(times),
                              'std_score_time': np.std(times),
                              'params': [params]})
            ])
        self.cv_results_ = self.cv_results_.sort_values(
            'mean_test_score', ascending=False).reset_index(drop=True)
        return self
class Regressor(BaseEstimator):
    def __init__(self):
        self.MYReg = TransformedTargetRegressor(
            regressor=RandomForestRegressor(n_estimators=30, max_depth=12),
            func=lambda u: np.log10(np.clip(u, a_min=1, a_max=None)),
            inverse_func=lambda u: np.power(10, u),
            check_inverse=False,
        )

    def fit(self, X, y):
        return self.MYReg.fit(X, y)

    def predict(self, X):
        return self.MYReg.predict(X)
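# A minimal usage sketch for the wrapper above; the toy data is an
# illustrative assumption, not part of the original pipeline (it assumes
# numpy and the imports used by the class are already in scope).
if __name__ == "__main__":
    import numpy as np

    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 4)
    y_demo = np.power(10, 1 + 2 * X_demo[:, 0])  # positive, log-friendly target

    reg = Regressor()
    reg.fit(X_demo, y_demo)
    print(reg.predict(X_demo[:5]))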
class TransformedTargetRegressorImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
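# A sketch of how the wrapper might be exercised. `Op` is bound elsewhere in
# the original module; the binding below to scikit-learn's
# TransformedTargetRegressor is an assumption for illustration only.
if __name__ == "__main__":
    import numpy as np
    from sklearn.compose import TransformedTargetRegressor as Op  # assumed binding

    X_demo = np.arange(20, dtype=float).reshape(-1, 1)
    y_demo = np.expm1(X_demo.ravel() / 10)  # positive target

    impl = TransformedTargetRegressorImpl(func=np.log1p, inverse_func=np.expm1)
    print(impl.fit(X_demo, y_demo).predict(X_demo[:3]))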
def rf_prediction(self):
    """
    uses ensemble (Random Forest) method to predict crab age
    :return:
    """
    logger.info("running Random Forest model")
    X = self.crab_data.drop("age", axis=1)
    y = self.crab_data[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    numerical_features = X_train.dtypes == 'float'
    categorical_features = ~numerical_features

    # I used pipelining so that the predicted values were automatically transformed/scaled back
    preprocess = make_column_transformer(
        (RobustScaler(), numerical_features),
        (OneHotEncoder(sparse=False), categorical_features))
    forest = RandomForestRegressor(n_estimators=5000,
                                   max_depth=20,
                                   min_samples_leaf=2,
                                   min_samples_split=4,
                                   random_state=100)
    f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
    f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
    f_reg_ttr.fit(X_train, y_train)
    s = f_reg_ttr.score(X_test, y_test)
    logger.info("R-squared from Random Forest is: {0}".format(s))
    y_pred = f_reg_ttr.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("RandomForest MAE: {0}".format(mae))
    logger.debug("RandomForest RMSE: {0}".format(rmse))
    logger.debug("RandomForest R-squared: {0}".format(s))

    # recreate the original dataset
    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_forest"] = pd.Series(y_pred.ravel())
    crab_df["percentage_difference"] = np.abs(
        np.divide((crab_df["age"] - crab_df["age_forest"]),
                  crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_forest.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_forest.csv"))
    logger.info("Random Forest execution finished")
def ols_prediction(self):
    """
    uses linear regression after standardising to normal dist
    prints out accuracy metrics and then saves the design matrix with y and
    predicted y as a csv file
    also creates another column to calculate relative percentage difference
    between y and predicted y
    :return:
    """
    logger.info("running Linear Regression model")
    crab_df_woo = self.pre_process_data()
    # since I observed that the data was skewed, I decided to transform the
    # continuous variables to normal dist
    transformer = QuantileTransformer(output_distribution='normal')
    reg = linear_model.LinearRegression()
    t_reg = TransformedTargetRegressor(regressor=reg, transformer=transformer)
    ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True,
                           drop_invariant=True)
    crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
    X = crab_df_woo_enc.drop("age", axis=1)
    y = crab_df_woo_enc[["age"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=100)
    t_reg.fit(X_train, y_train)
    s = t_reg.score(X_test, y_test)
    logger.info("R-squared from Linear Regression is: {0}".format(s))
    y_pred = t_reg.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    logger.debug("Linear Regression MAE: {0}".format(mae))
    logger.debug("Linear Regression RMSE: {0}".format(rmse))
    logger.debug("Linear Regression R-squared: {0}".format(s))

    crab_df = X.copy()
    crab_df["age"] = pd.Series(y.values.ravel())
    crab_df["age_ols"] = pd.Series(y_pred.ravel())
    crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row), axis=1)
    crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
    crab_df["percentage_difference"] = np.abs(
        np.divide((crab_df["age"] - crab_df["age_ols"]),
                  crab_df["age"]) * 100)
    crab_df.to_csv("crab_predit_ols.csv", index=False)
    logger.info("Crab data with predicted variables saved: {0}".format(
        "crab_predit_ols.csv"))
    logger.info("Linear Regression execution finished")
def create_scatter_df(profile, threshold):
    """
    INPUT:
    profile: profile dataframe
    threshold: threshold to remove data that can be identified as outliers

    DESCRIPTION:
    Function to remove outliers from the profile dataset and predict the
    spendings of the customers.

    OUTPUT:
    result: DataFrame with predictions and actual values
    mse: the mean squared error of the predictions
    """
    scaler = MinMaxScaler()
    prediction_df = profile[["age", "income", "memberdays",
                             "gender_F", "gender_M", "overall_spendings"]]
    prediction_df[["age", "income", "memberdays"]] = scaler.fit_transform(
        prediction_df[["age", "income", "memberdays"]])
    prediction_df = prediction_df[prediction_df["overall_spendings"] < threshold]

    X = prediction_df.drop("overall_spendings", axis=1)
    y = prediction_df["overall_spendings"]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=42)

    regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                            func=np.log1p,
                                            inverse_func=np.expm1)
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)

    y_test.reset_index(drop=True, inplace=True)
    result = pd.concat([y_test, pd.Series(y_pred)], axis=1)
    result.rename(columns={0: "prediction",
                           "overall_spendings": "actual_value"},
                  inplace=True)
    mse = mean_squared_error(y_test, y_pred)
    return result, mse
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Lasso
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import PowerTransformer

    y_trans = PowerTransformer(method=transform).fit_transform(
        y[:, None]).flatten()
    alphas = find_alpha_range(X, y_trans, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []
    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        list_y_true.append(y_test)

        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True, random_state=0)
        lasso_pcr = LassoPCR(scale=False, cv=cv_inner, n_jobs=-1,
                             alphas=alphas, lasso_kws={'max_iter': 1e6},
                             scoring="neg_mean_squared_error")
        regr_trans = TransformedTargetRegressor(
            regressor=lasso_pcr,
            transformer=PowerTransformer(method=transform))
        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)

        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)
    return y_pred, y_true, list_models
def run():
    # -- LOAD FILE -- #
    filename = "Data/JabRef_train.csv"
    df = pd.read_csv(filename, dtype=object)
    df.dropna(inplace=True)

    # - Set target - #
    y = df["set_clicked"]

    # --- Label Encoder --- #
    encoder = preprocessing.LabelEncoder()
    df = df.apply(encoder.fit_transform)

    # -- Get relevant columns -- #
    cor = df.corr()
    cor_target = abs(cor["set_clicked"])
    relevant_features = cor_target[cor_target > 0.01]
    print(relevant_features.index)

    # -- Normal Distribution, Reduce impact of outliers -- #
    transformer = QuantileTransformer(output_distribution="normal")
    X = df[relevant_features.index]
    X = X.drop(["set_clicked"], axis=1)

    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.4,
                                                        random_state=0)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    print(df)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

    y = y_test.to_numpy(dtype=int)
    # f1_score expects class labels, so threshold the continuous predictions
    print(f"F1 Score: {f1_score(y, (y_pred > 0.5).astype(int))}")

    df[["Actual", "Predicted"]].to_csv("david_results.csv")
def main():
    # Generate random data : create random points, and keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)

    # Plot random data.
    plt.plot(x, y, 'o', color='black', markersize=2, label='random data')

    # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
    x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T

    # Polynomial regression : regression on augmented data.
    regrs = []
    regrs.append((linear_model.LinearRegression(), 'polynomial reg'))
    regrs.append((neighbors.KNeighborsRegressor(15), '15-NN reg'))
    for regr in regrs:
        model, lbl = regr[0], regr[1]

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline(
            [('scale', preprocessing.StandardScaler()), ('model', model)]
        )  # Data scaling applied before / after any operator applied to the model.
        treg = TransformedTargetRegressor(
            regressor=pipe, transformer=preprocessing.MinMaxScaler()
        )  # Target scaling applied before / after any operator applied to the model.

        # Train model.
        treg.fit(x_augmented, y)

        # Plot regression.
        plt.plot(x_augmented[:, 0], treg.predict(x_augmented), '-', label=lbl)
    plt.axis('off')
    plt.legend()
    plt.show()
def test_model_finder_predict_X_test_regression(model_finder_regression_fitted,
                                                split_dataset_numerical,
                                                limit, seed):
    """Testing if predictions of the X_test split from found models are
    correct (in regression)."""
    models = [
        SVR(**{"C": 0.1, "tol": 1.0}),
        Ridge(**{"alpha": 0.0001, "random_state": seed}),
        DecisionTreeRegressor(**{"max_depth": 10, "criterion": "mae",
                                 "random_state": seed}),
    ]
    results = []
    X_train, X_test, y_train, y_test = split_dataset_numerical
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=seed)
    for model in models:
        new_model = TransformedTargetRegressor(regressor=model,
                                               transformer=transformer)
        new_model.fit(X_train, y_train)
        results.append((model, new_model.predict(X_test)))

    expected_results = results[:limit]
    actual_results = model_finder_regression_fitted.predictions_X_test(limit)

    for actual_result, expected_result in zip(actual_results, expected_results):
        assert str(actual_result[0]) == str(expected_result[0])
        assert np.array_equal(actual_result[1], expected_result[1])
# On this plot, we see that for large true price values, our model tends to
# under-estimate the price of the house. Typically, this issue arises when
# the target to predict does not follow a normal distribution. In these cases
# the model would benefit from target transformation.

# %%
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

model_transformed_target = TransformedTargetRegressor(
    regressor=model,
    transformer=QuantileTransformer(
        n_quantiles=900, output_distribution="normal"
    ),
)
model_transformed_target.fit(X_train, y_train)
y_pred = model_transformed_target.predict(X_test)

plot_predicted_vs_actual(y_test, y_pred, title="House prices in Ames")

# %% [markdown]
# Thus, once we transformed the target, we see that we corrected some of the
# high values.
#
# ## Summary
# In this notebook, we presented the metrics and plots useful to evaluate and
# get insights about models. We focused on both regression and classification
# problems.

# %%
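# %% [markdown]
# A quick inspection step one could add here (a sketch, not part of the
# original notebook): after fitting, ``TransformedTargetRegressor`` exposes
# its fitted components as the ``transformer_`` and ``regressor_`` attributes.

# %%
print(model_transformed_target.transformer_)
print(model_transformed_target.regressor_)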
        plt.show()
    elif len(keys) == 2:
        g = sns.FacetGrid(cv_data, col=keys[0])
        g.map(plt.plot, keys[1], "mean_test_score", marker="o")
        [plt.setp(ax.texts, text="") for ax in g.axes.flat]
        g.set_titles(row_template='{row_name}', col_template='{col_name}')
        plt.subplots_adjust(top=0.8)
        g.fig.suptitle('Validation Scores')
        plt.show()
    plt.close()

    ### Final evaluation parameters on train, test and val sets
    # Predicted
    Y_hat_test = full_pipeline.predict(X_test).flatten()
    Y_hat_train = full_pipeline.predict(X_train).flatten()
    # Actual
    Y_vals_test = np.array(Y_test).flatten()
    Y_vals_train = np.array(Y_train).flatten()
    # RMSE
    rmse_test = mean_squared_error(Y_vals_test, Y_hat_test, squared=False)
    rmse_train = mean_squared_error(Y_vals_train, Y_hat_train, squared=False)
    # MAE
    mae_test = mean_absolute_error(Y_vals_test, Y_hat_test)
    mae_train = mean_absolute_error(Y_vals_train, Y_hat_train)
    # R^2
    rsquared_test = r2_score(Y_vals_test, Y_hat_test)
    rsquared_train = r2_score(Y_vals_train, Y_hat_train)
class EMCEB(TadpoleModel):
    """EMC-EB method, Esther Bron - [email protected]

    The `train_df*` attributes contain training data optimized for each
    variable. The `y_train_df*` attributes contain the labels to be used for
    training by each model, thus corresponding to the matching `train_df`
    DataFrame.

    Attributes:
        diagnosis_model (Pipeline): Model for predicting 'diagnosis' variable
        adas_model (Pipeline): Model for predicting 'ADAS13' variable
        ventricles_model (Pipeline): Model for predicting 'ventricles' variable
        y_diagnosis (pandas.DataFrame): 'Diagnosis' labels
        train_df_diagnosis (pandas.DataFrame): Training data used for 'diagnosis' model.
    """

    def __init__(self, confidence_intervals=True):
        # Note to self, to get parameters out: model.diagnosis_model.named_steps['scaler'].mean_
        self.diagnosis_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', svm.SVC(kernel='rbf', C=0.5, gamma='auto',
                                   class_weight='balanced', probability=True)),
        ])
        adas_pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', svm.SVR(kernel='rbf', C=0.5, gamma='auto'))])
        self.adas_model = TransformedTargetRegressor(
            regressor=adas_pipeline, transformer=StandardScaler())
        ventricles_pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', svm.SVR(kernel='rbf', C=0.5, gamma='auto'))])
        self.ventricles_model = TransformedTargetRegressor(
            regressor=ventricles_pipeline, transformer=StandardScaler())

        self.y_diagnosis = None
        self.y_adas = None
        self.y_ventricles = None

        self.train_df_diagnosis = None
        self.train_df_adas = None
        self.train_df_ventricles = None

        self.confidence_intervals = confidence_intervals

        self.train_df_processed = None
        self.test_df_processed = None

    @staticmethod
    def preprocess(df: pd.DataFrame, is_train_df: bool):
        if not is_train_df:
            # select last row per RID
            df = df.sort_values(by=['EXAMDATE'])
            df = df.groupby('RID').tail(1)
            exam_dates = df['EXAMDATE']

        logger.info("Pre-processing")
        df = df.copy()
        if 'Diagnosis' not in df.columns:
            """We want to transform 'DXCHANGE' (a change in diagnosis, in
            contrast to the previous visit's diagnosis) to an actual
            diagnosis."""
            df = df.replace({'DXCHANGE': {4: 2, 5: 3, 6: 3, 7: 1, 8: 2, 9: 1}})
            df = df.rename(columns={"DXCHANGE": "Diagnosis"})

        # Add months to age
        if 'Month_bl' in df.columns:
            df['AGE'] += df['Month_bl'] / 12.

        # Remove feature categories based on prior knowledge
        # If Month_bl is in the dataframe, then it is data set D1D2, not D3
        h = list(df)
        if 'Month_bl' in df.columns:
            remove_columns = h[1:8] + [h[9]] + h[14:17] + h[45:47] + \
                h[53:73] + h[74:486] + h[832:838] + h[1172:1174] + \
                h[1657:1667] + h[1895:1902] + h[1905:]
            df: pd.DataFrame = df.drop(remove_columns, axis=1)
        else:
            remove_columns = [h[1]] + h[7:11] + h[20:37]
            df: pd.DataFrame = df.drop(remove_columns, axis=1)

        h = list(df)
        logger.info('Forcing Numeric Values')
        for i in range(5, len(h)):
            if df[h[i]].dtype != 'float64':
                df[h[i]] = pd.to_numeric(df[h[i]], errors='coerce')

        """Sort the DataFrame per patient on age (at time of visit). This
        allows using observations from the next row/visit to be used as a
        label for the previous row. (See `set_futures` method.)"""
        df = df.sort_values(by=['RID', 'AGE'])

        if 'APOE4' in df.columns:
            df = df.drop(['EXAMDATE', 'PTGENDER', 'PTEDUCAT', 'APOE4'], axis=1)
        else:
            df = df.drop(['EXAMDATE', 'PTGENDER', 'PTEDUCAT'], axis=1)

        # Ventricles_ICV = Ventricles / ICV_bl.
        # So make sure ICV_bl is not zero to avoid division by zero.
        icv_bl_median = df['ICV_bl'].median()
        df.loc[df['ICV_bl'] == 0, 'ICV_bl'] = icv_bl_median
        if 'Ventricles_ICV' not in df.columns:
            df["Ventricles_ICV"] = df["Ventricles"].values / df["ICV_bl"].values

        if not is_train_df:
            return df, exam_dates
        else:
            return df

    def set_data(self, train_df, test_df, train, test):
        train_df = self.preprocess(train_df, True)
        test_df, exam_dates = self.preprocess(test_df, False)

        if test == 'd1d2':
            """Select features based on EMCEB_features.csv file"""
            # Drop columns found unimportant by feature importance ranking measure.
            selected_features = pd.read_csv(
                Path(__file__).parent / 'EMCEB_features.csv'
            )['feature'].values.tolist()
            selected_features = selected_features[0:200]
            selected_features += ['RID', 'Diagnosis', 'Ventricles_ICV', 'AGE']
            selected_features = set(selected_features)
            train_df = train_df.copy()[selected_features]
            test_df = test_df.copy()[selected_features]

        if test == 'd3':
            test_df_copy = test_df.copy()
            percentage = .50
            idx_fewmissing = pd.isnull(test_df).select_dtypes(
                include=['bool']).sum(axis=0) < percentage * test_df.shape[0]
            test_df = test_df.loc[:, idx_fewmissing].copy()
            test_df['RID'] = test_df_copy['RID']
            test_df['Diagnosis'] = test_df_copy['Diagnosis']
            test_df['Ventricles_ICV'] = test_df_copy['Ventricles_ICV']
            train_df = train_df[test_df.columns]

        # Fill nans by older values
        train_df = EMCEB.fill_nans_by_older_values(train_df)
        if (train == 'd1d2') & (test == 'd1d2'):
            test_df_copy = test_df.copy()
            # get test set again from filled train set
            test_df = train_df.groupby('RID').tail(1).copy()
            # select all records where RID is in d4.
            test_df = test_df[test_df['RID'].isin(test_df_copy['RID'].unique())]
        else:
            test_df = EMCEB.fill_nans_by_older_values(test_df)

        self.train_df_processed = train_df
        self.test_df_processed = test_df
        self.exam_dates = exam_dates

    @staticmethod
    def set_futures(train_df,
                    features=['RID', 'Diagnosis', 'ADAS13',
                              'Ventricles_ICV', 'AGE']):
        """For each feature in the `features` argument, generate a
        `Future_{feature}` column that is filled using the next row for each
        patient."""
        futures_df = train_df[features].copy()

        # Set future value based on each row's next row, i.e. shift the column one up
        for predictor in ["Diagnosis", "ADAS13"]:
            futures_df["Future_" + predictor] = futures_df[predictor].shift(-1)

        # For Ventricles we predict the change per month rather than the future value
        for predictor in ['Ventricles_ICV']:
            futures_df["Future_" + predictor] = futures_df[predictor].shift(-1)
            Change_Ventricles_ICV = futures_df[predictor].shift(-1) - futures_df[predictor]
            Change_Age = futures_df['AGE'].shift(-1) - futures_df['AGE']
            Change_Age[Change_Age == 0] = np.nan
            futures_df["ChangePerMonth_" + predictor] = \
                Change_Ventricles_ICV / Change_Age / 12

        # Drop each last row per patient
        futures_df = futures_df.drop(
            futures_df.groupby('RID').tail(1).index.values)
        return futures_df

    @staticmethod
    def fill_nans_by_older_values(train_df):
        """Fill nans in the feature matrix by older values (ffill), then by
        newer (bfill)."""
        df_filled_nans = train_df.groupby('RID').fillna(method='ffill')
        train_df[df_filled_nans.columns] = df_filled_nans
        df_filled_nans = train_df.groupby('RID').fillna(method='bfill')
        train_df[df_filled_nans.columns] = df_filled_nans
        return train_df

    def train(self):
        assert self.train_df_processed is not None, \
            "Data is not yet set. Use set_data to set data first"

        train_df = self.train_df_processed
        futures = self.set_futures(train_df)

        # Not part of `preprocess` because it's needed for the futures.
        train_df = train_df.drop(['RID', 'AGE'], axis=1)

        # Fill nans by mean of training set
        self.train_df_mean = train_df.mean()
        train_df = train_df.fillna(self.train_df_mean)
        # Fill left-over nans with 0
        train_df = train_df.fillna(0)

        def non_nan_y(_train_df, _y_df):
            """Drops all rows with a `y` value that is NaN

            Returns:
                Tuple containing (`train_df`, `y_df`), without NaNs for `y_df`.
            """
            # indices where the y value is not nan
            not_nan_idx = _y_df[_y_df.notna()].index
            # return from both the train dataframe and y the records with these indices
            return _train_df.loc[not_nan_idx], _y_df[not_nan_idx]

        self.train_df_diagnosis, self.y_diagnosis = non_nan_y(
            train_df, futures['Future_Diagnosis'])
        self.train_df_adas, self.y_adas = non_nan_y(
            train_df, futures['Future_ADAS13'])
        self.train_df_ventricles, self.y_ventricles = non_nan_y(
            train_df, futures['ChangePerMonth_Ventricles_ICV'])

        logger.info("Training models")
        self.diagnosis_model.fit(self.train_df_diagnosis, self.y_diagnosis)
        self.adas_model.fit(self.train_df_adas, self.y_adas)
        self.ventricles_model.fit(self.train_df_ventricles, self.y_ventricles)

    def predict(self):
        assert self.test_df_processed is not None, \
            "Data is not yet set. Use set_data to set data first"

        logger.info("Predicting")
        # test_df = self.preprocess(test_series.to_frame().T)
        test_df = self.test_df_processed
        rids = test_df['RID']
        test_df = test_df.drop(['RID', 'AGE'], axis=1)

        # Fill nans by mean of training set
        test_df = test_df.fillna(self.train_df_mean)
        test_df = test_df.fillna(0)

        diag_probas = self.diagnosis_model.predict_proba(test_df)
        adas_prediction = self.adas_model.predict(test_df)
        ventricles_change_prediction = self.ventricles_model.predict(test_df)

        if self.confidence_intervals:
            logger.info("Bootstrap adas")
            adas_ci = bootstrap(self.adas_model, self.train_df_adas,
                                self.y_adas, test_df)
            logger.info("Bootstrap ventricles")
            ventricles_ci = bootstrap(self.ventricles_model,
                                      self.train_df_ventricles,
                                      self.y_ventricles, test_df)
        else:
            adas_ci = ventricles_ci = 0

        def add_months_to_str_date(strdate, months=1):
            return (datetime.strptime(strdate, '%Y-%m-%d') +
                    relativedelta(months=months)).strftime('%Y-%m-%d')

        df = pd.DataFrame.from_dict({
            'RID': rids,
            'month': 1,
            'Forecast Date': list(map(lambda x: add_months_to_str_date(x, 1),
                                      self.exam_dates.tolist())),
            'CN relative probability': diag_probas.T[0],
            'MCI relative probability': diag_probas.T[1],
            'AD relative probability': diag_probas.T[2],
            'ADAS13': adas_prediction,
            'ADAS13 50% CI lower': adas_prediction - adas_ci,
            'ADAS13 50% CI upper': adas_prediction + adas_ci,
            'Ventricles_ICV':
                test_df['Ventricles_ICV'] + ventricles_change_prediction,
            'Ventricles_ICV 50% CI lower':
                test_df['Ventricles_ICV'] + ventricles_change_prediction - ventricles_ci,
            'Ventricles_ICV 50% CI upper':
                test_df['Ventricles_ICV'] + ventricles_change_prediction + ventricles_ci,
        })

        # copy each row for each month
        new_df = df.copy()
        for i in range(2, 12 * 10):
            df_copy = df.copy()
            df_copy['month'] = i
            df_copy['Forecast Date'] = df_copy['Forecast Date'].map(
                lambda x: add_months_to_str_date(x, i - 1))
            df_copy['Ventricles_ICV'] = test_df['Ventricles_ICV'] + \
                ventricles_change_prediction * df_copy['month']
            df_copy['Ventricles_ICV 50% CI lower'] = test_df['Ventricles_ICV'] + \
                (ventricles_change_prediction - ventricles_ci) * df_copy['month']
            df_copy['Ventricles_ICV 50% CI upper'] = test_df['Ventricles_ICV'] + \
                (ventricles_change_prediction + ventricles_ci) * df_copy['month']
            # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
            new_df = pd.concat([new_df, df_copy])

        return new_df
    RFE(estimator=svm.SVR(kernel='linear', C=1, cache_size=500),
        step=0.33, n_features_to_select=100),
    svm.SVR(kernel='linear', C=1, cache_size=500)
)
regression = TransformedTargetRegressor(regressor=regression,
                                        transformer=StandardScaler())

print('Computing 5-fold cross validation')
cross_val_scores = cross_val_score(regression, X_train, y_train, cv=5,
                                   verbose=1,
                                   scoring='neg_mean_absolute_error')
print(cross_val_scores)

print('Fitting model to training data')
regression.fit(X_train, y_train)

print('Running model on test data')
predicted = regression.predict(X_test)
print('SVM model predictions: {}'.format(predicted))
print('SVM model MAE on test data: {}'.format(
    mean_absolute_error(y_test, predicted)))
avg_test_prediction = sum(predicted) / len(predicted)
print('SVM model average predicted accuracy: {}'.format(avg_test_prediction))
plot_test_data(predicted, y_test)

print('\nRunning model on challenge data sets\n')
challenge_predictions = []
aave_file = os.path.join(THIS_FOLDER, AAVE_DATA)
twitter_aave_data = read_data(aave_file, 5, tab_separated=True)
print('Read {} AAVE tweets'.format(len(twitter_aave_data)))
# On this plot, we see that for large true price values, our model tends to
# under-estimate the price of the house. Typically, this issue arises when the
# target to predict does not follow a normal distribution. In this case the
# model would benefit from target transformation.

# %%
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

transformer = QuantileTransformer(n_quantiles=900,
                                  output_distribution="normal")
model_transformed_target = TransformedTargetRegressor(regressor=regressor,
                                                      transformer=transformer)
model_transformed_target.fit(data_train, target_train)
target_predicted = model_transformed_target.predict(data_test)

# %%
predicted_actual = {
    "True values (k$)": target_test,
    "Predicted values (k$)": target_predicted,
}
predicted_actual = pd.DataFrame(predicted_actual)

# %%
sns.scatterplot(data=predicted_actual,
                x="True values (k$)", y="Predicted values (k$)")
plt.axline((0, 0), slope=1, color="tab:orange", label="Perfect fit")
plt.axis('square')
plt.legend()
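# %% [markdown]
# To quantify the improvement rather than eyeball it, one could compare the
# median absolute error of both models (a sketch; it assumes the plain
# ``regressor`` was fitted on the same split earlier in the notebook).

# %%
from sklearn.metrics import median_absolute_error

print(f"MedAE with target transformation: "
      f"{median_absolute_error(target_test, target_predicted):.3f}")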
def main():
    # Read raw data.
    # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality
    raw_data = pd.read_csv('winequality-white.csv', sep=';')
    print('raw_data :\n', raw_data.head())

    # Extract data from dataset.
    x = raw_data[raw_data.columns[:-1]].values  # Dataset: variables.
    y = raw_data['quality'].values  # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])

    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Change the hyperparameters of the model to find the best one, compare different models (with/without regularization).
    models = []
    models.append((kernel_ridge.KernelRidge(kernel='rbf'), 'reg ridge rbf'))  # We use a gaussian kernel: 'rbf' radial basis function.
    for idx_model, model_lbl in enumerate(models):
        model, lbl = model_lbl[0], model_lbl[1]

        # Train a model.
        best_rmse, best_g, best_a = float('inf'), 0, 0
        worst_rmse, worst_g, worst_a = 0, 0, 0
        all_g, all_a, all_rmse = [], [], []
        for g in np.logspace(-2, 2, 6):  # g coefficient between 10^-2 and 10^2.
            # Set model parameter.
            model.set_params(gamma=g)
            for a in np.logspace(-2, 2, 6):  # a coefficient between 10^-2 and 10^2.
                # Set model parameter.
                model.set_params(alpha=a)

                # Scale data to reduce weights.
                # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
                # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
                pipe = Pipeline(
                    [('scale', preprocessing.StandardScaler()), ('model', model)]
                )  # Data scaling applied before / after any operator applied to the model.
                treg = TransformedTargetRegressor(
                    regressor=pipe, transformer=preprocessing.MinMaxScaler()
                )  # Target scaling applied before / after any operator applied to the model.

                # Feed the model.
                treg.fit(x_train, y_train)

                # Get predictions.
                y_prob = treg.predict(x_test)

                # Compute root mean square error.
                rmse = np.sqrt(mean_squared_error(y_test, y_prob))

                # Save best and worst models.
                all_g.append(g)
                all_a.append(a)
                all_rmse.append(rmse)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_g = g
                    best_a = a
                if rmse > worst_rmse:
                    worst_rmse = rmse
                    worst_g = g
                    worst_a = a

        # Plot RMSE as a function of gamma and alpha.
        axis = plt.subplot(1, 2, idx_model + 1, projection='3d')
        axis.set_xlabel('gamma')
        axis.set_ylabel('alpha')
        axis.set_zlabel('rms error')
        axis.scatter3D(all_g, all_a, all_rmse)

        # Get the best and worst model.
        axis = plt.subplot(1, 2, idx_model + 2)
        for g, a in zip([best_g, worst_g], [best_a, worst_a]):
            model.set_params(gamma=g)
            model.set_params(alpha=a)

            # Scale data to reduce weights.
            # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
            # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
            pipe = Pipeline(
                [('scale', preprocessing.StandardScaler()), ('model', model)]
            )  # Data scaling applied before / after any operator applied to the model.
            treg = TransformedTargetRegressor(
                regressor=pipe, transformer=preprocessing.MinMaxScaler()
            )  # Target scaling applied before / after any operator applied to the model.

            # Feed the model.
            treg.fit(x_train, y_train)

            # Get predictions.
            y_prob = treg.predict(x_test)

            # Compute root mean square error.
            rmse = np.sqrt(mean_squared_error(y_test, y_prob))

            # Plot true versus predicted score (marker size = number of pairs true/predicted = the bigger, the better).
            sizes = {}
            for (yt, yp) in zip(list(y_test), list(y_prob)):
                if (yt, yp) in sizes.keys():
                    sizes[(yt, yp)] += 1
                else:
                    sizes[(yt, yp)] = 1
            keys = sizes.keys()
            axis.scatter(
                [k[0] for k in keys],
                [k[1] for k in keys],
                s=[sizes[k] for k in keys],  # marker size = number of pairs (true, predicted) = the bigger, the better.
                label='alpha %08.3f - gamma %08.3f - RMSE = %0.5f' % (a, g, rmse))
            axis.set_xlabel('True score')
            axis.set_ylabel('Predicted score')
            axis.set_title('best kernel Ridge Regression')
            axis.legend()

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                        wspace=0.3, hspace=0.3)
    plt.show()
def main():
    # Read raw data.
    # https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/Parcours_data_scientist/entrainez-un-modele-predictif-lineaire/TP_1_prostate_dataset.txt
    # https://rafalab.github.io/pages/649/prostate.html
    raw_data = pd.read_csv('prostate_dataset.txt', delimiter='\t')
    print('raw_data :\n', raw_data.head())

    # Extract data from dataset.
    x = raw_data[raw_data.columns[1:-3]].values  # Dataset: variables.
    y = raw_data['lpsa'].values  # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])

    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Change the hyperparameter alpha of the model to find the best one, compare different models (with/without regularization).
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4022441-entrainez-votre-premier-k-nn
    # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507806-reduisez-le-nombre-de-variables-utilisees-par-votre-modele
    n_alphas = 100
    alphas = np.logspace(-5, 5, n_alphas)  # alphas between 10^-5 and 10^5.
    models = []
    models.append((linear_model.LinearRegression(), 'linear reg'))  # Baseline to compare to.
    models.append((linear_model.Ridge(), 'ridge'))  # Compared to LinearRegression: Ridge reduces weights.
    models.append((linear_model.Lasso(fit_intercept=False), 'lasso'))  # Compared to LinearRegression: Lasso can cancel some weights.
    models.append((linear_model.ElasticNet(), 'elastic net'))  # Mixing Ridge (alpha) and Lasso (1. - alpha).
    error_min, best_alpha, best_model = float('inf'), 0, ''
    _, all_axis = plt.subplots(2, 4)
    for model_lbl in models:
        model, lbl = model_lbl[0], model_lbl[1]

        # Change the alpha hyperparameter.
        coefs, errors = [], []
        for a in alphas:
            if 'alpha' in model.get_params():  # LinearRegression has no alpha.
                model.set_params(alpha=a)

            # Scale data to reduce weights.
            # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
            # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
            pipe = Pipeline(
                [('scale', preprocessing.StandardScaler()), ('model', model)]
            )  # Data scaling applied before / after any operator applied to the model.
            treg = TransformedTargetRegressor(
                regressor=pipe, transformer=preprocessing.MinMaxScaler()
            )  # Target scaling applied before / after any operator applied to the model.

            # Train a model.
            treg.fit(x_train, y_train)
            coefs.append(treg.regressor_['model'].coef_)  # LinearRegression will always have the same coefs.
            errors.append(np.mean((treg.predict(x_test) - y_test) ** 2))  # LinearRegression will always have the same error.

        # Plot errors.
        axis = all_axis.ravel()[0]
        axis.plot(alphas, errors, label=lbl)
        axis.set_xscale('log')
        axis.set_xlabel('alpha')
        axis.set_ylabel('errors')
        axis.legend()

        # Save best model / alpha.
        if np.min(errors) < error_min:
            error_min = np.min(errors)
            best_alpha = alphas[np.argmin(errors)]
            best_model = lbl

        # Plot weights.
        nb_coefs = np.shape(coefs)[1]
        for c in range(nb_coefs):
            axis = all_axis.ravel()[c + 1]
            coef = np.array(coefs)[:, c]
            axis.plot(alphas, coef, label=lbl)
            axis.set_xscale('log')
            axis.set_xlabel('alpha')
            axis.set_ylabel('weights_' + str(c) + ': ' + raw_data.columns[1 + c])
            axis.legend()

    for i in range(8):
        axis = all_axis.ravel()[i]
        axis.axvline(best_alpha, label='best: ' + best_model, color='k', ls='--')
        axis.legend()

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                        wspace=0.3, hspace=0.3)
    plt.show()
class WrappedModelRegression:
    """Wrapper for Models in Regression problems.

    Models get wrapped with TransformedTargetRegressor to transform the y
    target before predictions on X features take place.

    The wrapper additionally customizes the __name__, __class__ and __str__
    methods/attributes to return those values from the main Model (not
    TransformedTargetRegressor).

    Attributes:
        clf (sklearn.compose.TransformedTargetRegressor): wrapped model for
            regression problems
    """

    def __init__(self, regressor, transformer):
        """Create WrappedModelRegression object.

        Override __name__ and __class__ attributes with appropriate
        attributes from regressor.

        Args:
            regressor (sklearn.Model): Model used to predict regression target
            transformer (sklearn.Transformer): Transformer used to transform y (target)
        """
        self.clf = TransformedTargetRegressor(regressor=regressor,
                                              transformer=transformer)
        self.__name__ = self.clf.regressor.__class__.__name__
        self.__class__ = self.clf.regressor.__class__

    def fit(self, *args, **kwargs):
        """Fit Model in clf attribute with provided arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            self
        """
        self.clf.fit(*args, **kwargs)
        return self

    def predict(self, *args, **kwargs):
        """Predict provided arguments with Model in clf attribute.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            numpy.ndarray: predictions
        """
        return self.clf.predict(*args, **kwargs)

    def get_params(self, *args, **kwargs):
        """Return params of regressor inside wrapped clf Model.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            dict: params of regressor
        """
        return self.clf.regressor.get_params(*args, **kwargs)

    def __str__(self):
        """Return __str__ method of regressor inside wrapped clf Model.

        Returns:
            str: __str__ method of regressor
        """
        return self.clf.regressor.__str__()

    def __class__(self, *args, **kwargs):
        """Return new object of regressor class instantiated with *args and
        **kwargs arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            regressor: new regressor object
        """
        return self.clf.regressor.__class__(*args, **kwargs)
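# A minimal usage sketch for WrappedModelRegression (kept as a comment since
# it needs training data; the Ridge / QuantileTransformer pairing is an
# illustrative assumption, mirroring how the surrounding tests combine them):
#
#     from sklearn.linear_model import Ridge
#     from sklearn.preprocessing import QuantileTransformer
#
#     wrapped = WrappedModelRegression(
#         regressor=Ridge(alpha=1.0),
#         transformer=QuantileTransformer(output_distribution="normal"))
#     wrapped.fit(X_train, y_train)
#     preds = wrapped.predict(X_test)
#     print(wrapped)  # __str__ delegates to the inner Ridge, not the wrapper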
ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

ax1.scatter(y_test, y_pred)
ax1.plot([0, 2000], [0, 2000], '--k')
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 2000])
ax1.set_ylim([0, 2000])

f.suptitle("Synthetic data", y=0.035)
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

###############################################################################
def plot_transformed_target():
    """
    ======================================================
    Effect of transforming the targets in regression model
    ======================================================

    In this example, we give an overview of the
    :class:`sklearn.compose.TransformedTargetRegressor`. Two examples
    illustrate the benefit of transforming the targets before learning a
    linear regression model. The first example uses synthetic data while the
    second example is based on the Boston housing data set.
    """
    # Author: Guillaume Lemaitre <*****@*****.**>
    # License: BSD 3 clause

    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt

    from distutils.version import LooseVersion

    print(__doc__)

    ###########################################################################
    # Synthetic example
    ###########################################################################

    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import RidgeCV
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.metrics import median_absolute_error, r2_score

    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    ###########################################################################
    # A synthetic random regression problem is generated. The targets ``y``
    # are modified by: (i) translating all targets such that all entries are
    # non-negative and (ii) applying an exponential function to obtain
    # non-linear targets which cannot be fitted using a simple linear model.
    #
    # Therefore, a logarithmic (`np.log1p`) and an exponential function
    # (`np.expm1`) will be used to transform the targets before training a
    # linear regression model and using it for prediction.

    X, y = make_regression(n_samples=10000, noise=100, random_state=0)
    y = np.exp((y + abs(y.min())) / 200)
    y_trans = np.log1p(y)

    ###########################################################################
    # The following illustrates the probability density functions of the
    # target before and after applying the logarithmic function.

    f, (ax0, ax1) = plt.subplots(1, 2)

    ax0.hist(y, bins=100, **density_param)
    ax0.set_xlim([0, 2000])
    ax0.set_ylabel('Probability')
    ax0.set_xlabel('Target')
    ax0.set_title('Target distribution')

    ax1.hist(y_trans, bins=100, **density_param)
    ax1.set_ylabel('Probability')
    ax1.set_xlabel('Target')
    ax1.set_title('Transformed target distribution')

    f.suptitle("Synthetic data", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ###########################################################################
    # At first, a linear model is applied on the original targets. Due to the
    # non-linearity, the trained model will not be precise at prediction time.
    # Subsequently, a logarithmic function is used to linearize the targets,
    # allowing better prediction even with a similar linear model, as reported
    # by the median absolute error (MAE).
    f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

    regr = RidgeCV()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    ax0.scatter(y_test, y_pred)
    ax0.plot([0, 2000], [0, 2000], '--k')
    ax0.set_ylabel('Target predicted')
    ax0.set_xlabel('True Target')
    ax0.set_title('Ridge regression \n without target transformation')
    ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
        r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax0.set_xlim([0, 2000])
    ax0.set_ylim([0, 2000])

    regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                            func=np.log1p,
                                            inverse_func=np.expm1)
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)

    ax1.scatter(y_test, y_pred)
    ax1.plot([0, 2000], [0, 2000], '--k')
    ax1.set_ylabel('Target predicted')
    ax1.set_xlabel('True Target')
    ax1.set_title('Ridge regression \n with target transformation')
    ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
        r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax1.set_xlim([0, 2000])
    ax1.set_ylim([0, 2000])

    f.suptitle("Synthetic data", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    ###########################################################################
    # Real-world data set
    ###########################################################################
    # In a similar manner, the Boston housing data set is used to show the
    # impact of transforming the targets before learning a model. In this
    # example, the targets to be predicted correspond to the weighted
    # distances to the five Boston employment centers.

    from sklearn.datasets import load_boston
    from sklearn.preprocessing import QuantileTransformer, quantile_transform

    dataset = load_boston()
    target = np.array(dataset.feature_names) == "DIS"
    X = dataset.data[:, np.logical_not(target)]
    y = dataset.data[:, target].squeeze()
    y_trans = quantile_transform(dataset.data[:, target],
                                 output_distribution='normal').squeeze()

    ###########################################################################
    # A :class:`sklearn.preprocessing.QuantileTransformer` is used such that
    # the targets follow a normal distribution before applying a
    # :class:`sklearn.linear_model.RidgeCV` model.

    f, (ax0, ax1) = plt.subplots(1, 2)

    ax0.hist(y, bins=100, **density_param)
    ax0.set_ylabel('Probability')
    ax0.set_xlabel('Target')
    ax0.set_title('Target distribution')

    ax1.hist(y_trans, bins=100, **density_param)
    ax1.set_ylabel('Probability')
    ax1.set_xlabel('Target')
    ax1.set_title('Transformed target distribution')

    f.suptitle("Boston housing data: distance to employment centers", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    ###########################################################################
    # The effect of the transformer is weaker than on the synthetic data.
    # However, the transform induces a decrease of the MAE.
    f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

    regr = RidgeCV()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    ax0.scatter(y_test, y_pred)
    ax0.plot([0, 10], [0, 10], '--k')
    ax0.set_ylabel('Target predicted')
    ax0.set_xlabel('True Target')
    ax0.set_title('Ridge regression \n without target transformation')
    ax0.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (
        r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax0.set_xlim([0, 10])
    ax0.set_ylim([0, 10])

    regr_trans = TransformedTargetRegressor(
        regressor=RidgeCV(),
        transformer=QuantileTransformer(output_distribution='normal'))
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)

    ax1.scatter(y_test, y_pred)
    ax1.plot([0, 10], [0, 10], '--k')
    ax1.set_ylabel('Target predicted')
    ax1.set_xlabel('True Target')
    ax1.set_title('Ridge regression \n with target transformation')
    ax1.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (
        r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax1.set_xlim([0, 10])
    ax1.set_ylim([0, 10])

    f.suptitle("Boston housing data: distance to employment centers", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    plt.show()
def runmodels():
    # load the data
    dfTrain = pd.read_csv('train.csv', low_memory=False)
    dfTest = pd.read_csv('test.csv', low_memory=False)
    dfStore = pd.read_csv("store.csv", low_memory=False)

    # drop the zero sales and closed stores
    dfTrain = dfTrain[(dfTrain.Open != 0) & (dfTrain.Sales != 0)]
    sales, holidays = prophetData(dfTrain)

    # fill the NaN values in the CompetitionDistance column
    dfStore.CompetitionDistance.fillna(dfStore.CompetitionDistance.median(),
                                       inplace=True)
    # replace all the other NaN values with zeros
    dfStore.fillna(0, inplace=True)
    # fill the missing values
    dfTest.fillna(1, inplace=True)

    # merge train and test dataset with store data
    dfTrainStore = merge(dfTrain, dfStore)
    dfTestStore = merge(dfTest, dfStore)

    # Set the target column
    Y = dfTrainStore['Sales']
    Id = dfTestStore['Id']

    # remove dataset-specific columns
    dfTrainStore = dfTrainStore.drop(['Customers', 'Sales'], axis=1)
    dfTestStore = dfTestStore.drop(['Id'], axis=1)

    # split the data into a training set and a validation set
    xTrain, xTrainTest, yTrain, yTrainTest = train_test_split(
        dfTrainStore, Y, test_size=0.20, random_state=42)

    pipe = Pipeline(steps=[
        ('multipleTrans', multipleTransformer()),
        ('randomForest', RandomForestRegressor(n_estimators=128,
                                               criterion='mse',
                                               max_depth=20,
                                               min_samples_split=10,
                                               min_samples_leaf=1,
                                               min_weight_fraction_leaf=0.0,
                                               max_features='auto',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_impurity_split=None,
                                               bootstrap=True,
                                               oob_score=False,
                                               n_jobs=4,
                                               random_state=35,
                                               verbose=0,
                                               warm_start=False))])
    regModel = TransformedTargetRegressor(regressor=pipe,
                                          func=targetTransform,
                                          inverse_func=reverseTargetTransform)

    # train the Regression Model
    regModel.fit(xTrain, yTrain)
    # Regression Model prediction
    yPred = regModel.predict(xTrainTest)
    # predict on the testStore set
    predictions = regModel.predict(dfTestStore)
    # turn the predictions into a dataframe
    dfPreds = pd.DataFrame({'Id': Id, 'Sales': predictions})

    # train the prophet Model
    pModel = Prophet(interval_width=0.5, holidays=holidays)
    pModel.fit(sales)
    # dataframe that extends 6 weeks into the future
    future_dates = pModel.make_future_dataframe(periods=6 * 7)
    # prophet model predictions
    forecast = pModel.predict(future_dates)
    # rename prediction columns and isolate the predictions
    fc = forecast[['ds', 'yhat']].rename(columns={'Date': 'ds',
                                                  'Forecast': 'yhat'})

    # get the current time and turn it into a string
    now = datetime.datetime.now().strftime('%d-%m-%Y-%H-%M-%S-%f')[:-3]
    # Save the model
    # filenameReg = 'regModel-' + now + '.pkl'
    # filenamePro = 'pModel-' + now + '.pkl'
    # pickle.dump(regModel, open(filenameReg, 'wb'))
    # pickle.dump(pModel, open(filenamePro, 'wb'))

    return render_template('model.html',
                           labels=dfPreds['Id'],
                           values=dfPreds['Sales'],
                           linelabels=fc['ds'],
                           linevalues=fc['yhat'])
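# `targetTransform` / `reverseTargetTransform` are defined elsewhere in the
# project; a plausible, purely illustrative pair for sales data would be a
# log transform (this is an assumption, not the project's actual code):
#
#     def targetTransform(y):
#         return np.log1p(y)
#
#     def reverseTargetTransform(y):
#         return np.expm1(y)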
ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

# Transform targets and use same linear model
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

ax1.scatter(y_test, y_pred)
ax1.plot([0, 2000], [0, 2000], '--k')
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 2000])
ax1.set_ylim([0, 2000])

f.suptitle("Synthetic data", y=0.035)
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])