def regression_visualization(model, X_train, X_test, y_train, y_test):
    """Render a Yellowbrick prediction-error plot inside the Streamlit app.

    The visualizer is fitted on the training split and scored on the test
    split; the resulting matplotlib figure is then handed to Streamlit.

    Args:
        model: Regressor wrapped by ``PredictionError``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
    """
    visualizer = PredictionError(model)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    plt.title('Score visualization')
    plt.legend()
    # Hand the figure over explicitly: a bare st.pyplot() relies on
    # matplotlib's global figure, which Streamlit deprecated.
    # clear_figure=True keeps the previous behaviour of clearing the figure
    # once it has been rendered.
    st.pyplot(plt.gcf(), clear_figure=True)
def peplot():
    """Save a Lasso prediction-error plot for the data from ``load_concrete``.

    The data is split 80/20, the visualizer draws on a fresh figure from
    ``newfig()`` and the result is saved through the ``savefig`` helper
    under the name "prediction_error".
    """
    features, target = load_concrete()
    splits = tts(features, target, test_size=0.2)
    X_train, X_test, y_train, y_test = splits

    visualizer = PredictionError(Lasso(), ax=newfig())
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    savefig(visualizer, "prediction_error")
def prediction_error_plot(lin_model, x_train, y_train, x_test, y_test):
    """Show a prediction-error plot for ``lin_model`` on a 16x12 inch figure.

    The visualizer is fitted on the training data, scored on the test data
    and then displayed.
    """
    figure = plt.figure(figsize=(16, 12))
    axes = figure.add_subplot(1, 1, 1)

    plot = PredictionError(lin_model, ax=axes)
    plot.fit(x_train, y_train)
    plot.score(x_test, y_test)
    plot.show()
def visualiza_erros(train_x, train_y, test_x, test_y):
    """Show the prediction-error plot, then the residuals plot.

    Each plot wraps its own fresh ``LinearRegression``, is fitted on the
    training data and scored on the test data before being drawn.
    """
    for visualizer_cls in (PredictionError, ResidualsPlot):
        plot = visualizer_cls(LinearRegression())
        plot.fit(train_x, train_y)
        plot.score(test_x, test_y)
        plot.poof()
def log_prediction_error_chart(regressor, X_train, X_test, y_train, y_test, experiment=None):
    """Log prediction error chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Any error raised while building or logging the chart is printed rather
    than propagated. The matplotlib figure is closed in every case.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_prediction_error_chart(rfr, X_train, X_test, y_train, y_test)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    exp = _validate_experiment(experiment)

    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = PredictionError(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Prediction Error')
    except Exception as e:
        print('Did not log prediction error chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too; otherwise every failed
        # call would leave an open figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
def visualize_prediction_error(self, model_info):
    """Fit and score a ``PredictionError`` visualizer from a model-info dict.

    ``model_info`` must provide the keys 'model', 'X_train', 'X_test',
    'Y_train' and 'Y_test'. The visualizer is fitted on the training pair
    and scored on the test pair; it is neither shown nor returned.
    """
    estimator, features_train, features_test, target_train, target_test = (
        model_info[key]
        for key in ('model', 'X_train', 'X_test', 'Y_train', 'Y_test')
    )

    plot = PredictionError(estimator)
    plot.fit(features_train, target_train)
    plot.score(features_test, target_test)
def regression_sanity_check(model, X_train, X_test, y_train, y_test):
    """Draw residuals (left) and prediction error (right) side by side.

    Both visualizers wrap the same ``model``, are fitted on the training
    split and scored on the test split. The residuals plot is finalized and
    the prediction-error plot is rendered with ``poof()``.
    """
    _, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

    plt.sca(left_ax)
    residuals = ResidualsPlot(model, ax=left_ax)
    residuals.fit(X_train, y_train)
    residuals.score(X_test, y_test)

    plt.sca(right_ax)
    pred_error = PredictionError(model, ax=right_ax)
    pred_error.fit(X_train, y_train)
    pred_error.score(X_test, y_test)

    residuals.finalize()
    pred_error.poof()
def create_prediction_error_chart(regressor, X_train, X_test, y_train, y_test):
    """Create prediction error chart.

    Any error raised while building the chart is printed rather than
    propagated, in which case ``None`` is returned. The matplotlib figure is
    closed in every case.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['prediction_error'] = npt_utils.create_prediction_error_chart(rfr, X_train, X_test, y_train, y_test)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None
    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = PredictionError(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        chart = neptune.types.File.as_image(fig)
    except Exception as e:
        print('Did not log prediction error chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too; otherwise every failed
        # call would leave an open figure registered with pyplot.
        if fig is not None:
            plt.close(fig)

    return chart
def testFunc9(savepath='Results/bikeshare_Ridge_PredictionError.png'):
    """Save a Ridge prediction-error plot built from the bikeshare CSV.

    Reads 'fixtures/bikeshare/bikeshare.csv', splits it 70/30, fits
    ``Ridge(alpha=3.181)`` through the visualizer and writes the plot to
    ``savepath``.
    """
    feature_columns = [
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]
    frame = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    features = frame[feature_columns]
    riders = frame["riders"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, riders, test_size=0.3
    )

    plot = PredictionError(Ridge(alpha=3.181))
    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)
    plot.poof(outpath=savepath)
def perror(ax):
    """Build a LassoCV prediction-error visualizer on ``ax`` and return it.

    The 'concrete' dataset is loaded through ``load_data`` as a
    train/test split with 'strength' as the target. The visualizer is
    fitted and scored but not shown.
    """
    from sklearn.linear_model import LassoCV
    from yellowbrick.regressor import PredictionError

    feature_cols = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    X_train, X_test, y_train, y_test = load_data(
        'concrete', cols=feature_cols, target='strength', tts=True
    )

    viz = PredictionError(LassoCV(), ax=ax)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    return viz
def prediction_error_plot(self) -> None:
    """Save (and, outside local runs, upload) the prediction-error plot.

    Plots the actual targets against the values predicted by
    ``self.trained_model``. The image is written under ``self.plots_dir``
    and, when ``LOCAL`` is falsy, uploaded to ``S3_BUCKET_NAME``.
    """
    viz = PredictionError(self.trained_model)
    viz.fit(self.X_train, self.y_train)
    viz.score(self.X_test, self.y_test)

    plot_path = f"{self.plots_dir}/prediction_error_plot_{self.model_id}.png"
    viz.show(outpath=plot_path)

    if not LOCAL:
        upload_to_s3(plot_path, f'plots/prediction_error_plot_{self.model_id}.png', bucket=S3_BUCKET_NAME)

    # NOTE(review): assumed to run regardless of LOCAL so the next plot starts
    # from a clean figure - confirm against the original indentation.
    plt.clf()
def lasso_regression(X_train, y_train, X_test, y_test, plot):
    """
    Fit a cross-validated lasso regression, optionally plot its diagnostics,
    and return its evaluation metrics.

    Prints the selected alpha, the training score and how many coefficients
    were kept or zeroed. Returns a dict with the keys "name", "R squared",
    "RMSE", "R squared training" and "MAE".

    NOTE(review): all three plots are assumed to be guarded by ``plot`` -
    confirm against the original indentation.
    """
    # Fit the lasso regression with built-in cross-validation
    lasso = LassoCV()
    lasso.fit(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % lasso.alpha_)
    print("Best score using built-in LassoCV: %f" % lasso.score(X_train, y_train))

    coefficients = pd.Series(lasso.coef_, index=X_train.columns)
    kept = sum(coefficients != 0)
    dropped = sum(coefficients == 0)
    print(
        "Lasso picked "
        + str(kept)
        + " variables and eliminated the other "
        + str(dropped)
        + " variables"
    )

    # Coefficients ordered from most negative to most positive
    ordered_coefficients = coefficients.sort_values()

    if plot:
        # Feature importance as a horizontal bar chart
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        ordered_coefficients.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()

        # Prediction error first, residuals second
        for visualizer_cls in (PredictionError, ResidualsPlot):
            diagnostic = visualizer_cls(lasso, size=(1080, 720))
            diagnostic.fit(X_train, y_train)
            diagnostic.score(X_test, y_test)
            diagnostic.show()

    # Score the model on the held-out data
    predictions = lasso.predict(X_test)

    return {
        "name": "Lasso Regression",
        "R squared": lasso.score(X_test, y_test),
        "RMSE": rmse(y_test, predictions),
        "R squared training": lasso.score(X_train, y_train),
        "MAE": mean_absolute_error(y_test, predictions),
    }
def test_prepredict_regressor(self):
    """
    Check that PrePredict serves stored regression predictions and that a
    prediction error plot can be drawn from them
    """
    data_X, data_y = self.continuous.X, self.continuous.y

    # Predictions computed up front by an ordinary linear model
    fitted = LinearRegression().fit(data_X.train, data_y.train)
    y_pred = fitted.predict(data_X.test)

    # The wrapper must hand back exactly the array it was given
    estimator = PrePredict(y_pred, REGRESSOR)
    assert estimator.fit(data_X.train, data_y.train) is estimator
    assert estimator.predict(data_X.train) is y_pred
    expected_score = pytest.approx(0.9999983124154966, rel=1e-2)
    assert estimator.score(data_X.test, data_y.test) == expected_score

    # A visualizer must accept the pre-predicting estimator
    plot = PredictionError(estimator)
    plot.fit(data_X.train, data_y.train)
    plot.score(data_X.test, data_y.test)
    plot.finalize()

    self.assert_images_similar(plot, tol=10.0)
def showError():
    """Show a Lasso prediction-error plot for the 'concrete' dataset.

    Loads the data through ``load_data``, splits it 80/20, fits a ``Lasso``
    model through the visualizer on the training split, scores it on the
    test split and draws the plot.
    """
    # Load the data
    df = load_data('concrete')
    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame.
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same NumPy array and is available on every pandas version.
    X = df[feature_names].values
    y = df[target_name].values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the linear model and visualizer
    lasso = Lasso()
    visualizer = PredictionError(lasso)

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.poof()  # Draw/show/poof the data
def regression(fname="regression.png"):
    """
    Render the two-panel regression figure and save it under FIGURES
    """
    _, (left_ax, right_ax) = plt.subplots(ncols=2, figsize=(18, 6))
    alpha_grid = np.logspace(-10, 1, 300)
    dataset = load_concrete(split=True)

    # Left panel: prediction error of a cross-validated lasso
    pred_error = PredictionError(LassoCV(alphas=alpha_grid), ax=left_ax)
    pred_error.fit(dataset.X.train, dataset.y.train)
    pred_error.score(dataset.X.test, dataset.y.test)
    pred_error.finalize()

    # Right panel: residuals of a cross-validated ridge
    residuals = ResidualsPlot(RidgeCV(alphas=alpha_grid), ax=right_ax)
    residuals.fit(dataset.X.train, dataset.y.train)
    residuals.score(dataset.X.test, dataset.y.test)
    residuals.finalize()

    # Write the combined figure to disk
    target_path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(target_path)
Koefisien yang paling besar dari model adalah GrLivArea sebesar 0.3154, artinya harga rumah sensitif dengan kolom ini. Apabila terjadi peningkatan terhadap nilai GrLivArea, harga rumah akan meningkat lebih tinggi dibandingkan apabila terjadi kenaikan pada feature yang lain dengan kenaikan yang sama. Perhatikan juga terdapat feature dengan nilai koefisien yang negatif (ExterQual_TA dan ExterQual_Fa), artinya apabila feature ini meningkat maka harga rumah akan menjadi lebih turun. ''' ''' #### 2. Residual Plot ''' st.write('') visualizer_residual = ResidualsPlot(model_lr) visualizer_residual.fit(X_train, y_train) visualizer_residual.score(X_test, y_test) visualizer_residual.finalize() st.pyplot() ''' Residual berdistribusi paling banyak pada nilai 0. Akan tetapi, masih terdapat nilai residual yang cukup tinggi. Hal ini menyebabkan distribusi dari residual tidak sepenuhnya normal, tetapi menjadi skew. ''' ''' #### 3. Prediction Error ''' st.write('') visualizer_prediction_error = PredictionError(model_lr) visualizer_prediction_error.fit(X_train, y_train) visualizer_prediction_error.score(X_test, y_test) visualizer_prediction_error.finalize() st.pyplot() ''' Antara garis best fit dengan garis identity tidak begitu jauh, sehingga dapat dikatakan bahwa model yang dibuat optimal. '''
# pandas is required for pd.read_csv below; the import was missing.
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

from yellowbrick.regressor import PredictionError

if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame.
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same NumPy array and is available on every pandas version.
    X = df[feature_names].values
    y = df[target_name].values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the linear model and visualizer
    lasso = Lasso()
    visualizer = PredictionError(lasso)

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    g = visualizer.poof(
        outpath="images/prediction_error.png")  # Draw/show/poof the data
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

from yellowbrick.regressor import PredictionError


def main():
    """Fit a Lasso on the concrete CSV and save its prediction-error plot."""
    # Load the regression data set
    frame = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age']
    target_name = 'strength'

    # Hold out 20% of the rows for scoring
    X_train, X_test, y_train, y_test = train_test_split(
        frame[feature_names], frame[target_name], test_size=0.2
    )

    # Fit on the training split, evaluate on the held-out split
    plot = PredictionError(Lasso())
    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)

    # Write the finished figure to disk
    plot.poof(outpath="images/prediction_error.png")


if __name__ == '__main__':
    main()
# How does our model perform on the test data? score_model(lasso) # What do our residuals look like? from yellowbrick.regressor import ResidualsPlot resplot = ResidualsPlot(lasso) resplot.fit(Xtrain, ytrain) resplot.score(Xtest, ytest) g = resplot.poof() # What does our prediction error look like? from yellowbrick.regressor import PredictionError prederr = PredictionError(lasso) prederr.fit(Xtrain, ytrain) prederr.score(Xtrain, ytrain) g = prederr.poof() # Next, we pull out our fitted values (yhat) and actuals (ytest) to see how they compare. # We also calculate our residuals by subtracting our fitted values from the actuals. import matplotlib.pyplot as plt lasso.fit(Xtrain, ytrain) yhat = lasso.predict(Xtest) resid = ytest - yhat data = pd.DataFrame({ 't': range(1, len(yhat) + 1), 'ytest': ytest,
# Hold out a test split with a fixed seed and fit the linear model on the rest
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

lm5 = LinearRegression()
lm5.fit(x_train, y_train)
lm5_pred = lm5.predict(x_test)

print("RMSE = ", np.sqrt(mean_squared_error(y_test, lm5_pred)))
print("R^2 = ", r2_score(y_test, lm5_pred))


# In[30]:

from yellowbrick.regressor import PredictionError, ResidualsPlot

# Prediction-error plot of the fitted model, scored on the held-out split
visualizer = PredictionError(lm5)
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()


# In[32]:

# TASK 7: INTERACTION EFFECT - SYNERGY
# The interaction feature is the product of the TV and radio columns
advert['interaction'] = advert['TV'] * advert['radio']

x = advert[['TV', 'radio', 'interaction']]
y = advert['sales']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
# Model building

# Lasso: fit, then predict the scaled user-supplied values
regressor = Lasso(alpha=0.005, random_state=0)
regressor.fit(X_train, y_train)
lasso_input = scaler.transform(np.array(values_topredict))
prediction_Lasso = regressor.predict(lasso_input)

# Random Forest Regressor: same procedure
regressor1 = RandomForestRegressor(n_estimators=300, random_state=0)
regressor1.fit(X_train, y_train)
forest_input = scaler.transform(np.array(values_topredict))
prediction_RFR = regressor1.predict(forest_input)

# Prediction-error plot for the lasso
visualiser = PredictionError(regressor)
visualiser.fit(X_train, y_train)
visualiser.score(X_test, y_test)
visualiser.poof()

# Prediction-error plot for the random forest
visualiser1 = PredictionError(regressor1)
visualiser1.fit(X_train, y_train)
visualiser1.score(X_test, y_test)
visualiser1.poof()

y_pred1 = regressor1.predict(X_test)

# Absolute lasso coefficients, labelled with the first 20 column names and
# drawn from largest to smallest
importance = pd.Series(np.abs(regressor.coef_.ravel()))
importance.index = df.columns[:20].tolist()
importance = importance.sort_values(ascending=False)
importance.plot.bar()
plt.ylabel('Lasso Coefficients')
plt.title('Feature Importance')
def predict():
    """Predict a score range from the submitted form and render the result.

    Form fields read: 'name' (path of the pickled regressor),
    'batting-team', 'bowling-team', 'overs', 'runs', 'wickets',
    'runs_in_prev_5' and 'wickets_in_prev_5'. The feature row is the
    batting-team one-hot, the bowling-team one-hot and the five numeric
    values, in that order.

    Returns:
        The rendered 'prediction.html' template with ``lower_limit``
        (prediction - 10), ``upper_limit`` (prediction + 5) and ``vpe``
        (the return value of the visualizer's ``poof()``).

    NOTE(review): the whole request handling is assumed to sit under the
    POST check, so any other method falls through and returns ``None`` -
    confirm against the original indentation.
    """
    filename = request.form['name']
    # SECURITY NOTE(review): the path comes straight from the submitted form
    # and is unpickled. pickle can execute arbitrary code, so this must only
    # ever be reachable with trusted paths - restrict it to a known set of
    # model files.
    with open(filename, 'rb') as model_file:
        regressor = pickle.load(model_file)

    temp_array = list()

    if request.method == 'POST':
        temp_array = temp_array + _one_hot_team(request.form['batting-team'])
        temp_array = temp_array + _one_hot_team(request.form['bowling-team'])

        overs = float(request.form['overs'])
        runs = int(request.form['runs'])
        wickets = int(request.form['wickets'])
        runs_in_prev_5 = int(request.form['runs_in_prev_5'])
        wickets_in_prev_5 = int(request.form['wickets_in_prev_5'])

        temp_array = temp_array + [overs, runs, wickets, runs_in_prev_5, wickets_in_prev_5]

        data = np.array([temp_array])
        my_prediction = int(regressor.predict(data)[0])

        # X_train / y_train / X_test / y_test are read from module scope.
        visualizer_pe = PredictionError(regressor)
        visualizer_pe.fit(X_train, y_train)
        visualizer_pe.score(X_test, y_test)
        vpe = visualizer_pe.poof()

        return render_template('prediction.html', lower_limit=my_prediction-10, upper_limit=my_prediction+5, vpe=vpe)


# Team order fixes the position of the 1 inside each one-hot vector.
_TEAMS = (
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
)


def _one_hot_team(team):
    """Return the 8-element one-hot list for ``team``, or ``[]`` if unknown."""
    if team not in _TEAMS:
        # Same as the original if/elif chain: an unrecognised team
        # contributes no columns at all.
        return []
    return [int(team == name) for name in _TEAMS]
df.plot.scatter(x='H6', y='H5', c='DarkBlue') #Bước 7: Phân tích hồi quy Linear Regression train_df, test_df = train_test_split(df, test_size=0.2, random_state=1) y_train = np.array(train_df.H5) X_train = np.array(train_df.H6) X_train = X_train.reshape(X_train.shape[0], 1) y_test = np.array(test_df.H5) X_test = np.array(test_df.H6) X_test = X_test.reshape(X_test.shape[0], 1) model1 = LinearRegression() visualizer = PredictionError(model1) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_train, y_train) #valuate the model on the test data visualizer.show() #Bước 8 Đánh giá mô hình print('Coefficients: ', model1.coef_) print('Score_train: {}'.format(model1.score(X_train, y_train))) print('Score_test: {}'.format(model1.score(X_test, y_test))) # plot for residual error ## setting plot style plt.style.use('fivethirtyeight') ## plotting residual errors in training data plt.scatter(model1.predict(X_train), model1.predict(X_train) - y_train, color="green", s=10,
from sklearn.linear_model import Ridge
from yellowbrick.regressor import PredictionError

import bikeshare

# Ridge prediction-error plot on the splits exposed by the bikeshare module
ridge = Ridge(alpha=3.181)
visualizer = PredictionError(ridge)

visualizer.fit(bikeshare.X_train, bikeshare.y_train)
visualizer.score(bikeshare.X_test, bikeshare.y_test)
visualizer.poof()
def prediction_error_plot(self):
    """Score ``self.pipe`` on the test split and show the prediction-error plot.

    The visualizer is not fitted here, so ``self.pipe`` is presumably fitted
    already. Returns whatever ``PredictionError.show()`` returns.
    """
    plot = PredictionError(self.pipe)
    plot.score(self.X_test, self.y_test)
    shown = plot.show()
    return shown
# Score the baseline, randomized-search and best random-forest models
# (naming suggests this - score_model and the models are defined elsewhere).
score_model(rf)
score_model(rf_random)
score_model(rf_best)

# What do our residuals look like?
# The visualizer is fitted on the training split and scored on the test split.
from yellowbrick.regressor import ResidualsPlot
resplot = ResidualsPlot(rf_best)
resplot.fit(Xtrain, ytrain)
resplot.score(Xtest, ytest)
g = resplot.poof()

# What does our prediction error look like?
# Same procedure: fit on the training split, score on the test split.
from yellowbrick.regressor import PredictionError
prederr = PredictionError(rf_best)
prederr.fit(Xtrain, ytrain)
prederr.score(Xtest, ytest)
g = prederr.poof()

# Next, we pull out our fitted values (yhat) and actuals (ytest) to see how they compare.
# We also calculate our residuals by subtracting our fitted values from the actuals.
import matplotlib.pyplot as plt
# rf_best is fitted on the training split again before predicting.
rf_best.fit(Xtrain, ytrain)
yhat = rf_best.predict(Xtest)
error = ytest - yhat

data = pd.DataFrame({
    't': range(1, len(yhat) + 1),
    'ytest': ytest,
# Mean relative error of the gradient-boosting test predictions
final_s_gbr = sum(acc_gbr) / len(acc_gbr)

# Relative error of every gradient-boosting prediction on the training rows
acc_train_gbr = []
for i, predicted in enumerate(y_pred_train_gbr):
    actual = Y_train[i]
    acc_train_gbr.append(abs(predicted - actual) / actual)
final_s_train_gbr = sum(acc_train_gbr) / len(acc_train_gbr)

# Reported as a percentage: 100 * (1 - mean relative training error)
final_acc_gbr = (1 - final_s_train_gbr) * 100
print("Accuracy of GradientBoostRegression is")
print(final_acc_gbr)

print("The mean absolute error of GradientBoost ")
mae_gbr = mean_absolute_error(Y_test, y_pred_gbr)
print(mae_gbr)

model = Lasso()

# Prediction-error plot for the gradient-boosting model
visualizer1 = PredictionError(modelgb)
visualizer1.fit(X_train, Y_train)
visualizer1.score(X_test, Y_test)
g = visualizer1.poof()

from sklearn.ensemble import RandomForestRegressor

# Random forest: fit, then predict both splits
rfregressor = RandomForestRegressor(n_estimators=100, random_state=0)
modelrfr = rfregressor.fit(X_train, Y_train)
y_pred_rfr = rfregressor.predict(X_test)
y_pred_train_rfr = rfregressor.predict(X_train).tolist()

# Relative error of every random-forest prediction on the test rows
acc_rfr = []
for i, predicted in enumerate(y_pred_rfr):
    actual = Y_test[i]
    acc_rfr.append(abs(predicted - actual) / actual)
final_s_rfr = sum(acc_rfr) / len(acc_rfr)

acc_train_rfr = []
advert.columns = columns
# advert.head()
# advert.info()

# Every column except the first one is used as a feature
col = columns[1:]
# sns.pairplot(advert, x_vars=col, y_vars='线路价格(不含税)', height=14, aspect=0.7)

X = advert[col]
y = advert['线路总成本']

# lm1: fitted and evaluated on the complete data set (in-sample)
lm1 = LinearRegression()
lm1.fit(X, y)
lm1_predict = lm1.predict(X[col])

xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)
# print("R^2:",r2_score(y,lm1_predict))
# high-impact factors R^2: 0.9797304791768885

# lm2: fitted on the training split only, evaluated on the held-out split
lm2 = LinearRegression().fit(xtrain, ytrain)
lm2_predict = lm2.predict(xtest)

print("RMSE2:", np.sqrt(mean_squared_error(ytest, lm2_predict)))
print("R^2 lm2:", r2_score(ytest, lm2_predict))
print(lm2.intercept_)
print(lm2.coef_)
# R^2: 0.9797304791768885
# RMSE: 535.8592414949177

# Visualise the train-only model. Wrapping lm1 here either scored a model
# that had already seen the test rows or silently refitted lm1 on the
# training split before its coefficients are printed below.
visualizer = PredictionError(lm2).fit(xtrain, ytrain)
visualizer.score(xtest, ytest)
visualizer.poof()

# sns.heatmap(advert.corr(),cmap="YlGnBu",annot=True)
# plt.show()
print("R^2 lm1:", r2_score(y, lm1_predict))
print(lm1.intercept_)
print(lm1.coef_)
# plt.show()
def evaluate_results_time_series(df, time_period_col, model, target, path_to_save_report, max_features=None, plot_since_period=0):
    """Evaluate ``model`` period by period and write a PDF report.

    For every period after the first one, the rows of all earlier periods
    form the training set and the rows of that period form the test set.
    The model is NOT refitted here; it is only used to predict. For periods
    at or after ``plot_since_period`` a page with a prediction-error plot and
    a residuals plot is appended to the PDF.

    Args:
        df: DataFrame holding the features, the target and the period column.
        time_period_col: Name of the integer column identifying the period.
        model: Regressor used for prediction (presumably already fitted).
        target: Name of the target column.
        path_to_save_report: Path of the PDF report to write.
        max_features: Unused; kept for interface compatibility.
        plot_since_period: First period for which a report page is drawn.

    Returns:
        Tuple ``(model, X_train, y_train, X_test, y_test, mean_error)`` where
        the four data items belong to the last evaluated period (``None`` if
        no period was evaluated) and ``mean_error`` lists the per-period
        errors returned by ``rmse``.
    """
    mean_error = []
    # Pre-bind the returned splits so the return statement also works when
    # the data holds a single period and the loop body never runs.
    X_train = y_train = X_test = y_test = None

    # Honour the caller-supplied column name instead of the hard-coded
    # ``df.time_period`` attribute the original used for the split.
    periods = df[time_period_col]

    with PdfPages(path_to_save_report) as pdf:
        for period in range(periods.min() + 1, periods.max() + 1):
            train = df[periods < period]
            test = df[periods == period]

            # ``columns=`` replaces the positional axis argument, which
            # pandas 2.0 no longer accepts.
            X_train, X_test = train.drop(columns=target), test.drop(columns=target)
            y_train, y_test = train[target], test[target]

            y_pred = model.predict(X_test)

            error = rmse(y_test, y_pred)
            mean_error.append(error)

            if period >= plot_since_period:
                fig = plt.figure(figsize=(22, 5))
                fig.suptitle(
                    'Period {} - Error {} - Train size: {} / Test size: {}'.
                    format(period, round(error, 5), len(y_train), len(y_test)),
                    fontsize=14)
                fig.subplots_adjust(top=0.85, wspace=0.1)

                ax1 = fig.add_subplot(1, 2, 1)
                visualizer = PredictionError(model, ax=ax1, line_color="red")
                visualizer.score(X_test, y_test)
                visualizer.finalize()

                ax2 = fig.add_subplot(1, 2, 2)
                visualizer = ResidualsPlot(model, ax=ax2)
                visualizer.fit(X_train, y_train)
                visualizer.score(X_test, y_test)
                visualizer.finalize()

                pdf.savefig(fig)
                # Close this page's figure explicitly rather than whichever
                # figure happens to be current.
                plt.close(fig)

            # The same message was logged on both branches of the original.
            _logger.info('Period %d - Error %.5f', period, error)

    _logger.info('Mean Error = %.5f', np.mean(mean_error))

    return model, X_train, y_train, X_test, y_test, mean_error
def scikit_learn_method(x, y, min_x, max_x, max_y, ln_bool, df=all_scopus, test_size=0.2, random_state=0):
    """Fit a one-feature linear regression and plot its diagnostics.

    Shows three figures: the full scatter with the fitted line, a
    Yellowbrick prediction-error plot for a separate ``Lasso`` model, and
    the scatter restricted to ``[min_x, max_x]`` x ``[0, max_y]``. Prints
    the fitted equation and the root mean squared error on the test split.

    Args:
        x: Feature values; ``.values`` is used, so a pandas Series is
            expected.
        y: Target values.
        min_x: Lower x-axis limit of the "without outliers" plot.
        max_x: Upper x-axis limit of the "without outliers" plot.
        max_y: Upper y-axis limit of the "without outliers" plot.
        ln_bool: When true, the natural log of ``y`` is modelled instead.
        df: Unused; kept for interface compatibility.
        test_size: Fraction of the rows held out for testing.
        random_state: Seed of the train/test split.
    """
    # https://stackoverflow.com/questions/42988348/typeerror-cannot-convert-the-series-to-class-float
    if ln_bool:
        y = np.log(y)

    # Honour the caller-supplied split parameters; the original hard-coded
    # 0.2 and 0 here, which equal the defaults.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    # sklearn expects a 2-D feature matrix; y needs no reshaping.
    # https://stackoverflow.com/questions/18691084/what-does-1-mean-in-numpy-reshape
    # https://stackoverflow.com/questions/53723928/attributeerror-series-object-has-no-attribute-reshape
    x_train = x_train.values.reshape(-1, 1)
    x_test = x_test.values.reshape(-1, 1)

    model_withOutliers = LinearRegression()
    model_withOutliers = model_withOutliers.fit(x_train, y_train)
    m, b = model_withOutliers.coef_[0], model_withOutliers.intercept_
    print('y-hat = %sx + %s' % (m, b))

    # https://stackoverflow.com/questions/41635448/how-can-i-draw-scatter-trend-line-on-matplot-python-pandas/41635626
    from sklearn.metrics import r2_score

    # "\\;" is the mathtext thin space; the doubled backslash produces the
    # same runtime text without an invalid escape sequence.
    text = f"$y={m:0.3f}\\;x{b:+0.3f}$\n$R^2 = {r2_score(y, m * x + b):0.3f}$"

    # With outliers: annotate BEFORE show(); text added afterwards ends up on
    # the next figure instead of this one.
    plt.scatter(x, y)
    plt.title('With outliers')
    plt.plot(x, m * x + b)
    plt.gca().text(0.05, 0.95, text, transform=plt.gca().transAxes,
                   fontsize=14, verticalalignment='bottom')
    plt.show()

    # https://www.scikit-yb.org/en/latest/api/regressor/peplot.html
    from sklearn.linear_model import Lasso
    from yellowbrick.regressor import PredictionError

    lasso_model = Lasso()
    visualizer = PredictionError(lasso_model)
    visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(x_test, y_test)  # Evaluate the model on the test data
    visualizer.show()

    # Without outliers: same data, axes clipped to the requested window.
    # https://stackoverflow.com/questions/28876243/how-to-delete-the-current-row-in-pandas-dataframe-during-df-iterrows
    plt.xlim(min_x, max_x)
    plt.ylim(0, max_y)
    plt.title('Without outliers')
    plt.scatter(x, y)
    plt.gca().text(0.05, 0.95, text, transform=plt.gca().transAxes,
                   fontsize=14, verticalalignment='bottom')
    plt.show()

    # Root mean squared error on the test split. The original averaged the
    # per-row absolute errors, which is the mean absolute error.
    y_pred_with_outliers = model_withOutliers.predict(x_test)
    squared_errors = (np.asarray(y_test) - y_pred_with_outliers) ** 2
    rms_value = float(np.sqrt(np.mean(squared_errors)))
    print('Root mean squared, with outliers:', rms_value)