def residual_plot(self, x_test=None, y_test=None):
    """Draw a yellowbrick residuals plot for this object's regressor.

    The visualizer is fitted on ``self.x_train`` / ``self.y_train`` and stored
    on ``self.residual_visualizer``. When both ``x_test`` and ``y_test`` are
    given, the test data is scored as well before the plot is rendered.

    Args:
        x_test: Optional test features. Transformed with
            ``self.standardizescaler`` when ``self.standardize`` is truthy.
        y_test: Optional test targets.
    """
    # Only scale when there is something to scale: calling this method without
    # test data used to hand ``None`` to the scaler's ``transform``.
    if self.standardize and x_test is not None:
        x_test = self.standardizescaler.transform(x_test)

    try:
        self.residual_visualizer = ResidualsPlot(self.regressor)
    except yellowbrick.exceptions.YellowbrickTypeError:
        # The wrapper was rejected; retry with the estimator it holds in
        # its own ``regressor`` attribute.
        self.residual_visualizer = ResidualsPlot(self.regressor.regressor)

    self.residual_visualizer.fit(self.x_train, self.y_train)
    if x_test is not None and y_test is not None:
        self.residual_visualizer.score(x_test, y_test)
    self.residual_visualizer.poof()
def residual_plot(model_properties=None, output_path=None):
    """Save the residual plot of the trained model as ``residual_plot.png``.

    Args:
        model_properties: Mapping with the keys ``estimator``, ``X_train``,
            ``y_train``, ``X_validation``, ``y_validation``, ``config_map``,
            ``X_scaler`` and ``y_scaler``. ``config_map`` must provide
            ``scale_columns`` and ``label``.
        output_path: Directory the image is written to.

    Raises:
        ValueError: If either argument is ``None``.
    """
    if model_properties is None or output_path is None:
        raise ValueError('Need Model properties and Output path as arguments !')

    estimator = model_properties['estimator']
    config_map = model_properties['config_map']
    X_scaler = model_properties['X_scaler']
    y_scaler = model_properties['y_scaler']
    scale_columns = config_map['scale_columns']
    label = config_map['label']

    # Scale copies: assigning the scaled columns back onto the caller's frames
    # would scale them again on every subsequent call.
    X_train = model_properties['X_train'].copy()
    y_train = model_properties['y_train'].copy()
    X_validation = model_properties['X_validation'].copy()
    y_validation = model_properties['y_validation'].copy()

    X_train[scale_columns] = X_scaler.transform(X_train[scale_columns])
    y_train[label] = y_scaler.transform(y_train[label])
    X_validation[scale_columns] = X_scaler.transform(X_validation[scale_columns])
    y_validation[label] = y_scaler.transform(y_validation[label])

    visualizer = ResidualsPlot(estimator)
    visualizer.fit(X_train.values, y_train.values)
    visualizer.score(X_validation.values, y_validation.values)
    visualizer.poof(outpath=os.path.join(output_path, 'residual_plot.png'))
    return None
def residuals():
    """Fit a Ridge residuals plot on the concrete data and save it as "residuals"."""
    features, target = load_concrete()
    X_train, X_test, y_train, y_test = tts(features, target, test_size=0.2)

    # Draw on a fresh figure supplied by the newfig() helper.
    viz = ResidualsPlot(Ridge(), ax=newfig())
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    savefig(viz, "residuals")
def train_model(rf, healed_data, target_string):
    """Fit/score a residuals plot for ``rf``, show it, and pickle the model.

    Args:
        rf: Estimator wrapped by the residuals visualizer and then pickled.
        healed_data: Mapping with the keys ``train_features``,
            ``train_target``, ``test_features`` and ``test_target``.
        target_string: Name fragment used in the image and pickle file names
            under ``models/``.
    """
    visualizer = ResidualsPlot(rf)

    # Fit and score are best-effort: failures are reported in the Streamlit
    # app and the function carries on.
    try:
        visualizer.fit(healed_data["train_features"], healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))
    try:
        visualizer.score(healed_data["test_features"], healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))
    visualizer.show()

    # NOTE: plt.savefig() returns None, so st.pyplot() is called without an
    # explicit figure here, exactly as before.
    st.pyplot(plt.savefig("models/rf_reg_eval_" + target_string + ".png"))

    # Save the model; the context manager closes the file even if pickling fails.
    model_output_loc = "models/rf_reg_" + target_string + "_rf_reg_model.pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(rf, model_output)
    print("saving model to: " + model_output_loc)
    return
def uniRegression(p, xLabel, yLabel):
    """Fit a one-variable linear regression and plot the fit and its residuals.

    The rows of ``p`` are shuffled, the last ~20% are held out as the test
    set, and R2 / MSE on that test set are printed. The fit plot is saved to
    ``image_path.format(image_num)`` and the module-level ``image_num``
    counter is incremented.

    Args:
        p: DataFrame holding both columns.
        xLabel: Name of the predictor column.
        yLabel: Name of the target column.
    """
    global image_num

    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)

    # Split train and test (negative index: the last 20% of rows are the test set)
    twentyPercent = -1 * round(p.shape[0] * 0.2)
    xCol = p[xLabel].values.reshape(-1, 1)
    X_train = xCol[:twentyPercent]
    X_test = xCol[twentyPercent:]
    y_train = p[yLabel][:twentyPercent].values.reshape(-1, 1)
    y_test = p[yLabel][twentyPercent:].values.reshape(-1, 1)

    # Fit linear regression model
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    # Make predictions
    predicted = lr.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)

    # Plot expected vs. predicted
    plt.scatter(X_test, y_test, color='black')
    plt.plot(X_test, predicted, color='blue', linewidth=2)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    # Save before showing: once show() returns the figure may already be
    # closed, in which case savefig() would write an empty image.
    plt.savefig(image_path.format(image_num), bbox_inches='tight')
    plt.show()
    image_num += 1

    print("R2 = ", r2)
    print("MSE = ", mse)

    # Plot residuals
    visualizer = ResidualsPlot(lr)
    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show()                 # Finalize and render the figure
def linregress(*args):
    """Fit a logistic regression, print its log odds, and show a residuals plot.

    Args:
        *args: ``(dataframe, y_name, x_name_1, x_name_2, ...)`` where
            ``dataframe`` holds the data, ``y_name`` is the dependent column
            and every remaining argument is a predictor column name.
    """
    # Dependencies are imported locally, as in the original implementation.
    import numpy as np
    import statsmodels.api as sm
    from patsy import dmatrices
    from sklearn.linear_model import LogisticRegression, Ridge
    from yellowbrick.regressor import ResidualsPlot

    # Unpack arguments
    dataframe = args[0]
    target_name = args[1]
    xvars = list(args[2:])
    x = dataframe[xvars]
    y = dataframe[target_name]

    # Fit the scikit-learn model
    model = LogisticRegression()
    model.fit(x, y)

    # Build the patsy formula "y ~ x1 + x2 + ..."
    predictors = " + ".join(f"{item}" for item in xvars)
    ind = target_name.strip('"')

    # Fit the statsmodels logit model used for the fit statistics
    Y, X = dmatrices(f"{ind} ~ {predictors}", data=dataframe, return_type="dataframe")
    logit = sm.Logit(Y, X)
    logit_result = logit.fit()

    # Print log odds and the exponentiated coefficients
    print("LOG ODDS")
    print(logit_result.summary())
    print(np.exp(logit_result.params))

    # Plot the residuals of a Ridge model on the same data
    print("\n Residual Plot")
    ridge = Ridge()
    visualizer = ResidualsPlot(ridge, hist=True)
    y2 = y.values.reshape(-1, 1)
    visualizer.fit(x, y2)    # Fit the data to the visualizer
    visualizer.score(x, y2)  # Score on the same data that was used for fitting
    visualizer.show()        # Finalize and render the figure
def plot_residuals(X, y, model, outpath="images/residuals.png", **kwargs):
    """Draw a residuals plot for ``model`` on an 80/20 split and write it to ``outpath``.

    Extra keyword arguments are forwarded to ``ResidualsPlot``.
    """
    splits = train_test_split(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = splits

    axes = plt.subplots()[1]
    viz = ResidualsPlot(model, ax=axes, **kwargs)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.poof(outpath=outpath)
def vis_residuals(model, features, target):
    """Fit a 1080x720 residuals plot on the given data and render it.

    Returns whatever the visualizer's ``poof()`` call returns.
    """
    plot = ResidualsPlot(model, size=(1080, 720))
    plot.fit(features, target)
    return plot.poof()
def residual_plot(lin_model, x_train, y_train, x_test, y_test):
    """Show a 16x12 residuals plot for ``lin_model``.

    Args:
        lin_model: Regressor wrapped by the visualizer.
        x_train, y_train: Data the visualizer is fitted on.
        x_test, y_test: Data the visualizer is scored on.
    """
    fig = plt.figure(figsize=(16, 12))
    ax = fig.add_subplot(111)
    # The visualizer draws on ``ax``. A second plt.figure() call used to
    # follow here and only produced an extra, empty figure.
    visualizer = ResidualsPlot(lin_model, ax=ax)
    visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(x_test, y_test)  # Evaluate the model on the test data
    visualizer.show()
def residuals_yellowbrick(predictors, target):
    """Return a fitted yellowbrick residuals visualizer.

    Args:
        predictors: Feature data passed to ``fit``.
        target: Target values passed to ``fit``.

    Returns:
        The ``ResidualsPlot`` visualizer after fitting.
    """
    # Instantiate the estimator. Passing the LinearRegression class itself
    # made the later fit() call fail with a missing-argument TypeError.
    lm = LinearRegression()
    visualizer = ResidualsPlot(lm)
    visualizer.fit(predictors, target)
    return visualizer
def residuals_plot(model, X_test, y_test, road):
    """Score a residuals plot on the test set and render it.

    Args:
        model: An already-trained model.
        X_test: Test set data.
        y_test: Test set labels.
        road: Forwarded as the first positional argument of ``poof``
            (presumably the output path -- confirm against callers).
    """
    plot = ResidualsPlot(model)
    plot.score(X_test, y_test)
    plot.poof(road)
def visualiza_erros(train_x, train_y, test_x, test_y):
    """Show a prediction-error plot, then a residuals plot, for a linear regression.

    Each visualizer wraps its own fresh ``LinearRegression`` instance.
    """
    for visualizer_cls in (PredictionError, ResidualsPlot):
        viz = visualizer_cls(LinearRegression())
        viz.fit(train_x, train_y)
        viz.score(test_x, test_y)
        viz.poof()
def log_residuals_chart(regressor, X_train, X_test, y_train, y_test, experiment=None):
    """Log residuals chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            exp = neptune.create_experiment()

            log_residuals_chart(rfr, X_train, X_test, y_train, y_test, experiment=exp)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    exp = _validate_experiment(experiment)

    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Residuals Plot')
    except Exception as e:
        # Logging the chart is best-effort: report the problem and carry on.
        print('Did not log residuals chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too, so it does not leak.
        if fig is not None:
            plt.close(fig)
def regression_sanity_check(model, X_train, X_test, y_train, y_test):
    """Draw residuals (left) and prediction error (right) for ``model`` side by side."""
    _, (residual_ax, error_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

    # Left panel: residuals.
    plt.sca(residual_ax)
    residual_viz = ResidualsPlot(model, ax=residual_ax)
    residual_viz.fit(X_train, y_train)
    residual_viz.score(X_test, y_test)

    # Right panel: prediction error.
    plt.sca(error_ax)
    error_viz = PredictionError(model, ax=error_ax)
    error_viz.fit(X_train, y_train)
    error_viz.score(X_test, y_test)

    # Finalize the first panel, then let the second visualizer render the figure.
    residual_viz.finalize()
    error_viz.poof()
def visualize_residuals_plot(self, model_info):
    """Render a residuals plot from the entries of ``model_info``.

    ``model_info`` must provide the keys ``model``, ``X_train``, ``X_test``,
    ``Y_train`` and ``Y_test``.
    """
    estimator = model_info['model']
    X_train, X_test, Y_train, Y_test = (
        model_info[key] for key in ('X_train', 'X_test', 'Y_train', 'Y_test')
    )

    plot = ResidualsPlot(estimator)
    plot.fit(X_train, Y_train)   # fit on the training data
    plot.score(X_test, Y_test)   # evaluate on the test data
    plot.poof()                  # draw the figure
def test_for_homoscedasticity(X_train, y_train, X_test, y_test):
    """
    Plot the residuals of a linear regression to check for homoscedasticity.

    Arguments:
        X_train (dataframe): examples in the training set
        y_train (dataframe): target in the training set
        X_test (dataframe): examples in the test set
        y_test (dataframe): target in the test set
    """
    plot = ResidualsPlot(LinearRegression())
    plot.fit(X_train, y_train)
    # There should be no clear pattern in the residuals.
    plot.score(X_test, y_test)
    plot.poof()
def testFunc7(savepath='Results/bikeshare_LinearRegression_ResidualsPlot.png'):
    """Residuals plot of a linear regression on the bike-share data.

    Reads ``fixtures/bikeshare/bikeshare.csv``, splits it 70/30, fits the
    visualizer on the training split, scores it on the test split and writes
    the figure to ``savepath``.

    Args:
        savepath: Output path of the rendered image.
    """
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    Y = data["riders"]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    visualizer = ResidualsPlot(LinearRegression())
    # Fit on the training split and evaluate on the held-out split. The model
    # used to be fitted on the test split, with the training split unused.
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof(outpath=savepath)
def plotResidualsAgainstHoldout(df, holdOut_df, task, seed, schema):
    """Fit a linear model on ``df`` and save its residuals plot against ``holdOut_df``.

    Feature and target columns are looked up per ``task`` in the module-level
    ``COLUMNS`` and ``TARGETS`` mappings. The image is written under ``figs/``.
    """
    feature_cols = COLUMNS.get(task)
    target_cols = TARGETS.get(task)

    X_train, y_train = df[feature_cols].values, df[target_cols].values
    X_test, y_test = holdOut_df[feature_cols].values, holdOut_df[target_cols].values

    # Wrap a fresh linear model in the visualizer.
    plot = ResidualsPlot(
        LinearRegression(),
        title="Residuals for schema {}".format(schema),
    )
    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)  # evaluate on the hold-out data
    plot.show(outpath="figs/residuals_{}_seed{}_{}.png".format(task, seed, schema))
    plt.close()
def create_residuals_chart(regressor, X_train, X_test, y_train, y_test):
    """Create residuals chart.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``,
        or ``None`` when the chart could not be created.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['visuals/residuals'] = npt_utils.create_residuals_chart(rfr, X_train, X_test, y_train, y_test)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None
    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        chart = neptune.types.File.as_image(fig)
    except Exception as e:
        # Chart creation is best-effort: report the problem and return None.
        print('Did not log residuals chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too, so it does not leak.
        if fig is not None:
            plt.close(fig)

    return chart
def residuals(ax):
    """Fit and score a RidgeCV residuals plot on the concrete data, drawn on ``ax``.

    Returns the visualizer without finalizing or showing it.
    """
    from sklearn.linear_model import RidgeCV
    from yellowbrick.regressor import ResidualsPlot

    feature_columns = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    X_train, X_test, y_train, y_test = load_data(
        'concrete', cols=feature_columns, target='strength', tts=True
    )

    viz = ResidualsPlot(RidgeCV(), ax=ax)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    return viz
def residuals_plot(self) -> None:
    """Save the residuals plot of the trained model.

    Residuals are the difference between the observed value of the target
    variable (y) and the predicted value (ŷ), i.e. the prediction error.
    The image is written into ``self.plots_dir`` and, unless ``LOCAL`` is
    set, uploaded to S3 as well.
    """
    viz = ResidualsPlot(self.trained_model)
    viz.fit(self.X_train, self.y_train)   # fit on the training data
    viz.score(self.X_test, self.y_test)   # evaluate on the test data

    out_file = f"{self.plots_dir}/residuals_plot_{self.model_id}.png"
    viz.show(outpath=out_file)

    if not LOCAL:
        upload_to_s3(
            out_file,
            f'plots/residuals_plot_{self.model_id}.png',
            bucket=S3_BUCKET_NAME,
        )
    plt.clf()
def lasso_regression(X_train, y_train, X_test, y_test, plot):
    """
    Perform a lasso regression with built-in CV and plot the feature importance.

    Returns a dict with the model name and its test/train metrics.
    """
    # Fit the lasso regression
    reg = LassoCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_train, y_train))

    coef = pd.Series(reg.coef_, index=X_train.columns)
    kept = str(sum(coef != 0))
    dropped = str(sum(coef == 0))
    print(
        "Lasso picked "
        + kept
        + " variables and eliminated the other "
        + dropped
        + " variables"
    )

    # Feature importance, smallest coefficient first
    imp_coef = coef.sort_values()

    # NOTE(review): the source formatting does not show where the ``if plot``
    # body ends; all three plots are treated as gated by the flag -- confirm.
    if plot:
        # Feature importance
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()

        # Prediction error
        error_viz = PredictionError(reg, size=(1080, 720))
        error_viz.fit(X_train, y_train)   # fit on the training data
        error_viz.score(X_test, y_test)   # evaluate on the test data
        error_viz.show()                  # finalize and render the figure

        # Residuals
        residual_viz = ResidualsPlot(reg, size=(1080, 720))
        residual_viz.fit(X_train, y_train)
        residual_viz.score(X_test, y_test)
        residual_viz.show()

    # Use the test data to calculate the scores
    y_pred = reg.predict(X_test)

    metrics = {
        "name": "Lasso Regression",
        "R squared": reg.score(X_test, y_test),
        "RMSE": rmse(y_test, y_pred),
        "R squared training": reg.score(X_train, y_train),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    return metrics
def visualize_pred_residuals(X_train, X_test, y_train, y_test):
    """Fit a Ridge model (alpha=0.05), show its residuals plot and print CV scores.

    Also prints the third element of ``stats.linregress`` between the test
    predictions and ``y_test``.
    """
    ridge = linear_model.Ridge(alpha=0.05)
    fitted_ridge = ridge.fit(X_train, y_train)
    plot = ResidualsPlot(fitted_ridge, size=(1080, 720))

    predictions = fitted_ridge.predict(X_test)
    regression_result = stats.linregress(predictions, y_test)
    print(regression_result[2])

    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)
    plot.poof()

    # 10-fold cross-validation on the test data.
    cv_results = model_selection.cross_validate(
        ridge, X_test, y_test, cv=10, return_train_score=True
    )
    print('Training scores:', cv_results['train_score'], '\n')
    print('Testing scores:', cv_results['test_score'])
def my_residual_plot(X_train, y_train, X_test, y_test):
    """Show a wide residuals plot (no histogram) of a linear regression.

    A red zero line is drawn from 1000 up to the largest ``y_test`` value in
    steps of 500.
    """
    plt.figure(figsize=(20, 5))
    plt.grid(True)

    plot = ResidualsPlot(LinearRegression(), hist=False)
    plot.fit(X_train, y_train)   # fit on the training data
    plot.score(X_test, y_test)   # evaluate on the test data

    # x positions for the horizontal zero line
    upper_bound = max(y_test.values) + 1
    ticks = np.arange(1000, upper_bound, 500)

    plt.title("Wykres rezyduów", fontsize=25)
    plt.xlabel("Ceny mieszkań", fontsize=15)
    plt.ylabel("Rezydua", fontsize=15)
    plt.plot(ticks, np.zeros(len(ticks)), "r")
    plt.legend()
    plt.show()
def generate_ordinal_diagnostics(x, y, current_best_model, label_type, diagnostic_image_path):
    """Cross-validate ``current_best_model[0]``, save diagnostic plots, return metrics.

    Out-of-fold predictions from a shuffled 10-fold split feed the returned
    metrics. Residuals, prediction-error and 2-D/3-D PCA plots are written
    into ``diagnostic_image_path``.

    Returns:
        dict with the keys ``mse``, ``r2``, ``mae`` (computed with
        ``median_absolute_error``), ``evs`` and ``rmse``.
    """
    x = np.array(x)
    y = np.array(y)
    estimator = current_best_model[0]

    # Collect (actual, predicted) pairs across the folds.
    guesses = []
    folds = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in folds.split(x):
        fold_model = estimator.fit(x[train_index], y[train_index])
        actual = y[test_index].tolist()
        predicted = fold_model.predict(x[test_index]).tolist()
        guesses.extend(zip(actual, predicted))

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # NOTE(review): the source formatting does not show where this guard ends;
    # both model-based visualizers are treated as guarded -- confirm.
    if "VotingClassifier" not in str(estimator.__class__):
        for visualizer_cls, file_name in (
            (ResidualsPlot, "/residuals_plot.png"),
            (PredictionError, "/prediction_error.png"),
        ):
            visualizer = visualizer_cls(estimator)
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.poof(outpath=diagnostic_image_path + file_name)
            plt.clf()

    # PCA projections of the full data set.
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    print(diagnostic_image_path + "/pca_2.png")
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    # Row 0 holds the actual values, row 1 the predictions.
    paired = np.array(guesses).transpose()
    return {
        "mse": mean_squared_error(*paired),
        "r2": r2_score(*paired),
        "mae": median_absolute_error(*paired),
        "evs": explained_variance_score(*paired),
        "rmse": np.sqrt(mean_squared_error(*paired)),
    }
def showResiduals():
    """Show a Ridge residuals plot for the concrete dataset.

    Loads the data through ``load_data('concrete')``, splits it 80/20, fits
    the visualizer on the training split and scores it on the test split.
    """
    # Load the data
    df = load_data('concrete')
    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame. ``.values`` replaces
    # ``.as_matrix()``, which was removed in pandas 1.0.
    X = df[feature_names].values
    y = df[target_name].values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the linear model and visualizer
    ridge = Ridge()
    visualizer = ResidualsPlot(ridge)

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.poof()                 # Draw/show/poof the data
def slr(self, iv, dv, plot_relationship=False, plot_residuals=True):
    """Fit a simple linear regression of ``dv`` on ``iv`` and record its RMSE.

    The fitted model is stored on ``self.slr_model`` and the rounded RMSE is
    written to ``self.df_rmse.loc["Linear"]``.

    Args:
        iv: Name of the independent-variable column in ``self.data``.
        dv: Name of the dependent-variable column in ``self.data``.
        plot_relationship: When true, draw the seaborn regression plot with
            the fitted line on top.
        plot_residuals: When true, show a yellowbrick residuals plot.
    """
    # Create simple linear regression model
    self.slr_model = LinearRegression(fit_intercept=True)

    y = self.data[dv]
    x = self.data[iv]
    # Build the 2-D column once from the underlying array. Indexing the Series
    # with ``[:, np.newaxis]`` is not supported by current pandas releases.
    x_column = x.values.reshape(-1, 1)

    self.slr_model.fit(x_column, y)

    xfit = np.linspace(-4, 4, 1000)
    yfit = self.slr_model.predict(xfit[:, np.newaxis])

    if plot_relationship:
        sns.lmplot(x=iv, y=dv, data=self.data, height=7, aspect=1.25)
        plt.plot(xfit, yfit)
        plt.ylabel(dv)
        plt.xlabel(iv)
        plt.title("{} = {} • {} + {}".format(dv, round(self.slr_model.coef_[0], 5), iv,
                                             round(self.slr_model.intercept_, 5)))
        plt.subplots_adjust(left=.095, right=.95, top=.9, bottom=.15)
        plt.xlim(-100, max(self.data["Counts"])*1.1)

    if plot_residuals:
        from yellowbrick.regressor import ResidualsPlot

        # Pass the estimator positionally so the call does not depend on the
        # name of the visualizer's first parameter.
        visualizer = ResidualsPlot(self.slr_model)
        visualizer.fit(x_column, y)  # Fit the training data to the model
        visualizer.poof()

    print("Simple Linear Regression\n{} = {} • {} + {}".format(dv, round(self.slr_model.coef_[0], 5), iv,
                                                               round(self.slr_model.intercept_, 5)))

    # Compute the RMSE of the fit on the same data
    y_predict = self.slr_model.predict(x_column)
    rmse = sqrt(((y - y_predict) ** 2).values.mean())

    self.df_rmse.loc["Linear"] = round(rmse, 5)
    print("\n", self.df_rmse)
def regression(fname="regression.png"):
    """
    Create the two-panel figure for regression models and save it under FIGURES.
    """
    _, (error_ax, residual_ax) = plt.subplots(ncols=2, figsize=(18, 6))
    alphas = np.logspace(-10, 1, 300)
    data = load_concrete(split=True)

    # Left panel: prediction error of a LassoCV model
    error_viz = PredictionError(LassoCV(alphas=alphas), ax=error_ax)
    error_viz.fit(data.X.train, data.y.train)
    error_viz.score(data.X.test, data.y.test)
    error_viz.finalize()

    # Right panel: residuals of a RidgeCV model
    residual_viz = ResidualsPlot(RidgeCV(alphas=alphas), ax=residual_ax)
    residual_viz.fit(data.X.train, data.y.train)
    residual_viz.score(data.X.test, data.y.test)
    residual_viz.finalize()

    # Save figure
    target = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(target)
def main():
    """Simple linear regression on 'plano-saude.csv' followed by a residuals plot."""
    frame = pd.read_csv('plano-saude.csv')

    # .values turns each column into a NumPy array
    features = frame.iloc[:, 0].values
    target = frame.iloc[:, 1].values

    corr_coef = np.corrcoef(features, target)

    # scikit-learn estimators expect the features as a 2-D matrix
    features = features.reshape(-1, 1)

    # Train the model
    regression = LinearRegression()
    regression.fit(features, target)

    intercept = regression.intercept_  # b0
    slope = regression.coef_           # b1

    plt.scatter(features, target)
    plt.plot(features, regression.predict(features), color='red')
    plt.title('Regressão linear simples')
    plt.xlabel('Idade')
    plt.ylabel('Custo')

    # Predict for a single value, via the model and via the formula
    value = np.asarray([40]).reshape(-1, 1)
    prevision1 = regression.predict(value)

    # y = b0 + b1 * x1
    prevision2 = intercept + slope * value

    # Score of the regression on the data it was trained on
    score = regression.score(features, target)

    # Plot the residuals for a better view of the data.
    # The "Train R²" it reports is the same thing as regression.score.
    visualizer = ResidualsPlot(regression)
    visualizer.fit(features, target)
    visualizer.poof()
# Predict on the test features with the ridge model fitted earlier.
pred = ridgeReg.predict(X_test)

# Mean squared error of those predictions.
mse = np.mean((pred - y_test) ** 2)
mse  # bare expression: has no effect when run as a plain script

# Score of the ridge model on the test data (result is not stored).
ridgeReg.score(X_test, y_test)

from yellowbrick.regressor import ResidualsPlot

# Residuals plot for a fresh Ridge model.
ridge = Ridge()
visualizer = ResidualsPlot(ridge)

visualizer.fit(X_train, y_train)  # fit the training data to the model
visualizer.score(X_test, y_test)  # evaluate the model on the test data
visualizer.poof()

# Apply other algorithms to the same X_train, X_test, y_train, y_test.
# Fit K-NN to the training set.
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Predict the test set results.
pred_y = classifier.predict(X_test)