def fitter(df_t, clade, protein):
    '''
    :param df_t: a clade or country DataFrame
    :param clade: clade label used in the output filename
    :param protein: protein label used to select the score column
    :return: the fitted model, clade and protein; prints the model summary
    '''
    global df
    df_t = df_t.select_dtypes(include=['float64', 'int64'])
    df_t[f'score for {protein}'] = df[f'score for {protein}']
    X = df_t.drop('death_growth_rate', axis=1)
    y = df_t['death_growth_rate']
    lm = sm.OLS(y, X)
    model = lm.fit()
    print(model.summary())
    print('R2: ', model.rsquared)
    print('P-value: ', model.f_pvalue)
    # apply the linearity test and save the figure
    fig, ax = plt.subplots(1, 1)
    sns.residplot(x=model.predict(), y=y, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red'},
                  ax=ax)
    ax.title.set_text('Residuals vs Fitted')
    ax.set(xlabel='Fitted', ylabel='Residuals')
    fig.savefig(f'linearity_test_{clade}_{protein}.png')
    plt.show()
    plt.close(fig)
    return model, clade, protein
def model_diagnosis(model, x, x_name, y):
    """
    Plots a summary of the model's diagnostics: a QQ plot, a residuals
    histogram, and a residual scatter plot for checking homoscedasticity.

    model:  the fitted model (a statsmodels results object)
    x:      the feature values
    x_name: the feature's name (string)
    y:      the target values
    """
    residuals = model.resid
    fig, axes = plt.subplots(nrows=1, ncols=3, sharex=False, sharey=False,
                             figsize=(65, 20), squeeze=False)
    fig.text(s="Summary of Model Diagnosis", x=.40, y=1.15, fontsize=40)
    fig.text(s="r-Squared: {r}".format(r=round(model.rsquared, 2)), x=.40, y=1.05, fontsize=30)
    fig.text(s="p-value: {p}".format(p=model.f_pvalue), x=.53, y=1.05, fontsize=30)
    fig.text(s='QQ Plot', x=.20, y=.93, fontsize=45)
    sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True, ax=axes[0][0])
    fig.text(s='Residuals Histogram', x=.45, y=.93, fontsize=55)
    axes[0][1].hist(residuals)
    fig.text(s='Residuals vs {x}'.format(x=x_name), x=.73, y=.93, fontsize=40)
    sns.residplot(x=x, y=y, ax=axes[0][2])
    plt.tight_layout()
    plt.show()
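# A minimal usage sketch for model_diagnosis (not from the original source):
# the synthetic data and the 'feature' column name are illustrative
# assumptions; plt and sns are assumed imported at module level as elsewhere
# in this collection.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats  # model_diagnosis uses stats.norm for the QQ plot

rng = np.random.default_rng(0)
x = pd.Series(rng.normal(size=200), name='feature')
y = 3.0 * x + rng.normal(scale=0.5, size=200)
ols_results = sm.OLS(y, sm.add_constant(x)).fit()
model_diagnosis(ols_results, x, 'feature', y)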
def sk_rmse(outcome, features, train_set, test_set):
    """
    Takes in outcome, features and datasets and returns RMSE and R^2 using sklearn.

    outcome:   string title of the outcome column
    features:  list of strings of feature columns
    train_set: pandas DataFrame whose columns include outcome and features, to train the model on
    test_set:  pandas DataFrame whose columns include outcome and features, to test the model on
    """
    lr = LinearRegression()
    X_train = train_set[features]
    y_train = train_set[outcome]
    lr.fit(X_train, y_train)
    X_test = test_set[features]
    y_test = test_set[outcome]
    y_train_pred = lr.predict(X_train)
    y_test_pred = lr.predict(X_test)
    # np.sqrt(mse) avoids the `squared=False` flag, which newer scikit-learn removed
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    # plot residuals against the generic outcome column, not a hard-coded 'price'
    sns.residplot(x=y_train_pred, y=y_train, lowess=True, color="g")
    return (rmse_train, rmse_test, float(lr.score(X_train, y_train)))
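# Hedged usage sketch for sk_rmse (not from the original): synthetic data with
# illustrative column names, assuming numpy/pandas are available as np/pd and
# LinearRegression/mean_squared_error are imported as the function expects.
rng = np.random.default_rng(1)
demo = pd.DataFrame({'sqft': rng.uniform(500, 3000, 300),
                     'beds': rng.integers(1, 5, 300).astype(float)})
demo['price'] = 100 * demo['sqft'] + 5000 * demo['beds'] + rng.normal(0, 1e4, 300)
train, test = demo.iloc[:240], demo.iloc[240:]
rmse_train, rmse_test, r2 = sk_rmse('price', ['sqft', 'beds'], train, test)
print(rmse_train, rmse_test, r2)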
def linear_model(era):
    data = scaled_df[scaled_df['Era'] == era]
    x = data.drop(['Season', 'Team', 'wPCT', 'Era', 'wPCT > 0.500'], axis=1)
    x = sm.add_constant(x)
    y = data['wPCT']
    lm = sm.OLS(y, x).fit()
    print('------- Linear Regression Result ({}) -------'.format(era))
    print(lm.summary())
    # residual plot
    fitted_y = lm.fittedvalues
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.residplot(x=fitted_y, y='wPCT', data=data, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                  ax=ax)
    ax.set_title('Residuals vs Fitted Values ({})'.format(era))
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    plt.show()
def scatter(filename, query=[]):
    data = pd.read_csv(filename)
    X, Y = data.columns
    x, y = data[X], data[Y]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    pearson = stats.pearsonr(x, y)
    seaborn.regplot(x=x, y=y,
                    line_kws={'label': "$y = {0:.2f}x + {1:.2f}$".format(slope, intercept)})
    plt.legend()
    plt.show()
    seaborn.residplot(x=x, y=y)
    plt.ylabel(Y + " (Residuals)")
    plt.show()
    print(filename)
    print(len(x), sum(x), sum(y), sum(x * x), sum(y * y), sum(x * y))
    print(slope, intercept)
    print(pearson)
    for q in query:
        print(q, slope * q + intercept)
    print()
def regression_model(column_x, column_y):
    # This function uses built-in library functions to create a scatter plot,
    # plots of the residuals, compute R-squared, and display the regression equation.
    # Fit the regression line using the "statsmodels" library:
    X = statsmodels.add_constant(column_x)
    Y = column_y
    regressionmodel = statsmodels.OLS(Y, X).fit()  # OLS = "ordinary least squares"
    # Extract regression parameters from the model, rounded to 3 decimal places:
    Rsquared = round(regressionmodel.rsquared, 3)
    slope = round(regressionmodel.params[1], 3)
    intercept = round(regressionmodel.params[0], 3)
    # Make plots:
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, figsize=(12, 4))
    sns.regplot(x=column_x, y=column_y, data=df, marker="+", ax=ax1)  # scatter plot
    sns.residplot(x=column_x, y=column_y, data=df, ax=ax2)  # residual plot
    ax2.set(ylabel='Residuals')
    ax2.set_ylim(min(regressionmodel.resid) - 1, max(regressionmodel.resid) + 1)
    plt.figure()  # residuals histogram (sns.distplot is deprecated, use histplot)
    sns.histplot(regressionmodel.resid, kde=False, color='red').set(xlabel='Residuals')
    return slope, intercept
def residPlots(self, exclude=[], include=[]):
    '''
    Using only quantitative data, for each variable in the include arg (or
    each variable not in the exclude arg) plot the residual values with
    respect to the independent variable. You do not want to see patterns in
    these plots.
    '''
    # y^ - y vs x
    if exclude and include:
        raise ValueError("You can't use both include and exclude.")
    elif exclude and not include:
        variables = [var for var in self.quant_vars if var not in exclude]
    elif not exclude and include:
        variables = [var for var in self.quant_vars if var in include]
    else:
        variables = self.quant_vars
    df = self.data
    for var in variables:
        plt.figure(self.figureindex)
        sns.residplot(x=var, y=self.dep_var, data=df)
        plt.title('Residual plot ' + var)
        plt.show()
        self.figureindex += 1
def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    feature = X.columns
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)
    sns.regplot(x=X[feature[4]], y=y, ax=ax1)
    sns.boxplot(x=X[feature[4]], y=y, color="Blues_r", ax=ax2)
    model.fit(X_train, y_train)
    sns.residplot(x=X[feature[4]], y=(model.predict(X) - y) ** 2,
                  color="indianred", lowess=True, ax=ax3)
    # NOTE: sns.interactplot, sns.corrplot and sns.coefplot were removed in
    # seaborn 0.9; the calls below only run on older seaborn versions.
    if model_name == 'linear':  # use ==, not `is`, for string comparison
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4, filled=True,
                         scatter_kws={"color": "dimgray"}, contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"], as_cmap=True)
        levels = np.linspace(0, 1, 11)
        sns.interactplot(X[feature[3]], X[feature[4]], y, levels=levels, cmap=pal, logistic=True)
    ax1.set_title('Regression')
    ax2.set_title(feature[4] + ' Value')
    ax3.set_title(feature[4] + ' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name + '_' + feature[4], bbox_inches='tight')
    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"],
                             as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name + '_multi-variable_correlation', bbox_inches='tight')
    # Coefficient plot -- only meaningful for linear regression
    sns.coefplot("diagnosis ~ " + ' + '.join(features), test_data, intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name + '_coefficient_effects', bbox_inches='tight')
def plot_residplot(do_values_df, order, save=None):
    """
    Plot a residual plot, which can help in determining if there is structure
    to the residuals.

    :type do_values_df: pandas DataFrame
    :param do_values_df: DataFrame containing dissimilarity and overlap values
        with lowess predictions.
    :type order: int
    :param order: Order of the polynomial to fit when calculating the residuals.
    :type save: str
    :param save: Path to save the plot to; if None, the plot is shown instead.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    sns.residplot(x="Overlap", y="Dissimilarity", data=do_values_df,
                  lowess=True, order=order, ax=ax)
    ax.set_title("Residual Plot", weight="bold", fontsize=15)
    if save is not None:
        fig.savefig(save, facecolor="white", edgecolor="none", format="svg",
                    bbox_inches="tight", pad_inches=0.1)
    else:
        plt.show()
def residuals(df, target, cols=3, figsize=(10, 15), hspace=1, wspace=1):
    '''
    Create residual plots for all numeric features. Useful for spotting
    heteroscedasticity.

    Input:
        df: pandas DataFrame object
        target: string name of the target feature in df
        cols: number of graphs to display per row
        figsize: tuple of floats giving the width and height of the figure
        hspace: the amount of height reserved for space between subplots
        wspace: the amount of width reserved for space between subplots
    Output:
        Displays n graphs, where n is the number of numeric features in df
    '''
    from math import ceil  # needed to size the subplot grid
    X = df.drop(target, axis=1).select_dtypes(include='number')
    y = df[target]
    # plot settings
    fig = plt.figure(figsize=figsize)
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=wspace, hspace=hspace)
    # size the grid on the numeric features, not on all columns of df
    rows = ceil(float(X.shape[1]) / cols)
    # plot graphs
    for i, column in enumerate(X.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        sns.residplot(x=X[column], y=y, line_kws=dict(color='r'), lowess=True, ax=ax)
        ax.set_title(column)
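# Hedged usage sketch for residuals() (not from the original): a small
# synthetic frame with two numeric features and an illustrative target name;
# np, pd, plt and sns are assumed imported as elsewhere in this collection.
rng = np.random.default_rng(2)
frame = pd.DataFrame({'x1': rng.normal(size=150), 'x2': rng.uniform(size=150)})
frame['target'] = 2 * frame['x1'] - frame['x2'] + rng.normal(0, 0.3, 150)
residuals(frame, 'target', cols=2, figsize=(8, 4))
plt.show()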
def _plot_residuals(model, y, ax):
    fitted = model.predict()
    residuals = y - fitted
    sns.residplot(x=fitted, y=residuals, lowess=True,
                  line_kws=line_kw, scatter_kws=scatter_kw, ax=ax)
    ax.set_xlabel('fitted values')
    ax.set_ylabel('residuals')
    ax.set_title('Residuals vs Fitted')
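# `_plot_residuals` relies on module-level `line_kw` and `scatter_kw` dicts
# that are not shown here; the values below are illustrative assumptions,
# matching the style used by the other residual plots in this collection.
line_kw = {'color': 'red', 'lw': 1, 'alpha': 0.8}  # assumed lowess line style
scatter_kw = {'alpha': 0.5}                        # assumed scatter style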
def diagnostics(fittedvalues, residuals, data, predictand):
    sns.residplot(x=fittedvalues, y=predictand, data=data, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    # QQ plot of residuals vs. a normal fit
    sm.qqplot(residuals, ss.norm, fit=True, line='45')
    # autocorrelation of residuals
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    sm.graphics.tsa.plot_acf(residuals, ax=ax)
    ax.set_xlim([0, 10])
    ax = fig.add_subplot(2, 1, 2)
    sm.graphics.tsa.plot_pacf(residuals, ax=ax)
    ax.set_xlim([0, 10])
    return None
def evaluate_model(model, X_test, y_test, plot_residuals=False, title=''):
    from sklearn.metrics import mean_squared_error, explained_variance_score
    y_pred = model.predict(X_test)
    if plot_residuals:
        _, ax = plt.subplots(figsize=(9, 9))
        ax.set_title('Residuals Plot - ' + title, fontsize=19)
        ax.set_xlabel('Predicted values', fontsize=15)
        ax.set_ylabel('Residuals', fontsize=15)
        sns.residplot(x=y_pred.squeeze(), y=y_test.squeeze(), lowess=True, ax=ax,
                      scatter_kws={'alpha': 0.3},
                      line_kws={'color': 'black', 'lw': 2, 'ls': '--'})
    metrics = {
        'explained_variance': explained_variance_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred)
    }
    return metrics
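# Hedged usage sketch for evaluate_model (not from the original): a plain
# sklearn LinearRegression on synthetic data; names are illustrative, and
# plt/sns are assumed imported at module level.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(3)
X_demo = rng.normal(size=(300, 2))
y_demo = X_demo @ np.array([1.5, -2.0]) + rng.normal(0, 0.4, 300)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
reg = LinearRegression().fit(X_tr, y_tr)
print(evaluate_model(reg, X_te, y_te, plot_residuals=True, title='demo'))
plt.show()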
def residual_analysis(df, x, y, model):
    preds = model(df[x])
    residuals = df[y] - preds
    stdResiduals = residuals / residual_standard_error(residuals)
    fig, axs = plt.subplots(2, 2, figsize=(8, 8))
    axs[0][0].scatter(df[x], df[y])
    axs[0][0].plot(df[x], preds)
    axs[0][0].set_xlabel(x)
    axs[0][0].set_ylabel(y)
    plt.suptitle("Raw plus fit / Response v Residuals\nExplan v Residuals / Residual hist")
    axs[0][1].scatter(df[y], stdResiduals)
    xRange = np.arange(df[y].min(), df[y].max(), df[y].max() / 10)
    axs[0][1].plot(xRange, [0] * len(xRange), linestyle="--", color="gray")
    # label the panel explicitly; bare plt.xlabel targets the wrong axes here
    axs[0][1].set_xlabel(y)
    axs[0][1].set_ylabel("residual")
    sns.residplot(x=df[x], y=df[y], lowess=True, ax=axs[1][0])
    axs[1][0].set_ylabel("residual")
    # sns.distplot is deprecated; histplot with kde=True is the modern equivalent
    sns.histplot(stdResiduals, kde=True, stat="density", ax=axs[1][1])
    axs[1][1].set_xlabel("residual")
    plt.show()
    fig, axs = plt.subplots(figsize=(5, 5))
    stats.probplot(stdResiduals, plot=plt)
    plt.show()
def residual_plot(self, ax=None):
    """
    Residual vs Fitted Plot

    Graphical tool to identify non-linearity. A (roughly) horizontal red
    line is an indicator that the residuals have no remaining pattern.
    """
    if ax is None:
        fig, ax = plt.subplots()
    sns.residplot(x=self.y_predict, y=self.residual, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                  ax=ax)
    # annotate the three largest absolute residuals at their own positions
    # (the original indexed by rank, which pointed at the wrong points)
    top_3_idx = np.argsort(np.abs(self.residual))[::-1][:3]
    for rank, i in enumerate(top_3_idx):
        ax.annotate(rank, xy=(self.y_predict[i], self.residual[i]), color='C3')
    ax.set_title('Residuals vs Fitted', fontweight="bold")
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')
    return ax
def resid_plot(self, var_x=None, var_y=None):
    """
    Create a residuals vs fitted plot.

    Input:
        - var_x: A list of predictor variable(s) (default=None)
        - var_y: A response variable (default=None)
    """
    dataframe = self.get_dataframe()
    # priority: arguments var_x, var_y
    if var_x is not None and var_y is not None:
        self.reg(var_x, var_y)
    else:
        var_x = self.get_predictors()
        var_y = self.get_response()
        if var_x is not None and var_y is not None:
            self.reg(var_x, var_y)
        else:
            raise ValueError('No predictors or response assigned')
    model = self.get_model()
    fitted = model.fittedvalues
    sns.residplot(x=fitted, y=var_y, data=dataframe, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
    plt.title('Residuals vs Fitted')
    plt.xlabel('Fitted values')
    plt.ylabel('Residuals')
def scale_transform_validate(df, target, residuals=False, selection=False):
    """
    Scale features and evaluate the model.

    params:
        df        - DataFrame with (numeric) features
        target    - Series with numeric values
        residuals - plot residuals if True (default False)
        selection - if True, return the rounded train/test RMSE instead of printing
    """
    X_train, X_test, y_train, y_test = train_test_split(df, target,
                                                        random_state=9, test_size=0.2)
    scaler = StandardScaler()
    # fit the scaler to the training data
    scaler.fit(X_train)
    # transform the training data
    scaled_data = scaler.transform(X_train)
    # create DF
    X_train_scaled = pd.DataFrame(data=scaled_data, columns=df.columns, index=X_train.index)
    # transform the test data
    scaled_test_data = scaler.transform(X_test)
    # create DataFrame
    X_test_scaled = pd.DataFrame(data=scaled_test_data, columns=df.columns, index=X_test.index)
    # Fit the model to the training data.
    lm = LinearRegression()
    lm = lm.fit(X_train_scaled, y_train)
    # Use the model to predict on the training set and the test set.
    y_train_pred = lm.predict(X_train_scaled)
    y_test_pred = lm.predict(X_test_scaled)
    train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    if residuals:
        sns.residplot(x=y_test, y=y_test_pred, lowess=True, color='g')
    if selection:
        return round(train_rmse), round(test_rmse)
    # Compare the model's performance on training data versus test data
    print('Training: RMSE', int(train_rmse), "vs. Testing: RMSE", int(test_rmse))
    print('Performance gap: {} %'.format(round(abs((test_rmse - train_rmse) / train_rmse) * 100)))
    return plt.show()
def checkReg(t, rMean):
    sns.residplot(x=t, y=rMean, lowess=False, color="g")
    # sns.regplot(x=t, y=rMean, lowess=False, color="g")
def plot_residuals(df, actual, predicted):
    '''
    Takes a DataFrame plus the column names of the actual and predicted
    values and produces a residual scatter plot with a zero baseline.
    '''
    sns.residplot(x=actual, y=predicted, data=df)
def compare_error_variance(df1, df2, predictor, target):
    """Compare the error variance of target vs feature before and after adjustment."""
    fig, (ax1, ax2) = plt.subplots(figsize=(22, 6), ncols=2, sharey=False, sharex=False)
    sns.residplot(x=df1[predictor], y=df1[target], ax=ax1).set_title('Before')
    sns.residplot(x=df2[predictor], y=df2[target], ax=ax2).set_title('After')
def sns_residplot_impl(self,
                       x_data,  # column name from df (str)
                       y_data,  # column name from df (str)
                       df,
                       scatter_color='green'):
    sns.residplot(x=x_data, y=y_data, data=df, color=scatter_color)
    plt.show()
def residual_predicted_df(predicted_df):
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set(style="whitegrid")
    # Plot the residuals after fitting a linear model
    sns.residplot(x=predicted_df.observed.values, y=predicted_df.predicted.values,
                  lowess=True, color="g")
    plt.show()
def plot_residual_fitted_values(self, y):
    sns.residplot(x=self.model.fittedvalues, y=y, lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    plt.title('Residuals vs Fitted', fontsize=20)
    plt.xlabel('Fitted values')
    plt.ylabel('Residuals')
    top_3_abs_resid = self.model_abs_resid.sort_values(ascending=False)[:3]
    for index in top_3_abs_resid.index:
        plt.annotate(index, xy=(self.model.fittedvalues[index], self.model.resid[index]))
def get_results(Y_test, predictions, model_name):
    mae = mean_absolute_error(Y_test, predictions)
    print("MAE: {}".format(mae))
    sns.set(style="whitegrid")
    fig, axs = plt.subplots(ncols=2, sharey=False, figsize=(15, 5))
    sns.residplot(x=predictions, y=Y_test, color="g",
                  ax=axs[0]).set_title("Residuals plot of " + model_name)
    sns.scatterplot(x=Y_test, y=predictions, ax=axs[1]).set_title("Model Error")
    axs[1].set(xlabel='True Values', ylabel='Predicted Values')
def residPlots(dict_stats, key):
    # `fig`, `app` and the *_text_size values are module-level globals here
    fig.clf()
    ax = fig.add_subplot()
    sns.residplot(x=dict_stats[key]['sub_charge'], y=dict_stats[key]['sub_slope'], ax=ax)
    ax.set_title("{0} Charge Residual".format(key), size=title_text_size)
    ax.tick_params(axis='y', labelsize=tick_text_size)
    ax.tick_params(axis='x', labelsize=tick_text_size)
    ax.set_xlabel('Charge', size=axis_text_size)
    ax.set_ylabel('Slope Residual', size=axis_text_size)
    app.queueFunction(app.refreshPlot, "p1")
def wykres_5(x, y, nazwa_wykres, nazwa_x, nazwa_y):
    f, ax = plt.subplots(figsize=(10, 8))
    ax.set_title(nazwa_wykres, fontsize=16)
    ax.set_ylabel(nazwa_y, fontsize=14)
    ax.set_xlabel(nazwa_x, fontsize=14)
    sns.residplot(data=selected_data, x=x, y=y, lowess=True)
    return f
def eda(team_df):
    # basic statistics
    team_stats = team_df[num_cols].describe().T
    team_skew_values = list()
    for num_col in team_stats.index:
        num_col_skew = stats.skew(team_df[num_col])
        team_skew_values.append(num_col_skew)
    team_stats['skew'] = team_skew_values
    # heatmap
    fig = plt.figure(figsize=(20, 10))
    sns.heatmap(team_df[num_cols].corr(), annot=True)
    # distribution plot
    fig = plt.figure(figsize=(30, 40))
    for i, num_col in enumerate(team_stats.index):
        fig.add_subplot(7, 2, 1 + i)
        sns.histplot(team_df[num_col], kde=True, stat="density")  # distplot is deprecated
        mean_value = team_df[num_col].mean()
        plt.axvline(mean_value, c='red')
        median_value = team_df[num_col].median()
        plt.axvline(median_value, c='green')
    # attendance box plot by year; sort the years so the order is deterministic
    fig = plt.figure(figsize=(20, 10))
    order = sorted(set(team_df['year'].values))
    sns.boxplot(x='year', y='attendance', data=team_df, order=order)
    # scatter plot against attendance
    fig = plt.figure(figsize=(20, 20))
    for i, num_col in enumerate(num_cols):
        fig.add_subplot(7, 2, 1 + i)
        plt.scatter(team_df[num_col], team_df['attendance'])
    # residual plot
    fig = plt.figure(figsize=(20, 20))
    for i, num_col in enumerate(num_cols):
        fig.add_subplot(7, 2, 1 + i)
        sns.residplot(x=num_col, y='attendance', data=team_df)
    # swarm plot
    fig = plt.figure(figsize=(15, 5))
    sns.swarmplot(x='month', y="attendance", hue='weekday', data=team_df)
    fig = plt.figure(figsize=(15, 5))
    sns.swarmplot(x='weekday', y="attendance", hue='month', data=team_df)
    # point plot
    fig = plt.figure(figsize=(10, 5))
    sns.pointplot(x="weekday", y="attendance", hue="month", data=team_df)
    fig = plt.figure(figsize=(10, 5))
    sns.pointplot(x="month", y="attendance", hue="weekday", data=team_df)
    return team_stats
def Q_6_12():
    X = pd.read_csv('Data/Q_6_12.csv')
    data = X
    X = X.drop(columns='Y')  # drop() is not in place, so reassign
    Y = data['Y']
    pca = PCA(n_components=2)
    newX = pca.fit_transform(X)
    print('Explained variance ratio:\n', pca.explained_variance_ratio_)
    print('Explanatory variables after PCA:\n', newX)
    X_train, X_test, Y_train, Y_test = train_test_split(newX, Y, train_size=.80)
    model = LinearRegression()
    model.fit(X_train, Y_train)
    a = model.intercept_  # intercept
    b = model.coef_       # regression coefficients
    print("Best fit line: intercept", a, ", coefficients:", b)
    # R^2 measures how much of the variation in y is described by the
    # regression line: R^2 = 1 - SSE / total variation, where
    # SSE = sum((y_actual - y_predicted)^2) and
    # total variation = sum((y_actual - y_mean)^2).
    # R^2 ranges from 0 to 1; higher means a more precise model
    # (1 = no error, 0 = the fit explains nothing).
    score = model.score(X_test, Y_test)
    print('R^2 score: ', score)
    # predict with the linear regression
    Y_pred = model.predict(X_test)
    print(Y_pred)
    # compare predictions with the source data
    plt.figure()
    plt.plot(range(len(Y_pred)), Y_pred, 'b', label="predict")
    plt.plot(range(len(Y_pred)), Y_test, 'r', label="test")
    plt.legend(loc="upper right")  # show the labels in the plot
    plt.xlabel("the number of Y")
    plt.ylabel('Y')
    plt.title('Prediction vs. source data')
    plt.savefig("result/线性统计/6.12/compare_linear.jpg")
    plt.show()
    # residuals of the predictions
    y_dif = []
    for i in range(len(Y_pred)):
        y_dif.append(Y_pred[i] - Y_test.values[i])
    tmp = {'x': range(len(y_dif)), 'y': y_dif}
    df = pd.DataFrame(tmp)
    sns.residplot(x="x", y="y", data=df)
    plt.title('Residual plot')  # set the title before saving so it appears in the file
    plt.savefig("result/线性统计/6.12/残差图.jpg")
    plt.show()
def model_evaluation():
    """
    Tells us how our model performs in the real world.

    Difference from in-sample evaluation:
    - In-sample evaluation tells us how well our model fits the data already
      given to train it
    - Problem: it does not tell us how well the trained model can predict new data
    - Solution: split the data into sets:
        - Training data: train on it with in-sample evaluation
        - Testing data: held out to estimate out-of-sample performance

    Example:
    - Train on 70% of the data
    - Test on 30% of the data

    The generalization error depends on the particular split used for training
    and testing. To overcome this issue, we use cross-validation:
    - The most common out-of-sample (testing) evaluation technique
    - More effective use of data (each observation is used for both training
      and testing)
    """
    from sklearn.model_selection import cross_val_score, cross_val_predict
    from sklearn.linear_model import LinearRegression

    df = util.create_df()
    x_data = df[['highway-mpg']]
    y_data = df['price']
    lr = LinearRegression()
    lr.fit(x_data, y_data)
    # cv specifies how many folds to use
    scores = cross_val_score(lr, x_data, y_data, cv=3)
    print(f'Mean: {scores.mean()}. Standard deviation: {scores.std()}')
    predicted_scores = cross_val_predict(lr, x_data, y_data, cv=3)
    # Visualize the model
    width, height = 12, 10
    plt.figure(figsize=(width, height))
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0,)
    plt.show()
    plt.clf()
    plt.figure(figsize=(width, height))
    sns.regplot(x="peak-rpm", y="price", data=df)
    plt.ylim(0,)
    plt.show()
    plt.clf()
    plt.figure(figsize=(width, height))
    sns.residplot(x=df['highway-mpg'], y=df['price'])
    plt.show()
def get_residual_plot(X_col, y, rows=1, cols=1, position=1, ax=False, title=False):
    ax = plt.subplot(rows, cols, position)
    sns.residplot(x=X_col, y=y, ax=ax)
    if title:
        ax.set_xlabel(title)
def fitAndPlot(x, y, title, degree=1, color='r'):
    # assumes pylab-style imports: polyfit, polyval and plot in scope
    if degree > 0:
        rm = polyfit(x, y, degree)
        rpyp = polyval(rm, x)
        plot(x, y, color + 'o', label=title)
        ttlAndEqn = title + ": y=" + str(rm)[1:6]
        plot(x, rpyp, color + '--', label=ttlAndEqn)
    else:
        sns.residplot(x=x, y=y)
import seaborn as sns

# In[10]:

# lm = linear model
g = sns.lmplot(x="Temp", y="Gas", hue="Insul", data=whiteside, height=8)
g.set_axis_labels("Temp", "Gas")

# In[11]:

# lm residuals
sns.residplot(x='Temp', y='Gas', data=whiteside[whiteside.Insul == 'Before'])

# In[12]:

# histogram of data in each dimension
sns.jointplot(x="Temp", y="Gas", data=whiteside[whiteside.Insul == 'After'],
              kind='reg', height=8, xlim=(-5, 15), color='b')
plt.title("After")

# In[13]:

# pandas has a built-in boxplot function for
# looking at each column in a dataframe
sns.set_style("ticks") sns.despine(offset=10, trim=True) fig = plt.figure(figsize=(13,12)) ax = fig.add_subplot(111) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() plt.ylim(-500000000, 700000000) plt.xlim(-50000000, 600000000) ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('%.0f')%(x*1e-8))) ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('%.0f')%(x*1e-8))) plt.xticks(fontsize=20) plt.yticks(fontsize=20) sns.residplot('yhat', 'foreign', data=df2, color = '#226b53', scatter_kws={"s": 80}, lowess = True) ax.set_xlabel('') plt.ylabel('Residuals', fontsize=20) fig.savefig('temp.png', transparent=True) # Movie genre bar graph sns.set(style = 'white') fig = plt.figure(figsize=(26,12)) ax = fig.add_subplot(111) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) ax.set_ylabel('')
# In[164]:

sns.lmplot(x='cholesterol', y='diagnosis', data=df)

# In[165]:

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
sns.regplot(x='cholesterol', y='diagnosis', data=df, ax=ax1)
sns.boxplot(x=df["cholesterol"], y=df["diagnosis"], ax=ax2).set_ylabel("")
sns.residplot(x=df["cholesterol"], y=(linear_reg.predict(features) - response) ** 2,
              color="indianred", order=1, lowess=True)
sns.residplot(x=df["cholesterol"], y=(linear_reg.predict(features) - response) ** 2,
              color="indianred", order=2, lowess=True)
ax1.set_title('Regression')
ax2.set_title('ax2 title')
f.tight_layout()
sns.residplot(x=X.cholesterol, y=(linear_reg.predict(X) - y) ** 2,
              color="indianred", lowess=True, ax=ax3)

# f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row')
# ax1.plot(x, y)
# ax1.set_title('Sharing x per column, y per row')
# ax2.scatter(x, y)
# ax3.scatter(x, 2 * y ** 2 - 1, color='r')
# ax4.plot(x, 2 * y ** 2 - 1, color='r')
# f, axarr = plt.subplots(2, 2)
# axarr[0, 0].plot(x, y)
""" Plotting model residuals ======================== """ import numpy as np import seaborn as sns sns.set(style="whitegrid") rs = np.random.RandomState(7) x = rs.normal(2, 1, 75) y = 2 + 1.5 * x + rs.normal(0, 2, 75) sns.residplot(x, y, lowess=True, color="navy")
# Compute a linear regression on a standardized (centered/scaled) explanatory variable
df = df_densites.dropna()
X = df.iloc[:, 0]
y = df['MOYENNE DEPASSEMENT']

# standardize X
mean = X.mean()
std = X.std()
Xcr = (X - mean) / std

dat = pd.concat([Xcr, y - y.mean()], axis=1)
dat.columns = ['DENSITE_SPECIALISTES', 'MOYENNE_DEPASSEMENT']

ax = sns.regplot(x='DENSITE_SPECIALISTES', y='MOYENNE_DEPASSEMENT', data=dat)
plt.show()

ax = sns.residplot(x=Xcr, y=y - y.mean())
plt.show()

#
# Linear regression with statsmodels
#
result = smf.ols('MOYENNE_DEPASSEMENT ~ DENSITE_SPECIALISTES', data=dat).fit()
print(result.params)
# sns.distplot(..., fit=norm) is deprecated; draw the histogram and overlay a
# fitted normal density instead (assumes numpy is available as np)
sns.histplot(result.resid, stat="density")
grid = np.linspace(result.resid.min(), result.resid.max(), 100)
plt.plot(grid, norm.pdf(grid, loc=result.resid.mean(), scale=result.resid.std()))
plt.title('Residuals vs. Normal distribution')
plt.show()

# correlation between the density of babies and of pediatricians
# Compute a linear regression on a standardized explanatory variable
'''
Plotting residuals of a regression

Often you don't just want to see the regression itself but also the
residuals, to get a better idea of how well the regression captured the
data. Seaborn provides sns.residplot() for that purpose, visualizing how far
data points diverge from the regression line.

In this exercise, you will visualize the residuals of a regression between
the 'hp' column (horse power) and the 'mpg' column (miles per gallon) of the
auto DataFrame used previously.

INSTRUCTIONS
100XP
Import matplotlib.pyplot and seaborn using the standard names plt and sns
respectively.
Generate a green residual plot of the regression between 'hp' (on the
x-axis) and 'mpg' (on the y-axis). You will need to specify the additional
data and color parameters.
Display the plot as usual using plt.show(). This has been done for you, so
hit 'Submit Answer' to view the plot.
'''
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Generate a green residual plot of the regression between 'hp' and 'mpg'
sns.residplot(x='hp', y='mpg', data=auto, color='green')

# Display the plot
plt.show()
plt.show()

# Calculate the predicted values and the differences from the observed values
training_set['predictions'] = np.dot(features_norm, thetas)
training_set['difference'] = training_set['predictions'] - training_set['p.Open']
training_set.head()
print(training_set)

# Plot the predicted against the observed values
p = sns.lmplot(x="predictions", y="open", data=training_set, height=7)
p.set_axis_labels("Predicted Open", "Actual Open")
plt.show()

# Plot the residuals
p = sns.residplot(x=training_set.predictions, y=training_set.open, lowess=True)
plt.show()

# Calculate the coefficient of determination (r^2)
y = np.array(training_set.open)
p = np.array(training_set.predictions)
xbar = np.mean(y)
r_squared = 1 - np.square(y - p).sum() / np.square(y - xbar).sum()
print(r_squared)

mse = ((y - p) ** 2).mean(axis=None)
print("Alpha: " + str(alpha))
print("Number of Iterations: " + str(iterations))
mpg = auto_csv[:, 0]
hp = auto_csv[:, 3]
auto = df1

tips_csv = pd.read_csv('tips.csv')
tips = tips_csv

import matplotlib.pyplot as plt
import seaborn as sns

sns.lmplot(x='total_bill', y='tip', data=tips, hue='sex', palette='Set1')
plt.show()

sns.residplot(x='total_bill', y='tip', data=tips, color='indianred')
plt.show()

sns.stripplot(x='day', y='tip', data=tips)
plt.ylabel('tip ($)')
plt.show()

sns.stripplot(x='day', y='tip', data=tips, jitter=True, size=4)
plt.ylabel('tip ($)')
plt.show()

sns.swarmplot(x='day', y='tip', data=tips)
plt.ylabel('tip ($)')
plt.show()

sns.swarmplot(x='day', y='tip', data=tips, hue='sex')