# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:

np.linalg.cond(results.model.exog)

# ## Heteroskedasticity tests
#
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)

# Goldfeld-Quandt test

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)

# ## Linearity
#
# Harvey-Collier multiplier test for the null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
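# The diagnostics above assume a fitted OLS results object called `results`,
# plus the `sms` and `lzip` imports. A minimal self-contained sketch of that
# setup (the synthetic data here is an assumption, not the original dataset):

import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(200, 2)))        # intercept + two regressors
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=200)
results = sm.OLS(y, X).fit()

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
lzip(name, sms.het_breuschpagan(results.resid, results.model.exog))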
# This block creates the objects for each statistic we'd like printed to Excel.<br>
# <br>
# It will report $R^2$, $R^2_{adj}$, residuals, the f p-value, AIC, the fitted model parameters, normality of the residuals, the Breusch-Pagan test for heteroscedasticity, and the Harvey-Collier test for linearity.

# <codecell>

r_squared = model.rsquared
r_square_adj = model.rsquared_adj
residuals = model.resid
p = model.f_pvalue
aic = model.aic
pvalues = pd.DataFrame(model.pvalues)
params = pd.DataFrame(model.params)
normality = sms.jarque_bera(model.resid)
breusch_pagan_hska = sms.het_breuschpagan(model.resid, model.model.exog)
harvey_collier = sms.linear_harvey_collier(model)

# <headingcell level=4>

# Print the regression results to Excel

# <codecell>

Range("Results", "O6").value = "R^2"
Range("Results", "P6").value = r_squared
Range("Results", "O7").value = "R^2 Adjusted"
Range("Results", "P7").value = r_square_adj
Range("Results", "O8").value = "p-value"
Range("Results", "P8").value = p
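# The two-argument `Range("Results", "O6")` form is the legacy xlwings
# interface. A sketch of the same writes with the current xlwings API; the
# workbook file name is a hypothetical placeholder:

import xlwings as xw

wb = xw.Book('regression.xlsx')      # hypothetical workbook
sht = wb.sheets['Results']
sht.range('O6').value = 'R^2'
sht.range('P6').value = r_squared
sht.range('O7').value = 'R^2 Adjusted'
sht.range('P7').value = r_square_adj
sht.range('O8').value = 'p-value'
sht.range('P8').value = p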
def check_rmse(df, response, list_of_list):
    for item in list_of_list:
        cols = item
        print('\nUsing scikit:\n')
        # instantiate model
        lm = LinearRegression()
        # fit model
        lm.fit(X_train[cols], y_train)
        # make predictions
        y_pred = lm.predict(X_test[cols])
        print('MAE:', mean_absolute_error(y_test, y_pred))
        print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print('R-Squared:', r2_score(y_test, y_pred))
        print('Score', lm.score(X_test[cols], y_test))

        # normality check
        residual = y_test - y_pred
        sm.qqplot(residual, stats.distributions.norm, line='r')
        #plt.show()

        print('\nUsing statsmodels:\n')
        formula = "{0} ~ {1}".format(response, '+'.join(cols))
        print(formula)
        model = smf.ols(formula, data=df).fit()
        y_pred = model.predict(X_test[cols])
        print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print(model.params)
        print(model.summary())

        print('\nCHECKING RESIDUALS\n')
        residual = y_test - y_pred
        fig, ax = plt.subplots(1, 2)  # with a single row, ax is a 1-D array of axes
        ax[0].hist(model.resid_pearson)
        ax[0].set_ylabel('Count')
        ax[0].set_xlabel('Normalized residuals')
        ax[1].scatter(residual, y_pred)

        # normality check
        res = model.resid
        sm.qqplot(res, stats.distributions.norm, line='r')
        #plt.show()

        # linearity check
        ## The Harvey-Collier test performs a t-test (with parameter degrees of freedom)
        ## on the recursive residuals. If the true relationship is not linear but convex
        ## or concave, the mean of the recursive residuals should differ from 0 significantly.
        import statsmodels.stats.api as sms
        try:
            harvey_collier = sms.linear_harvey_collier(model)
            print(harvey_collier)
        except np.linalg.LinAlgError as e:
            print(e)

        ## equal variation check: a small p-value indicates a violation of homoscedasticity
        ## (note: het_breuschpagan expects the exog to include a constant term)
        from statsmodels.stats.diagnostic import het_breuschpagan
        _, pval, __, f_pval = het_breuschpagan(residual, X_test[cols])
        print('Pval', pval)
        print('f_pval', f_pval)

        ## correlation analysis
        # Compute matrix of correlation coefficients
        corr_matrix = np.corrcoef(df[cols].T)
        print(pd.DataFrame(corr_matrix))

        ### https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
        # Display heat map
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.imshow(corr_matrix)
        ax.set_title('Heatmap of correlation matrix')
        # We want to show all ticks...
        ax.set_xticks(np.arange(len(cols)))
        ax.set_yticks(np.arange(len(cols)))
        # ... and label them with the respective list entries
        ax.set_xticklabels(cols)
        ax.set_yticklabels(cols)
        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        # Loop over data dimensions and create text annotations.
        for i in range(len(cols)):
            for j in range(len(cols)):
                text = ax.text(j, i, round(corr_matrix[i, j], 2),
                               ha="center", va="center", color="black")
        plt.show()

        print("\nCross validation using scikit:\n")
        # check cross validation predictions, 10 splits
        cv_predictions = cross_val_predict(lm, df[cols], df[response], cv=10)
        # check errors
        print('MAE:', mean_absolute_error(df[response], cv_predictions))
        print('RMSE:', np.sqrt(mean_squared_error(df[response], cv_predictions)))
        print('R-Squared:', r2_score(df[response], cv_predictions))
        #print('Score', accuracy_score(df[response], cv_predictions))

        # plot actual vs predicted
        fig3 = plt.figure(figsize=(6, 6))
        plt.scatter(x=cv_predictions, y=df[response])
        plt.xlabel('Predictions')
        plt.ylabel('Appliances')

        df['predictions'] = cv_predictions

        # plotting fit results
        fig4, ax = plt.subplots()
        df.Appliances.plot(ax=ax, style='b-')
        # same ax as above since it's automatically added on the right
        df.predictions.plot(ax=ax, style='r-')
        ax.set_xlabel('Predictions')
        ax.set_ylabel('Appliances')
        #print(df.head())
        #df.plot(y=cv_predictions, color='red', linewidth=1)
        #fig4.tight_layout()
        #plt.show()

        print('\nCHECKING RESIDUALS\n')
        residual = df[response] - y_pred
        fig4 = plt.figure(figsize=(6, 6))
        plt.scatter(df.predictions, residual)
        plt.hlines(y=0, xmin=0, xmax=250)
        plt.title('Residual Plot')
        plt.ylabel('Residuals')
        plt.show()
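# `check_rmse` leans on module-level globals (X_train, X_test, y_train, y_test)
# and several imports. A hypothetical setup sketch, assuming the Appliances
# energy dataset that the plots above reference (file name and column subsets
# are placeholders):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, train_test_split

df = pd.read_csv('energydata.csv')   # hypothetical file name
response = 'Appliances'
features = [c for c in df.columns if c != response]
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[response], random_state=42)

check_rmse(df, response, [['T1', 'RH_1'], ['T2', 'RH_2']])  # hypothetical column subsets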
#==================================Breusch-Pagan test:
name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
lzip(name3, test3)

#======================Goldfeld-Quandt test:
name5 = ['F statistic', 'p-value']
test5 = sms.het_goldfeldquandt(lr.resid, lr.model.exog)
lzip(name5, test5)

#================================================
#================================================
#==================================Linearity test
#======================Harvey-Collier:
name6 = ['t value', 'p value']
test6 = sms.linear_harvey_collier(lr)
lzip(name6, test6)

#======================Rainbow:
import statsmodels.stats.diagnostic as ssd

name7 = ['F statistic', 'p value']
test7 = ssd.linear_rainbow(lr)
lzip(name7, test7)

#================================================
#================================================
#====Serial correlation (or) Autocorrelation test
#======================Durbin-Watson:
# Durbin-Watson test for no autocorrelation of residuals
# (also printed with summary())
from statsmodels.stats.stattools import durbin_watson

print("Durbin-Watson: ", durbin_watson(lr.resid))
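# The Durbin-Watson statistic ranges from 0 to 4: values near 2 suggest no
# first-order autocorrelation, values toward 0 positive autocorrelation, and
# values toward 4 negative autocorrelation. A quick self-contained check on
# synthetic AR(1) residuals (illustrative data, not the `lr` model above):

import numpy as np
from statsmodels.stats.stattools import durbin_watson

rng = np.random.default_rng(1)
e = rng.normal(size=500)
ar1 = np.empty(500)
ar1[0] = e[0]
for t in range(1, 500):
    ar1[t] = 0.8 * ar1[t - 1] + e[t]   # strongly autocorrelated series

print(durbin_watson(e))    # near 2: no autocorrelation
print(durbin_watson(ar1))  # well below 2: positive autocorrelation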
import statsmodels.stats.api as sms
from statsmodels.compat import lzip


def harveyCollier(results):
    name = ['t value', 'p value']
    test = sms.linear_harvey_collier(results)
    return lzip(name, test)
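# A usage sketch for `harveyCollier`, assuming a fitted OLS results object
# (the synthetic data below is an assumption):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
X = sm.add_constant(rng.normal(size=(100, 1)))
y = X @ np.array([0.5, 1.5]) + rng.normal(size=100)

print(harveyCollier(sm.OLS(y, X).fit()))  # e.g. [('t value', ...), ('p value', ...)]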
def diagnostic_plots(self, linear_model):
    """
    :param linear_model: linear model fit on the data
    :return: (summary, json_result)

    This method validates the assumptions of a linear model.
    """
    diagnostic_result = {}
    summary = linear_model.summary()
    #diagnostic_result['summary'] = str(summary)

    # fitted values
    fitted_y = linear_model.fittedvalues
    # model residuals
    residuals = linear_model.resid
    # normalized residuals
    residuals_normalized = linear_model.get_influence().resid_studentized_internal
    # absolute square root of normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(residuals_normalized))
    # leverage, from statsmodels internals
    leverage = linear_model.get_influence().hat_matrix_diag
    # Cook's distance, from statsmodels internals
    cooks = linear_model.get_influence().cooks_distance[0]

    self.check_linearity_assumption(fitted_y, residuals)
    self.check_residual_normality(residuals_normalized)
    self.check_homoscedacticity(fitted_y, model_norm_residuals_abs_sqrt)
    self.check_influcence(leverage, cooks, residuals_normalized)

    # 1. Non-linearity test (Harvey-Collier is a t-test on the recursive residuals)
    try:
        name = ['t value', 'p value']
        test = sms.linear_harvey_collier(linear_model)
        linear_test_result = lzip(name, test)
    except Exception as e:
        linear_test_result = str(e)
    diagnostic_result['Non_Linearity_Test'] = linear_test_result

    # 2. Heteroskedasticity test
    name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    test = sms.het_breuschpagan(linear_model.resid, linear_model.model.exog)
    diagnostic_result['Heteroskedasticity_Test'] = lzip(name, test)

    # 3. Normality of residuals
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(linear_model.resid)
    diagnostic_result['Residual_Normality_Test'] = lzip(name, test)

    # 4. Multicollinearity test
    test = np.linalg.cond(linear_model.model.exog)
    diagnostic_result['Multicollinearity_Test'] = [('condition no', test)]

    # 5. Residual autocorrelation test
    # (Durbin-Watson returns a test statistic, not a p-value)
    test = sms.durbin_watson(linear_model.resid)
    diagnostic_result['Residual_AutoCorrelation_Test'] = [('statistic', test)]

    json_result = json.dumps(diagnostic_result)
    return summary, json_result
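# A usage sketch for `diagnostic_plots`, assuming the enclosing class (here
# called `ModelValidator`, a hypothetical name) defines the plotting helpers
# it calls and that `sms`, `lzip`, `np`, and `json` are imported:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
X = sm.add_constant(rng.normal(size=(150, 2)))
y = X @ np.array([1.0, 0.5, -0.5]) + rng.normal(size=150)

validator = ModelValidator()                       # hypothetical class
summary, report = validator.diagnostic_plots(sm.OLS(y, X).fit())
print(report)  # JSON string with one entry per diagnostic test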