def f_stat(self):
    """The F-statistic of the regression

    Returns
    -------
    float
        F-statistic of the beta coefficients, computed with regressors.stats
    """
    return regressors_stats.f_stat(self.ols, self.X, self.y)
def __init__(self, X, y, saved_reg=None):
    """
    :param X: independent data
    :type X: np.array
    :param y: dependent data
    :type y: list
    """
    if saved_reg is None:
        self.reg = linear_model.LinearRegression()
        self.ols = self.reg.fit(X, y)
    else:
        self.reg = saved_reg.reg
        self.ols = saved_reg.ols

    self.y_intercept = self.reg.intercept_
    self.slope = self.reg.coef_
    params = np.append(self.y_intercept, self.slope)

    # Model predictions and goodness-of-fit metrics
    self.predictions = self.reg.predict(X)
    self.r_sq = r2_score(y, self.predictions)
    self.mse = mean_squared_error(y, self.predictions)

    # Per-coefficient standard errors, t-statistics, and p-values
    self.p_values, self.sd_b, self.ts_b = get_p_values(
        X, y, self.predictions, params)
    self.residuals = np.subtract(y, self.predictions)

    # Normal probability plot of the residuals, plus a fitted trend line
    self.norm_prob_plot = scipy_stats.probplot(
        self.residuals, dist='norm', fit=False, plot=None, rvalue=False)
    reg_prob = linear_model.LinearRegression()
    reg_prob.fit([[val] for val in self.norm_prob_plot[0]],
                 self.norm_prob_plot[1])
    self.y_intercept_prob = reg_prob.intercept_
    self.slope_prob = reg_prob.coef_
    self.x_trend_prob = [
        min(self.norm_prob_plot[0]),
        max(self.norm_prob_plot[0])
    ]
    self.y_trend_prob = np.add(
        np.multiply(self.x_trend_prob, self.slope_prob),
        self.y_intercept_prob)

    # Overall model significance: F-statistic, its degrees of freedom,
    # and the cumulative F-distribution value stored as f_p_value
    self.f_stat = regressors_stats.f_stat(self.ols, X, y)
    self.df_error = len(X[:, 0]) - len(X[0, :]) - 1
    self.df_model = len(X[0, :])
    self.f_p_value = scipy_stats.f.cdf(self.f_stat, self.df_model,
                                       self.df_error)
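# Illustrative sketch (not part of the original module): the constructor above
# delegates the overall F-test to regressors.stats.f_stat. The internals of
# that package are an assumption here; this sketch reproduces the same
# quantities with plain scikit-learn/scipy using the standard formula
# F = (R^2 / p) / ((1 - R^2) / (n - p - 1)), and shows how df_model and
# df_error enter the calculation. All *_demo names are hypothetical.
import numpy as np
import scipy.stats as scipy_stats
from sklearn import linear_model
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
X_demo = rng.random((50, 3))                        # 50 samples, 3 predictors
y_demo = X_demo @ np.array([1.5, -2.0, 0.7]) + rng.normal(scale=0.1, size=50)

reg_demo = linear_model.LinearRegression().fit(X_demo, y_demo)
r_sq_demo = r2_score(y_demo, reg_demo.predict(X_demo))

df_model_demo = X_demo.shape[1]                     # number of predictors (p)
df_error_demo = X_demo.shape[0] - df_model_demo - 1 # n - p - 1
f_stat_demo = (r_sq_demo / df_model_demo) / ((1.0 - r_sq_demo) / df_error_demo)

# Note: the constructor stores scipy_stats.f.cdf(...); the conventional
# p-value for "all slopes are zero" is the upper tail (survival function).
f_p_value_demo = scipy_stats.f.sf(f_stat_demo, df_model_demo, df_error_demo)
print(f_stat_demo, f_p_value_demo)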
def _modified_regressor_summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.

    Parameters
    ----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear model with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')

    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))

    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    coef_df['Estimate'] = np.concatenate(
        (np.round(np.array([clf.intercept_]), 6), np.round(clf.coef_, 6)))
    coef_df['Std. Error'] = np.round(stats.coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(stats.coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(stats.coef_pval(clf, X, y), 6)

    # Create data frame to summarize residuals
    resids = stats.residuals(clf, X, y, r_type='raw')
    resids_df = pd.DataFrame({
        'Min': pd.Series(np.round(resids.min(), 4)),
        '1Q': pd.Series(np.round(np.percentile(resids, q=25), 4)),
        'Median': pd.Series(np.round(np.median(resids), 4)),
        '3Q': pd.Series(np.round(np.percentile(resids, q=75), 4)),
        'Max': pd.Series(np.round(resids.max(), 4)),
    }, columns=['Min', '1Q', 'Median', '3Q', 'Max'])

    # Overall fit metrics alongside the residual and coefficient tables
    return resids_df, coef_df, {
        'R2': stats.metrics.r2_score(y, clf.predict(X)),
        'Adj R2': stats.adj_r2_score(clf, X, y),
        'F-statistic': stats.f_stat(clf, X, y)}
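# Usage sketch for _modified_regressor_summary. Assumptions: the enclosing
# module imports numpy as np, pandas as pd, and `from regressors import stats`
# (matching the names the function body references), and the regressors
# package is installed. The demo data, predictor labels ('dose', 'volume'),
# and *_demo names are purely illustrative.
import numpy as np
from sklearn import linear_model

X_demo = np.random.rand(40, 2)
y_demo = 3.0 * X_demo[:, 0] - 1.0 * X_demo[:, 1] + np.random.normal(scale=0.05, size=40)

clf_demo = linear_model.LinearRegression().fit(X_demo, y_demo)
resids_df, coef_df, fit_metrics = _modified_regressor_summary(
    clf_demo, X_demo, y_demo, xlabels=['dose', 'volume'])

print(resids_df)     # five-number summary of the raw residuals
print(coef_df)       # estimate, std. error, t value, and p value per coefficient
print(fit_metrics)   # {'R2': ..., 'Adj R2': ..., 'F-statistic': ...}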
def calculate_f_stat(self):
    """F-statistic of the fitted model, via stats.f_stat."""
    return stats.f_stat(self.model, self.params_df, self.result_nd)