import numpy as np
import pandas as pd
import statsmodels.api as sm


def rmse(data, yvar, xvars):
    """Regress yvar on xvars (plus an intercept) and return the residual RMSE."""
    # TODO: no dsf support yet.
    Y = data[yvar]
    X = data[xvars].copy()  # copy so the caller's frame is not mutated
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return pd.Series({'ivol': np.sqrt(result.mse_resid), 'n': result.nobs})
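A quick usage sketch on synthetic data (the frame and column names below are illustrative, not from the original source):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'x1': rng.normal(size=100), 'x2': rng.normal(size=100)})
demo['y'] = 1.5 * demo['x1'] - 0.5 * demo['x2'] + rng.normal(scale=0.1, size=100)
print(rmse(demo, 'y', ['x1', 'x2']))  # Series with residual RMSE ('ivol') and 'n'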
import statsmodels.api as sm


def get_beta(values1, values2):
    # http://statsmodels.sourceforge.net/stable/regression.html
    # Regress values1 on values2 with no intercept; the lone slope is beta.
    model = sm.OLS(values1, values2)
    results = model.fit()
    return results.params[0]


value1 = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
value2 = [1.75, 2.45, 3.81, 4.80, 7.00, 8.60]
print(get_beta(value1, value2))
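Note that params[0] is the slope only because no constant is included in the design. A minimal sketch of the intercept variant, reusing the lists above:

import statsmodels.api as sm

results = sm.OLS(value1, sm.add_constant(value2)).fit()
intercept, slope = results.params  # with a constant, params[0] is the intercept
print(intercept, slope)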
def fit(self, data_frame: DataFrame):
    self.data_frame = data_frame
    # Basis functions (T**i - T_ref**i) for i in (-1, 1, 2), one column each.
    self.source_matrix = np.vstack([
        data_frame.temperature**i - data_frame.reference_temperature**i
        for i in [-1, 1, 2]
    ]).T
    self.aux_fit = sm.OLS(data_frame.experiment, self.source_matrix).fit()
    self.fit_coefficients = self.aux_fit.params
    # Stored as fit_values so the attribute does not shadow this method.
    self.fit_values = np.dot(self.source_matrix, self.fit_coefficients)
    # Heat-capacity columns are d/dT of the basis: i * T**(i - 1).
    self.heat_capacity_matrix = np.vstack(
        [i * data_frame.temperature**(i - 1) for i in [-1, 1, 2]]).T
    self.fit_heat_capacity = np.dot(self.heat_capacity_matrix,
                                    self.fit_coefficients)
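A usage sketch, assuming the surrounding class is instantiated as fitter (a hypothetical name) and that sm refers to statsmodels.api; the coefficient values are arbitrary illustration numbers built from the same basis the model fits, so the fit should recover them:

import numpy as np
import pandas as pd

T = np.linspace(300.0, 900.0, 25)
T_ref = 298.15
frame = pd.DataFrame({
    'temperature': T,
    'reference_temperature': T_ref,
    # Synthetic measurements: a*(1/T - 1/T_ref) + b*(T - T_ref) + c*(T**2 - T_ref**2)
    'experiment': (-2.0e4 * (1.0 / T - 1.0 / T_ref)
                   + 30.0 * (T - T_ref)
                   + 0.005 * (T**2 - T_ref**2)),
})
fitter.fit(frame)
print(fitter.fit_coefficients)  # should recover ~(-2.0e4, 30.0, 0.005)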
mytime = MyPackage.MyClass_Time.MyClass_Time()                # time utilities class
myDA = MyPackage.MyClass_DataAnalysis.MyClass_DataAnalysis()  # data-analysis class
# ------------------------------------------------------------
Path = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\数据及源代码"
Path2 = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\习题解答"

# 1.
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

x = list(range(1952, 2016, 4))
y = (29.3, 28.8, 28.5, 28.4, 29.4, 27.6, 27.7, 27.7,
     27.8, 27.4, 27.8, 27.1, 27.3, 27.1, 27.0, 27.5)
plt.plot(x, y)
plt.show()

model = sm.OLS(y, sm.add_constant(x)).fit()
print(model.summary())

data = pd.DataFrame({"x": x, "y": y})
data
myDA.ols("y~x", data, True)

# 2.
EU = pd.read_csv(Path2 + '/Part2/005/EuStockMarkets.csv')
plt.plot(EU.DAX, EU.FTSE, '.')
plt.xlabel('DAX')
plt.ylabel('FTSE')
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm


def qqPlot(array):
    # Intercept-only fit, so the residuals are deviations from the mean.
    modelFit = sm.OLS(array, np.ones(len(array))).fit()
    residuals = modelFit.resid
    fig = sm.qqplot(residuals)
    plt.show()
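A usage sketch with Gaussian noise, where the Q-Q points should track the reference line:

import numpy as np

rng = np.random.default_rng(1)
qqPlot(rng.normal(size=200))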
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson


def durbinWatson(array):
    # Intercept-only fit; test the residuals for first-order autocorrelation.
    modelFit = sm.OLS(array, np.ones(len(array))).fit()
    residuals = modelFit.resid
    return durbin_watson(residuals, axis=0)
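A usage sketch on white noise; values near 2 indicate no first-order autocorrelation, below 2 positive, above 2 negative:

import numpy as np

rng = np.random.default_rng(2)
print(durbinWatson(rng.normal(size=200)))  # expect a value close to 2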
import pandas as pd
import statsmodels.api as ssm
from sklearn.linear_model import LinearRegression

df = pd.read_csv("data/world-happiness-report-2021.csv")
y_feature = ["Ladder score"] * 6
features = [
    "Logged GDP per capita", "Social support", "Healthy life expectancy",
    "Freedom to make life choices", "Generosity", "Perceptions of corruption"
]
dependent = df["Ladder score"]
independent = df[features]

model = LinearRegression()
model.fit(independent, dependent)
intercept = model.intercept_
coefficients = model.coef_
print("R2: ", model.score(independent, dependent))
print("Intercept: ", intercept)
print("coefficients: ", coefficients)

# Refit with statsmodels for the full regression table,
# passing the design matrix with the constant added.
x = ssm.add_constant(independent)
model = ssm.OLS(dependent, x).fit()
print(model.summary())
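As a cross-check between the two libraries: with the constant included, the statsmodels estimates should line up with sklearn's intercept_ and coef_ (up to numerical noise):

print(model.params)    # 'const' ~ intercept_, the remaining entries ~ coef_
print(model.rsquared)  # should agree with the sklearn R2 printed above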
import patsy
import statsmodels.api as sm

make_cats = ['BUICK', 'CADILLAC', 'CHEVROLET', 'CHRYSLER', 'DODGE', 'FORD',
             'GMC', 'HONDA', 'HUMMER', 'HYUNDAI', 'INFINITI', 'ISUZU', 'JEEP',
             'KIA', 'LEXUS', 'LINCOLN', 'MAZDA', 'MERCURY', 'MINI',
             'MITSUBISHI', 'NISSAN', 'OLDSMOBILE', 'PLYMOUTH', 'PONTIAC',
             'SATURN', 'SCION', 'SUBARU', 'SUZUKI', 'TOYOTA', 'TOYOTA SCION',
             'VOLKSWAGEN', 'VOLVO']
state_cats = ['AR', 'AZ', 'CA', 'CO', 'FL', 'GA', 'IA', 'ID', 'IL', 'IN',
              'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO', 'MS', 'NC', 'NE',
              'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC',
              'TN', 'TX', 'UT', 'VA', 'WA', 'WV']
auction_cats = ['ADESA', 'MANHEIM', 'OTHER']
trans_cats = ['AUTO']
color_cats = ['BEIGE', 'BLACK', 'BLUE', 'BROWN', 'GOLD', 'GREEN', 'GREY',
              'MAROON', 'NOT AVAIL', 'ORANGE', 'OTHER', 'PURPLE', 'RED',
              'SILVER', 'WHITE', 'YELLOW']
wheel_cats = ['Alloy', 'Covers', 'Special']
nat_cats = ['AMERICAN', 'OTHER', 'OTHER ASIAN', 'TOP LINE ASIAN']
size_cats = ['COMPACT', 'CROSSOVER', 'LARGE', 'LARGE SUV', 'LARGE TRUCK',
             'MEDIUM', 'MEDIUM SUV', 'SMALL SUV', 'SMALL TRUCK', 'SPECIALTY',
             'SPORTS', 'VAN']
year_cats = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
             '2009', '2010_x']
continuous_vars = ['VehicleAge', 'VehOdo', 'VehBCost', 'discount']
# Flatten into one list of column names (assumes training_clean carries one
# dummy column per category level listed above).
all_vars = (continuous_vars + make_cats + state_cats + auction_cats +
            trans_cats + color_cats + wheel_cats + nat_cats + size_cats +
            year_cats)

## OLS ##
# Build the patsy formula explicitly; Q() quotes names containing spaces.
f = 'IsBadBuy ~ ' + ' + '.join('Q("%s")' % v for v in all_vars)
y, X = patsy.dmatrices(f, training_clean, return_type='dataframe')
print(sm.OLS(y, X).fit().summary())

f = 'IsBadBuy ~ ' + ' + '.join(continuous_vars)
y, X = patsy.dmatrices(f, training, return_type='dataframe')
print(sm.OLS(y, X).fit().summary())

tempset = training[training.MMRAcquisitionAuctionAveragePrice.notnull()].copy()
# define new variables
tempset['discount'] = tempset['VehBCost'] - tempset['MMRAcquisitionAuctionAveragePrice']
regression_OLS = sm.OLS(tempset.IsBadBuy, tempset[continuous_vars],
                        hasconst=False).fit()
print(regression_OLS.summary())
# summary(anova1)
# better option - "aov"
# anova2 <- aov(Fare ~ Embarked, data = rTrain)
# anova2
# summary(anova2)
# ====== end of R Code ==========

# ==============================================================================
# Multiple Linear Regression
# ==============================================================================
# Assumptions... and what do they mean?
import statsmodels.api as sm
import seaborn as sns

Rpdata = pd.read_csv(os.path.join(rawDataFld, "alr4_Rpdata.csv"))
lmmodel = sm.OLS(Rpdata["y"], Rpdata[["x1", "x2", "x3", "x4", "x5", "x6"]]).fit()

# does this look like a good model (numerically speaking)?
lmmodel.summary()

# how about we "dig deeper"? - residual plots
lmmodel_fitted_y = lmmodel.fittedvalues
lmmodel_residuals = lmmodel.resid
sns.residplot(data=Rpdata, x=lmmodel_fitted_y, y="y", lowess=True)

# Uh-oh! The residuals are not random! Or are they? What makes them non-random?
# Participate in the online discussion on randomness!
# HW: follow up with comments on what randomness is and how it is relevant to our analysis.
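To see what a non-random residual plot looks like, here is a small synthetic sketch (names are illustrative): fitting a straight line to genuinely quadratic data leaves a U-shaped pattern that the lowess smoother exposes.

import numpy as np

rng = np.random.default_rng(0)
x_demo = np.linspace(-3, 3, 200)
y_demo = x_demo**2 + rng.normal(scale=0.5, size=x_demo.size)  # truly quadratic
# residplot fits y ~ x linearly and plots the residuals: the U-shape that
# remains is the missing quadratic term.
sns.residplot(x=x_demo, y=y_demo, lowess=True)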
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import r2_score

mean_fake.append(mean_tau)
plt.scatter(this_unit['tau'], mean_tau)

mean_fake = np.array(mean_fake).reshape(-1, 1)
mean_real = np.array(mean_real).reshape(-1, 1)
regr = linear_model.LinearRegression()
regr.fit(mean_real, mean_fake)
fake_tau_pred = regr.predict(mean_real)
r2 = r2_score(mean_fake, fake_tau_pred)

# Refit with statsmodels to get p-values alongside the sklearn fit.
mod = sm.OLS(mean_fake, mean_real)
fii = mod.fit()
p_values = fii.summary2().tables[1]['P>|t|']

plt.plot(mean_real, fake_tau_pred, color='#00b2ee', label='regression')
plt.plot(range(0, 1000), range(0, 1000), '-', color='#f00c93', label='identity')
plt.title('Monkey amygdala \n 1000 iterations')
plt.xlabel('"real" tau (ms)')
plt.ylabel('"fake" tau (ms)')
plt.text(0, 800, '$R^2 = %.2f $ \n $p < 10^{-30}$' % r2)
plt.legend()
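For the record, the fitted results also expose the p-values directly, so parsing the summary table is optional; this one-liner gives the same numbers as summary2().tables[1]['P>|t|']:

p_values = fii.pvalues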
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.graphics.gofplots import ProbPlot


def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce the 4 base plots of an OLS model in R.
    https://robert-alvarez.github.io/2018-06-04-diagnostic_plots/
    ---
    Inputs:

    X: A numpy array or pandas dataframe of the features to use in building
       the linear regression model

    y: A numpy array or pandas series/dataframe of the target variable of the
       linear regression model

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X.
       If not provided, will be generated from X, y
    """

    def _graph(formula, x_range, label=None):
        """Helper function for plotting Cook's distance lines."""
        x = x_range
        y = formula(x)
        plt.plot(x, y, label=label, lw=1, ls='--', color='red')

    if not model_fit:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # model values
    model_fitted_y = model_fit.fittedvalues
    # model residuals
    model_residuals = model_fit.resid
    # normalized residuals
    model_norm_residuals = model_fit.get_influence().resid_studentized_internal
    # absolute squared normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = model_fit.get_influence().hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = model_fit.get_influence().cooks_distance[0]

    plot_lm_1 = plt.figure()
    plot_lm_1.axes[0] = sns.residplot(data=dataframe,
                                      x=model_fitted_y,
                                      y=dataframe.columns[-1],
                                      lowess=True,
                                      scatter_kws={'alpha': 0.5},
                                      line_kws={'color': 'red', 'lw': 1,
                                                'alpha': 0.8})
    plot_lm_1.axes[0].set_title('Residuals vs Fitted')
    plot_lm_1.axes[0].set_xlabel('Fitted values')
    plot_lm_1.axes[0].set_ylabel('Residuals')

    # annotations
    abs_resid = model_abs_resid.sort_values(ascending=False)
    abs_resid_top_3 = abs_resid[:3]
    for i in abs_resid_top_3.index:
        plot_lm_1.axes[0].annotate(i, xy=(model_fitted_y[i],
                                          model_residuals[i]))

    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    plot_lm_2.axes[0].set_title('Normal Q-Q')
    plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
    plot_lm_2.axes[0].set_ylabel('Standardized Residuals')

    # annotations
    abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
    abs_norm_resid_top_3 = abs_norm_resid[:3]
    for r, i in enumerate(abs_norm_resid_top_3):
        plot_lm_2.axes[0].annotate(i,
                                   xy=(np.flip(QQ.theoretical_quantiles, 0)[r],
                                       model_norm_residuals[i]))

    plot_lm_3 = plt.figure()
    plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y, y=model_norm_residuals_abs_sqrt,
                scatter=False, ci=None, lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    plot_lm_3.axes[0].set_title('Scale-Location')
    plot_lm_3.axes[0].set_xlabel('Fitted values')
    plot_lm_3.axes[0].set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations
    abs_sq_norm_resid = np.flip(np.argsort(model_norm_residuals_abs_sqrt), 0)
    abs_sq_norm_resid_top_3 = abs_sq_norm_resid[:3]
    for i in abs_sq_norm_resid_top_3:
        plot_lm_3.axes[0].annotate(i,
                                   xy=(model_fitted_y[i],
                                       model_norm_residuals_abs_sqrt[i]))

    plot_lm_4 = plt.figure()
    plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage, y=model_norm_residuals,
                scatter=False, ci=None, lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    plot_lm_4.axes[0].set_xlim(0, max(model_leverage) + 0.01)
    plot_lm_4.axes[0].set_ylim(-3, 5)
    plot_lm_4.axes[0].set_title('Residuals vs Leverage')
    plot_lm_4.axes[0].set_xlabel('Leverage')
    plot_lm_4.axes[0].set_ylabel('Standardized Residuals')

    # annotations
    leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
    for i in leverage_top_3:
        plot_lm_4.axes[0].annotate(i, xy=(model_leverage[i],
                                          model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters
    _graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
           np.linspace(0.001, max(model_leverage), 50),
           'Cook\'s distance')  # 0.5 line
    _graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
           np.linspace(0.001, max(model_leverage), 50))  # 1 line
    plot_lm_4.legend(loc='upper right')
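A quick smoke test on synthetic data (variable names are illustrative), reusing the imports above:

rng = np.random.default_rng(42)
X_demo = pd.DataFrame({'x1': rng.normal(size=150), 'x2': rng.normal(size=150)})
y_demo = pd.Series(2.0 * X_demo['x1'] - X_demo['x2'] + rng.normal(size=150),
                   name='y')
diagnostic_plots(X_demo, y_demo)
plt.show()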
import matplotlib.pyplot as plt
import numpy as np
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

(grp2_df_rev.corr()).iloc[:, 0]
plt.plot(grp2_df_rev['meter_reading'])
plt.plot(grp2_df_rev['wind_direction'])
plt.legend(['meter_reading', 'wind_direction'])
plt.show()

# OK, there is no trend here (in either series).
mod1_grp2 = sm.OLS(
    np.array(grp2_df_rev['meter_reading']['mean']),
    statsmodels.tools.tools.add_constant(
        np.array(grp2_df_rev['wind_direction']['mean'])))
results = mod1_grp2.fit()
results.summary()
plt.hist(results.resid)
plt.show()
adfuller(results.resid)

# Obviously the residuals have no trend.
mod2_grp2 = sm.OLS(
    np.array(grp2_df_rev['meter_reading']['mean']),
    np.array(grp2_df_rev['wind_direction']['mean']),
    hasconst=False)
results2 = mod2_grp2.fit()
results2.summary()
plt.hist(results2.resid)
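adfuller returns a tuple; the first two entries are the test statistic and the p-value, which is usually what the trend check above needs:

adf_stat, adf_pvalue = adfuller(results.resid)[:2]
print(adf_stat, adf_pvalue)  # small p-value -> residuals look stationary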
# Use the "statsmodels" package of econometric commands with the "sm" shortcut
# (OLS lives in the statsmodels.api namespace, not the bare package)
import statsmodels.api as sm

# Define a model regressing y on X using OLS from the sm module
ols_model = sm.OLS(y, X)

# Estimate the model using the .fit() method
ols_results = ols_model.fit()

# Report the results to the terminal
print(ols_results.summary())
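The snippet assumes y and X already exist; a minimal sketch that supplies them with synthetic data so it runs end to end:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(100, 2)))  # intercept column + two regressors
y = X @ np.array([1.0, 2.0, -0.5]) + rng.normal(scale=0.1, size=100)
print(sm.OLS(y, X).fit().summary())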