def test_wrong_len_xname(reset_randomstate):
    y = np.random.randn(100)
    x = np.random.randn(100, 2)
    res = OLS(y, x).fit()
    with pytest.raises(ValueError):
        res.summary(xname=['x1'])
    with pytest.raises(ValueError):
        res.summary(xname=['x1', 'x2', 'x3'])
def backwardElimination(x, sl):
    # Iteratively drop the regressor with the largest p-value until every
    # remaining p-value is at or below the significance level `sl`.
    # Note: relies on the response vector `y1` being defined at module scope.
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = OLS(y1, x).fit()
        maxVar = max(regressor_OLS.pvalues)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
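# A minimal usage sketch for backwardElimination, assuming the function above is
# defined in the same module. The data, the 0.05 significance level, and the
# module-level `y1` are illustrative assumptions, not part of the original snippet.
import numpy as np
from statsmodels.regression.linear_model import OLS

np.random.seed(0)
x = np.column_stack([np.ones(40), np.random.randn(40, 4)])  # constant + 4 candidate regressors
y1 = 2.0 * x[:, 1] + np.random.randn(40)                    # read as a global inside backwardElimination
x_selected = backwardElimination(x, sl=0.05)                 # drop columns until all p-values <= 0.05
print(x_selected.shape)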
def test_OLSsummary_rsquared_label(self):
    # Check that the "uncentered" label is correctly added after rsquared
    x = [1, 5, 7, 3, 5, 2, 5, 3]
    y = [6, 4, 2, 7, 4, 9, 10, 2]
    reg_with_constant = OLS(y, x, hasconst=True).fit()
    assert 'R-squared:' in str(reg_with_constant.summary2())
    assert 'R-squared:' in str(reg_with_constant.summary())
    reg_without_constant = OLS(y, x, hasconst=False).fit()
    assert 'R-squared (uncentered):' in str(reg_without_constant.summary2())
    assert 'R-squared (uncentered):' in str(reg_without_constant.summary())
def test_regression_with_tuples(self):
    i = pandas.Series([1, 2, 3, 4] * 10, name="i")
    y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
    x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

    df = pandas.DataFrame(index=i.index)
    df = df.join(i)
    endo = df.join(y)
    exo = df.join(x)
    endo_groups = endo.groupby("i")
    exo_groups = exo.groupby("i")
    exo_df = exo_groups.agg([np.sum, np.max])
    endo_df = endo_groups.agg([np.sum, np.max])
    reg = OLS(exo_df[[("x", "sum")]], endo_df).fit()

    interesting_lines = []
    import warnings
    with warnings.catch_warnings():
        # Catch ominormal warning, not interesting here
        warnings.simplefilter("ignore")
        for line in str(reg.summary()).splitlines():
            if "_" in line:
                interesting_lines.append(line[:38])

    desired = ["Dep. Variable: x_sum ",
               "y_sum 1.4595 0.209 ",
               "y_amax 0.2432 0.035 "]

    assert_equal(sorted(desired), sorted(interesting_lines))
def test_regression_with_tuples(self):
    i = pandas.Series([1, 2, 3, 4] * 10, name="i")
    y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
    x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

    df = pandas.DataFrame(index=i.index)
    df = df.join(i)
    endo = df.join(y)
    exo = df.join(x)
    endo_groups = endo.groupby(("i", ))
    exo_groups = exo.groupby(("i", ))
    exo_Df = exo_groups.agg([np.sum, np.max])
    endo_Df = endo_groups.agg([np.sum, np.max])
    reg = OLS(exo_Df[[("x", "sum")]], endo_Df).fit()

    interesting_lines = []
    import warnings
    with warnings.catch_warnings():
        # Catch ominormal warning, not interesting here
        warnings.simplefilter("ignore")
        for line in str(reg.summary()).splitlines():
            if "('" in line:
                interesting_lines.append(line[:38])

    desired = [
        "Dep. Variable: ('x', 'sum') ",
        "('y', 'sum') 1.4595 0.209 ",
        "('y', 'amax') 0.2432 0.035 "
    ]

    self.assertEqual(sorted(desired), sorted(interesting_lines))
def test_ols_summary_rsquared_label():
    # Check that the "uncentered" label is correctly added after rsquared
    x = [1, 5, 7, 3, 5, 2, 5, 3]
    y = [6, 4, 2, 7, 4, 9, 10, 2]
    reg_with_constant = OLS(y, add_constant(x)).fit()
    r2_str = 'R-squared:'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary())
    reg_without_constant = OLS(y, x, hasconst=False).fit()
    r2_str = 'R-squared (uncentered):'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary())
def test_fvalue_only_constant():
    # if only constant in model, return nan see #3642
    nobs = 20
    np.random.seed(2)
    x = np.ones(nobs)
    y = np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='hac', cov_kwds={'maxlags': 3})
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
def test_fvalue_implicit_constant():
    nobs = 100
    np.random.seed(2)
    x = np.random.randn(nobs, 1)
    x = ((x > 0) == [True, False]).astype(int)
    y = x.sum(1) + np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
def test_summary_as_latex():
    # GH#734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    res = OLS(y, X).fit()
    with pytest.warns(UserWarning):
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\\{Date:\\} &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\\{Time:\\} &).+?&",
                   " 13:46:07 &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\
\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\
\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\
\\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\
\\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\
\\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\
\\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\
\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
 & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\
\\midrule
\\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\
\\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\
\\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\
\\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\
\\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\
\\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\
\\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\
\\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\
\\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\
\\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}

Warnings: \\newline
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
[2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
strong multicollinearity or other numerical problems."""

    assert_equal(table, expected)
def create_linear_model(X_train, X_test, Y_train, Y_test):
    '''
    TODO...
    - Predict the wine quality using the test set and compare the accuracy
      to the actual quality. Comment.
    - Print the parameter estimates and their 95% confidence intervals in a
      single table. (Suggest using confint() and cbind())
    '''
    X_train = add_constant(X_train)
    regressionResult = OLS(Y_train, X_train).fit()
    print(regressionResult.summary())

    # Print various attributes of the OLS fitted model
    # print("R Squared: {}".format(regressionResult.rsquared))
    # print("SSE: {}".format(regressionResult.ess))
    # print("SSR: {}".format(regressionResult.ssr))
    # print("Residual MSE: {}".format(regressionResult.mse_resid))
    # print("Total MSE: {}".format(regressionResult.mse_total))
    # print("Model MSE: {}".format(regressionResult.mse_model))
    # print("F-Value: {}".format(regressionResult.mse_model/regressionResult.mse_resid))
    # print("NOBS: {}".format(regressionResult.nobs))
    # print("Centered TSS: {}".format(regressionResult.centered_tss))
    # print("Uncentered TSS: {}".format(regressionResult.uncentered_tss))
    # print("DF Model: {}".format(regressionResult.df_model))
    # print("DF Resid: {}".format(regressionResult.df_resid))
    # print("Standard Errors: {}".format(regressionResult.bse))
    print("Confidence: {}".format(regressionResult.conf_int()))

    predictions = regressionResult.predict(X_train)
    nobs, p = X_train.shape
    eaic = extractAIC(nobs, p, Y_train, predictions)
    print("Extract AIC: {}".format(eaic))

    params = regressionResult.params

    # n, p = X_test.shape
    # X_test = add_constant(X_test)
    # predictions = X_test.dot(params).reshape(n, 1)
    # num_matches = 0
    # for i in range(len(Y_test)):
    #     p = int(round(predictions[i][0], 0))
    #     is_match = (Y_test[i] == p)
    #     if is_match:
    #         num_matches += 1
    #     print("Actual: {}, Predictions: {}... Match: {}".format(Y_test[i], p, is_match))
    # print("Number of matches: {}, Total number of Instances: {}".format(num_matches, n))
    # print("Percent correct guesses: {}%".format(round((num_matches/n)*100, 3)))

    return params
def test_summary():
    # test 734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    with warnings.catch_warnings(record=True):
        res = OLS(y, X).fit()
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\{Date:\} &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\{Time:\} &).+?&",
                   " 13:46:07 &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\
\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\
\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\
\\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\
\\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\
\\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\
\\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\
\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lccccc}
 & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]} \\\\
\\midrule
\\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 207.153 \\\\
\\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 0.040 \\\\
\\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 -0.915 \\\\
\\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 -0.549 \\\\
\\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 0.460 \\\\
\\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 2859.515 \\\\
\\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 -1.47e+06 \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\
\\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\
\\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\
\\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}"""

    assert_equal(table, expected)
def test_OLSsummary(self):
    # Test that latex output of regular OLS output still contains
    # multiple tables
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    y1 = [6, 4, 2, 7, 4]
    reg1 = OLS(y1, x).fit()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        actual = reg1.summary().as_latex()
    string_to_find = r'''\end{tabular}
\begin{tabular}'''
    result = string_to_find in actual
    assert (result is True)
def split(self, X, Y, seed=None, max_splits=1000):
    """splitting function

    Parameters
    ----------
    X : design matrix (not actually needed but taken for consistency)
    Y : outcome variable
    seed : random seed (default: None, uses system time)
    max_splits : int, maximum number of splits to try before failing
    """
    np.random.seed(seed)
    nsubs = len(Y)

    # cycle through until we find a split that is good enough
    runctr = 0
    best_pval = 0
    while True:
        runctr += 1
        cv = KFold(n_splits=self.nfolds, shuffle=True)

        idx = np.zeros((nsubs, self.nfolds))  # this is the design matrix
        folds = []
        ctr = 0

        # create design matrix for anova across folds
        for train, test in cv.split(Y):
            idx[test, ctr] = 1
            folds.append([train, test])
            ctr += 1

        # fit anova model, comparing means of Y across folds
        lm_y = OLS(Y - np.mean(Y), idx).fit()
        if lm_y.f_pvalue > best_pval:
            best_pval = lm_y.f_pvalue
            best_folds = folds
        if lm_y.f_pvalue > self.pthresh:
            if self.verbose:
                print(lm_y.summary())
            return iter(folds)
        if runctr > max_splits:
            print('no sufficient split found, returning best (p=%f)' % best_pval)  # noqa
            return iter(best_folds)
def test__repr_latex_(self):
    desired = r'''
\begin{center}
\begin{tabular}{lcccccc}
\toprule
 & \textbf{coef} & \textbf{std err} & \textbf{t} & \textbf{P$> |$t$|$} & \textbf{[0.025} & \textbf{0.975]} \\
\midrule
\textbf{const} & 7.2248 & 0.866 & 8.346 & 0.000 & 5.406 & 9.044 \\
\textbf{x1} & -0.6609 & 0.177 & -3.736 & 0.002 & -1.033 & -0.289 \\
\bottomrule
\end{tabular}
\end{center}
'''
    x = [1, 5, 7, 3, 5, 5, 8, 3, 3, 4, 6, 4, 2, 7, 4, 2, 1, 9, 2, 6]
    x = add_constant(x)
    y = [6, 4, 2, 7, 4, 2, 1, 9, 2, 6, 1, 5, 7, 3, 5, 5, 8, 3, 3, 4]
    reg = OLS(y, x).fit()

    actual = reg.summary().tables[1]._repr_latex_()
    actual = '\n%s\n' % actual
    assert_equal(actual, desired)
def split(self, X, Y, max_splits=1000):
    """
    - we don't actually need X but we take it for consistency
    """
    nsubs = len(Y)

    # cycle through until we find a split that is good enough
    runctr = 0
    best_pval = 0.
    while 1:
        runctr += 1
        cv = KFold(n_splits=self.nfolds, shuffle=True)

        idx = N.zeros((nsubs, self.nfolds))  # this is the design matrix
        folds = []
        ctr = 0
        for train, test in cv.split(Y):
            idx[test, ctr] = 1
            folds.append([train, test])
            ctr += 1
        lm_y = OLS(Y - N.mean(Y), idx).fit()
        if lm_y.f_pvalue > best_pval:
            best_pval = lm_y.f_pvalue
            best_folds = folds
        if lm_y.f_pvalue > self.pthresh:
            if self.verbose:
                print(lm_y.summary())
            return iter(folds)
        if runctr > max_splits:
            print('no sufficient split found, returning best (p=%f)' % best_pval)
            return iter(best_folds)
def regression(aspects, dataset):
    asps = list(set(dataset.columns).intersection(aspects))
    asps.sort()
    aspsMP = list()
    for asp in asps:
        minus = asp + '_minus'
        dataset[minus] = dataset.apply(
            lambda x: 1 if x[asp] and x[asp + 'sent'] == -1 else 0, axis=1)
        aspsMP.append(minus)

        plus = asp + '_plus'
        dataset[plus] = dataset.apply(
            lambda x: 1 if x[asp] and x[asp + 'sent'] == 1 else 0, axis=1)
        aspsMP.append(plus)

        # overall = 'a_' + asp
        # dataset[overall] = dataset.apply(lambda x: x[asp] * x[asp + 'sent'], axis=1)
        # aspsMP.append(overall)

        neutral = asp + '_neutral'
        dataset[neutral] = dataset.apply(
            lambda x: 1 if x[asp] and x[asp + 'sent'] == 0 else 0, axis=1)
        aspsMP.append(neutral)

        # aspsMP.append(asp + 'sent')
        # MINUS
        # PLUS

    aspsMP.sort()
    dataset['intercept'] = np.ones(len(dataset))
    aspsMP = ['intercept'] + aspsMP
    # print(len(aspects), len(asps))
    model = OLS(dataset['stars'], dataset[aspsMP]).fit()
    # model.summary
    # print(model.params)
    # print(model.pvalues)
    return model.summary()
def test_regression_with_tuples(self):
    i = pandas.Series([1, 2, 3, 4] * 10, name="i")
    y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
    x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

    df = pandas.DataFrame(index=i.index)
    df = df.join(i)
    endo = df.join(y)
    exo = df.join(x)
    endo_groups = endo.groupby(("i",))
    exo_groups = exo.groupby(("i",))
    exo_Df = exo_groups.agg([np.sum, np.max])
    endo_Df = endo_groups.agg([np.sum, np.max])
    reg = OLS(exo_Df[[("x", "sum")]], endo_Df).fit()

    interesting_lines = []
    for line in str(reg.summary()).splitlines():
        if "('" in line:
            interesting_lines.append(line[:38])

    desired = ["Dep. Variable: ('x', 'sum') ",
               "('y', 'sum') 1.4595 0.209 ",
               "('y', 'amax') 0.2432 0.035 "]

    self.assertEqual(desired, interesting_lines)
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

d2 = macrodata.load().data
g_gdp = 400 * np.diff(np.log(d2['realgdp']))
g_inv = 400 * np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)

res_olsg = OLS(g_inv, exogg).fit()

print(res_olsg.summary())

res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')

res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4, use_correction=True)
print(res_hac4.summary())
print('\n\n')

tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
print('\n\n')
""" Run this after you've run benchmarks.py to get the /tmp/runtimes.csv data """ import pandas as pd from statsmodels.regression.linear_model import OLS df = pd.read_csv('/tmp/runtimes.csv') df['constant'] = 1 df['resolution_sq'] = df['resolution']**2 quad_model = OLS(df['avg_time'], df[['resolution_sq', 'resolution', 'constant']]).fit() quad_model.summary()
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

d2 = macrodata.load().data
g_gdp = 400*np.diff(np.log(d2['realgdp']))
g_inv = 400*np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)

res_olsg = OLS(g_inv, exogg).fit()

print(res_olsg.summary())

res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')

res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4, use_correction=True)
print(res_hac4.summary())
print('\n\n')

tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
y_train = standardscaler.fit_transform(y_train)
y_test = standardscaler.transform(y_test)"""

# multiple linear algo
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor = regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

# backward elimination method
from statsmodels.regression.linear_model import OLS
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)

x_opt = x[:, :6]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()

x_opt = x[:, [0, 3]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()

# splitting dataset into train and test dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_opt, y, test_size=0.2,
                                                    random_state=0)

# multiple linear algo
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
def fit_ols(self):
    self.data_lag.loc[self.data_lag.fecha <= "2020-04-04", "days"] = 30
    ts_ols = OLS(
        self.data_lag.iloc[:-1, ].fallecimientos,
        self.data_lag.iloc[:-1, ].drop(["fecha", "fallecimientos"],
                                       axis=1)).fit()
    sum = ts_ols.summary()

    predictions = pd.DataFrame(
        ts_ols.predict(self.forecast.drop("fecha", axis=1)))
    e = pd.DataFrame({
        "Modelo": "OLS",
        "Predicción de hoy": [predictions.iloc[0, 0]],
        "Error de hoy": [
            abs(predictions.iloc[0, 0] -
                self.dt.loc[len(self.dt) - 1, "fallecimientos"])
        ]
    })

    predictions["fecha"] = self.dt.loc[len(self.dt) - 1, "fecha"]
    predictions.columns = ["fallecimientos", "fecha"]
    predictions.reset_index(drop=True, inplace=True)
    for i in range(len(self.forecast)):
        c = 0
        c += i
        predictions.loc[i, "fecha"] = predictions.fecha[i] + timedelta(days=c)

    new = pd.concat(
        (self.dt[["fallecimientos", "fecha"]], predictions.iloc[1:, :]),
        axis=0)
    new["Predicciones"] = np.where(
        new.fecha <= self.dt.loc[len(self.dt) - 1, "fecha"], "Real", "Pred")
    fig = px.bar(
        new,
        x="fecha",
        y="fallecimientos",
        color="Predicciones",
    )

    # predictions.columns = ["Predicciones_Fallecimientos", "fecha"]
    #
    # load = str(self.dt.loc[len(self.dt)-1, "fecha"] - timedelta(days=1))
    # load = load[0:10] + ".pkl"
    #
    # with open(load, "rb") as file:
    #     historic = pickle.load(file)
    # predictions["Error"] = 0
    # p = pd.concat([predictions.reset_index(drop=True), historic], ignore_index=True)
    # p = p.loc[p.fecha <= self.dt.loc[len(self.dt)-1, "fecha"], :]
    # p.reset_index(drop=True, inplace=True)
    # for i in range(0, len(p)):
    #     if self.dt.loc[len(self.dt)-1, "fecha"] == p.loc[i, "fecha"]:
    #         p.loc[i, "Error"] = np.sqrt((self.dt.loc[len(self.dt)-1, "fallecimientos"] - p.loc[i, "Predicciones_Fallecimientos"])**2)
    #
    # save = str(self.dt.loc[len(self.dt)-1, "fecha"])
    # save = save[0:10] + ".pkl"
    #
    # with open(save, "wb") as file:
    #     pickle.dump(p, file)

    return e, fig, sum
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

d2 = macrodata.load(as_pandas=False).data
g_gdp = 400*np.diff(np.log(d2['realgdp']))
g_inv = 400*np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)

res_olsg = OLS(g_inv, exogg).fit()

print(res_olsg.summary())

res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')

res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4, use_correction=True)
print(res_hac4.summary())
print('\n\n')

tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
def ols_sm(X_train, y_train, X_test):
    X_train = sm.add_constant(X_train)  # adds col of ones for intercept coefficient in OLS model
    ols = OLS(y_train, X_train).fit()
    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot True vs Predicted values to examine if linear model is a good fit
    # (note: y_test is taken from the enclosing/global scope, not a parameter)
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()
    # Add quadratic term to X or take log of y to improve

    # Discern if a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high leverage points
    # a. Identify outliers (typically, those data points with studentized residuals
    #    outside of +/- 3 stdev). Temporarily remove these from your data set and
    #    re-run your model. Do your model metrics improve considerably? Does this
    #    give you cause for more confidence in your model?
    # b. Identify those outliers that are also high-leverage points
    #    (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of residual terms)
    # If residuals exhibit a "funnel shaped" effect, consider transforming your
    # data into logarithmic space.
    studentized_residuals = ols.outlier_test()[:, 0]
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test if residuals are normally distributed in QQ plot:
    # plots quantiles of the normal distribution against studentized residuals;
    # if sample quantiles are normally distributed, the dots will align with the 45 deg line
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in data
    # DFBETAS - standardized measure of how much each coefficient changes when
    # that observation is left out
    threshold = 2. / len(X_train)**.5
    infl = ols.get_influence()
    df = pd.DataFrame(infl.summary_frame().filter(regex="dfb"))
    inf = df[df > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)
"""Try a Multinomial LogisticRegression""" from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge from statsmodels.discrete.discrete_model import MNLogit from statsmodels.regression.linear_model import OLS # we use the full df for the regression because we want to weight results by the # existence of different ads in different neighborhoods, not just unique addresses X = df[["black_proportion","log_income","asian_proportion","latinx_proportion","log_price"]] y = df.white_proportion df_tmp = df.copy() df_tmp[list(range(30))] = df_tmp[list(range(30))].where(df_tmp[list(range(30))]>.1,0) topic_0 + topic_7 + topic_8 + topic_9 + topic_12 + topic_14 + topic_16 + topic_17+ topic_20 + topic_23 + topic_24 + topic_25 + topic_28 X = df[[str(x) for x in [0,7,8,9,12,14,16,17,20,23,24,25,28]]+["black_proportion","log_income","log_price","total_RE"]] y = np.where(df['white_proportion']>np.median(df['white_proportion']),1,0) y= df['income'] OLR = OLS(y,X).fit() OLR.summary() OLR.predict(exog=X) df_full_results.params.sort_values() df_results.params.sort_values() df_results.summary() EN = ElasticNet(alpha = .02, l1_ratio=.001) EN.fit(X,y) EN.score(X,y) EN.predict(X) LinR = LinearRegression() LinR.fit(X,y) LinR.score(X,y) RR = Ridge() RR.fit(X,y).score(X,y)
                          columns=encoder.get_feature_names(cat_names))
    X = X.select_dtypes(exclude=['O', 'category']).join(df_cat)
    return X


# Data prep
data_file = '.../cohort_analysis.csv'
df = feature_extraction(data_file).drop('Month', axis=1)
X_train, X_test, y_train, y_test = get_the_set(df=df, target_variable='y')
encoder = fit_encoder(X_train)
X_train = run_scaling(X_train)
X_train = encode_categorical_variables(X_train, encoder)
X_test = run_scaling(X_test)
X_test = encode_categorical_variables(X_test, encoder)

# Model
model = OLS(
    y_train,
    add_constant(
        X_train.drop(['Week_1', 'Week_3', 'Cohort_active_users'],
                     axis=1))).fit()
model.summary()

# Test the assumptions of Linear Regression
tester = Assumptions.Assumption_Tester_OLS(X_train, y_train)
tester.run_all()
def summary_OLS(X, y):
    # Standardize the design matrix before fitting so coefficients are comparable,
    # then print the fitted OLS summary
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    ols = OLS(y, X).fit()
    print(ols.summary())
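# A hypothetical call to summary_OLS above with synthetic data; the names, shapes,
# and coefficients are illustrative assumptions, and the StandardScaler/OLS imports
# are assumed to be available as in that snippet.
import numpy as np
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(50, 3))
y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=50)
summary_OLS(X_demo, y_demo)  # prints the OLS summary for the standardized design matrix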
df1 = df_dat.groupby(['gr_span'])
df1['testscr'].describe()
df1.describe()
df1.mean()

df1 = df.groupby(['gr_span', 'county'])
df1.describe()

# 01 -- fit a linear regression model
reg = linear_model.LinearRegression()
x = pd.DataFrame(df_dat.iloc[:, [2, 3, 4, 5, 6, 8, 9, 10, 11, 12]])
x['cons'] = 1  # add a constant term
y = df_dat['testscr']
regres = OLS(y, x, missing='drop').fit()
regres.summary()

# 02 -- principal component analysis
x1 = pd.DataFrame(df_dat.iloc[:, [2, 3, 4, 5, 6, 8, 9, 10, 11, 12]])
x_scaled = preprocessing.scale(x1)
pca = PCA(n_components=3)  # extract principal components from the correlation matrix
pca.fit(x_scaled)
pca_components = pd.DataFrame(pca.components_)  # principal component coefficient (loading) matrix
pca_components.to_csv('C:/Users/ASUS/Desktop/统计计算实验课/jietu/成分系数矩阵.csv')
pca.explained_variance_  # eigenvalues
pca.explained_variance_ratio_  # explained variance ratios
zdf = pd.DataFrame(x_scaled)
zi = pd.DataFrame(pca.transform(x_scaled), columns=['z1', 'z2', 'z3'])
Zdf = pd.concat([zdf, zi], axis=1)
Zdf_describe = pd.DataFrame(Zdf.describe())
Zdf_describe.to_csv('C:/Users/ASUS/Desktop/统计计算实验课/jietu/成分矩阵描述.csv')
import pandas as pd
from statsmodels.regression.linear_model import OLS
import numpy as np

np.set_printoptions(suppress=True)

data = pd.read_csv('Dataset/dataset.csv')
X = data["Head Size(cm^3)"].values
y = data["Brain Weight(grams)"].values
X = np.array(X, dtype='float64')
y = np.array(y, dtype='float64')
y = np.reshape(y, (len(y), 1))
X = np.column_stack([np.ones(len(X)), X])

# Implement the statsmodel function
res = OLS(y, X).fit()

# Theta values
theta = res.params
print(theta)

# prediction
ols_pred = res.predict()
print(res.summary())
    data_x, data_y, test_size=0.2, random_state=0)

# Model Creation and Fitting
multi_linear_model = LinearRegression()
multi_linear_model.fit(trainset_data_x, trainset_data_y)

# Predictions
prediction_y = multi_linear_model.predict(testset_data_x)

# Introduction to Backward Elimination
data_x = np.append(values=data_x, arr=np.ones((50, 1)).astype(int), axis=1)

# First Case
optimal_x = np.array(data_x[:, [0, 1, 2, 3, 4, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Second Case
optimal_x = np.array(data_x[:, [0, 1, 3, 4, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Third Case
optimal_x = np.array(data_x[:, [0, 3, 4, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Fourth Case
optimal_x = np.array(data_x[:, [0, 3, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()
# since there is explicit non-linearity in this model, we have to add some
# non-linear covariates to it.

# partial residual plots:
# attempt to show how a covariate is related to the dependent variable
# once we control for the effects of all other covariates;
# the partial residual plots look acceptable.
sns.jointplot(regr_1.params.bmi * X.bmi + regr_1.resid, X.bmi)
sns.jointplot(regr_1.params.age * X.age + regr_1.resid, X.age)

#######################
####model selection####
#######################

# original model
regr_1.summary()

# the first issue we need to solve is the residuals' dependence problem.

# try NO1: add an interaction covariate smoker:bmi
X_2 = X.iloc[:, :]
X_2['sm_bm'] = X_2.smoker * X_2.bmi
regr_test = OLS(y, add_constant(X_2)).fit()
regr_test.summary()
# which certainly improves the performance of the model

# try NO2: add an interaction covariate smoker:age
X_2['sm_ag'] = X_2.smoker * X_2.age
regr_test = OLS(y, add_constant(X_2)).fit()
regr_test.summary()
# which increases the performance of the model significantly
y = df.iloc[:, 4].values

# data encoding
label_encode = LabelEncoder()
x[:, 0] = label_encode.fit_transform(x[:, 0])
one_hot_encode = OneHotEncoder(categorical_features=[0])
x = one_hot_encode.fit_transform(x).toarray()
x[:, 0] = numpy.ones(x.shape[0])

# standardising the cols
standard_scalar = StandardScaler()
x[:, 2:] = standard_scalar.fit_transform(x[:, 2:])

# to know which cols to remove
ols = OLS(endog=y, exog=x).fit()
print(ols.summary())

# dropping cols that have high p-values, plus the constant col
x = numpy.delete(x, [0, 1], axis=1)

# data splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0,
                                                    test_size=0.25)

# decision tree classifier
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                    random_state=42)
classifier.fit(x_train, y_train)