def __init__(self, df, target, predictors, ids, cluster_ids=None, drop_singletons=True):
    """
    Args:
        df (pandas DataFrame): dataframe containing the referenced data,
            including target, predictors and ids
        target (string): name of target variable
        predictors (list of strings): names of predictors
        ids (list of strings): names of variables to be absorbed
        cluster_ids (list of strings): names of variables to cluster standard errors by
        drop_singletons (bool): whether to drop singleton groups before residualizing
    """
    if cluster_ids is None:
        cluster_ids = []
    self.df = df
    self.algo = pyhdfe.create(ids=get_np_columns(df, ids),
                              cluster_ids=get_np_columns(df, cluster_ids),
                              drop_singletons=drop_singletons,
                              degrees_method='pairwise')
    self.all_names = [target] + predictors
    self.residualized = self.algo.residualize(
        get_np_columns(df, [target] + predictors + cluster_ids))
    # build a patsy formula 'target~x1+x2+...-1'; the intercept is dropped
    # because the fixed effects have already been absorbed
    self.formula = target + '~' + '+'.join(predictors) + '-1'
    df_residualized = pd.DataFrame()
    for i, name in enumerate(self.all_names):
        df_residualized[name] = self.residualized[:, i]
    y, X = dmatrices(self.formula, data=df_residualized, return_type='dataframe')
    self.model = sm.OLS(y, X)
    if not cluster_ids:
        # residual df: observations minus predictors minus the degrees of
        # freedom consumed by the absorbed fixed effects
        self.model.df_resid = (self.residualized.shape[0]
                               - len(predictors) - self.algo.degrees)
    else:
        # with clustered errors, base df_resid on the smallest cluster count
        # across all cluster variables (singleton observations excluded)
        clusters = get_np_columns(df, cluster_ids)[~self.algo._singleton_indices]
        min_cluster_count = min(np.unique(clusters[:, i]).shape[0]
                                for i in range(clusters.shape[1]))
        self.model.df_resid = min_cluster_count - len(predictors)
def fit(self, **kwargs):
    if 'groups' in kwargs:
        # clustered standard errors: drop singleton observations from the
        # group labels so their length matches the residualized data
        groups = get_np_columns(self.df, kwargs['groups'])
        cleaned_groups = groups[~self.algo._singleton_indices]
        return self.model.fit(cov_type='cluster',
                              cov_kwds={'groups': cleaned_groups})
    else:
        return self.model.fit(**kwargs)
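# A minimal usage sketch for the class above. Assumptions: the class is named
# FixedEffectsOLS (its class statement is not shown in this file), and df is
# the NLS panel used elsewhere in this script, with ln_wage, hours_log,
# idcode and year columns.
fe = FixedEffectsOLS(df,
                     target='ln_wage',
                     predictors=['hours_log'],
                     ids=['idcode', 'year'],
                     cluster_ids=['idcode'])
results = fe.fit(groups=['idcode'])  # clustered standard errors by idcode
print(results.summary())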
#results = smf.ols(formula='wks_work~1', data=df).fit()
# y = np.asarray([[-100, 100, -100, 1234]]).T
# X = np.asarray([[1, 2, 3, 4], [1, 1, 1, 1]]).T
# results = sm.OLS(y, X).fit()
model = smf.ols(formula='ttl_exp~wks_ue', data=df)
results = model.fit()
print(results.predict()[-10:])
print(results.summary())

#results = smf.wls(formula='wks_work~ttl_exp', data=df, weights=np.linspace(1, 13452, 13452)).fit()
# frequency-weighted GLM of the same regression
model = smf.glm(formula='ttl_exp~wks_ue',
                freq_weights=np.linspace(1, 13452, 13452), data=df)
results = model.fit()
print(results.summary())

# manual F-statistic for the weighted model
lin_pred = model.mu
target = get_np_columns(df, ['ttl_exp']).squeeze()
resid = target - lin_pred
# 90484876 is the residual df: the frequency weights sum to 90,484,878,
# minus 2 for the intercept and the single predictor
mse_resid = np.sum(resid**2) / 90484876
explained_sum_of_squares = np.sum((target - np.mean(target))**2) - np.sum(resid**2)
# the divisor 1 is df_model (one predictor)
mse_model = explained_sum_of_squares / 1
F = mse_model / mse_resid
print(F)
#print("df_model", results.df_model)
#print("df_resid", results.df_resid)
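# A cross-check sketch (assumption: same df as above): the same manual F
# computation applied to the unweighted OLS fit should reproduce
# results.fvalue reported by statsmodels.
ols_res = smf.ols(formula='ttl_exp~wks_ue', data=df).fit()
resid_u = ols_res.resid
mse_resid_u = np.sum(resid_u**2) / ols_res.df_resid
ess_u = np.sum((ols_res.model.endog - np.mean(ols_res.model.endog))**2) - np.sum(resid_u**2)
F_u = (ess_u / ols_res.df_model) / mse_resid_u
print(F_u, ols_res.fvalue)  # the two values should agree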
# ######################################################################### PYHDFE TEST
# for a in list(enumerate(list(df))):
#     print(a)
df_np = df.to_numpy()

# sanity check: plain regression without any absorbed effects
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

# absorb idcode and year, then regress residualized ln_wage on residualized hours_log
algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False),
                     degrees_method='pairwise')
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))
print(algo.degrees)
#model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, 1]))
#print(add_intercept(residualized[:,1])[:10])
ids = get_np_columns(df, ['idcode', 'year'], False)
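# An equivalence sketch (Frisch-Waugh-Lovell): the slope from the residualized
# regression above should match a least-squares dummy-variable fit with
# explicit idcode and year dummies, up to singleton-group handling.
# Assumption: df holds the same panel; note C(idcode) expands to many columns
# and can be slow on a large panel.
results = model.fit()
print(results.params)  # slope on the residualized hours_log
lsdv = smf.ols('ln_wage ~ hours_log + C(idcode) + C(year)', data=df).fit()
print(lsdv.params['hours_log'])  # should match the slope above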
##print(res.summary)
# ######################################################################### PYHDFE TEST
# list the dataframe's columns with their indices
for a in list(enumerate(list(df))):
    print(a)
df_np = df.to_numpy()

# sanity check: plain regression without any absorbed effects
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False))
residualized = algo.residualize(
    get_np_columns(df, ['ln_wage', 'hours_log'], False))
#model = sm.OLS(residualized[:,0], add_intercept(residualized[:,1]))

# reference output for: ln_wage ~ hours_log, absorb(year)
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.005
# Model:                            OLS   Adj. R-squared:                  0.005
# Method:                 Least Squares   F-statistic:                     69.95
# Date:                Sat, 05 Dec 2020   Prob (F-statistic):           6.67e-17
# Time:                        13:04:23   Log-Likelihood:                -8305.1
# No. Observations:               13452   AIC:                         1.661e+04
# Df Residuals:                   13450   BIC:                         1.663e+04
# Df Model:                           1
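# A reproduction sketch for the reference output above. Assumption: the header
# says absorb(year), so only the year effect is absorbed here, unlike the
# two-way absorption a few lines up; exact df corrections may still differ
# from Stata's areg.
algo_year = pyhdfe.create(get_np_columns(df, ['year'], False))
res_year = algo_year.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))
model_year = sm.OLS(res_year[:, 0], add_intercept(res_year[:, 1]))
print(model_year.fit().summary())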
# Method:                 Least Squares   F-statistic:                     22.33
# Date:                Sun, 06 Dec 2020   Prob (F-statistic):           3.14e-20
# Time:                        14:58:02   Log-Likelihood:                -1715.7
# No. Observations:                 506   AIC:                             3443.
# Df Residuals:                     500   BIC:                             3469.
# Df Model:                           5
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# x1            -0.2089      0.049     -4.294      0.000      -0.305      -0.113
# x2             0.0679      0.018      3.745      0.000       0.032       0.104
# x3            -0.2280      0.085     -2.672      0.008      -0.396      -0.060
# x4            -9.4248      5.506     -1.712      0.088     -20.242       1.392
# x5            -0.0141      0.018     -0.774      0.439      -0.050       0.022
# const      -6.592e-17      0.321  -2.05e-16      1.000      -0.631       0.631
# ==============================================================================
# Omnibus:                      172.457   Durbin-Watson:                   0.904
# Prob(Omnibus):                  0.000   Jarque-Bera (JB):              532.297
# Skew:                           1.621   Prob(JB):                    2.59e-116
# Kurtosis:                       6.839   Cond. No.                         480.
# ==============================================================================

# the commented-out block below produced the reference output above:
# target ~ CRIM + ZN + INDUS + NOX + AGE, absorbing CHAS and RAD
# algo = pyhdfe.create(get_np_columns(df, ['CHAS', 'RAD'], False))
# residualized = algo.residualize(get_np_columns(df, ['target', 'CRIM', 'ZN', 'INDUS', 'NOX', 'AGE'], False))
# model = sm.OLS(residualized[:,0], add_intercept(residualized[:, [1, 2, 3, 4, 5]]))
# results = model.fit()
# print("target~CRIM + ZN + INDUS + NOX + AGE, absorb(CHAS, RAD)")
# print(results.summary())
print(np.mean(get_np_columns(df, ['target'], False)))
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from linearmodels.iv.absorbing import AbsorbingLS
import pyhdfe
from utils import add_intercept, get_np_columns
from sklearn.datasets import load_boston

# details about the dataset can be found at https://www.kaggle.com/crawford/80-cereals
df = pd.read_csv('/home/abom/Downloads/dataset_cereal/cereal.csv')
print(list(df))

#results = smf.ols(formula='rating ~ fat + protein + carbo + sugars', data=df).fit()
#print(results.summary())

print(get_np_columns(df, ['cups'], False)[:10])
# absorb the shelf fixed effect
algo = pyhdfe.create(get_np_columns(df, ['shelf'], False))

# reference output:
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.759
# Model:                            OLS   Adj. R-squared:                  0.745
# Method:                 Least Squares   F-statistic:                     56.55
# Date:                Mon, 07 Dec 2020   Prob (F-statistic):           1.71e-21
# Time:                        09:15:25   Log-Likelihood:                -252.82
# No. Observations:                  77   AIC:                             515.6
# Df Residuals:                      72   BIC:                             527.4
# Df Model:                           4
# Covariance Type:            nonrobust
# ==============================================================================
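# A completion sketch for the reference output above. Assumption: the absorbed
# regression is rating ~ fat + protein + carbo + sugars with shelf fixed
# effects, matching the commented-out formula earlier (Df Model: 4 and the
# 77 observations are consistent with that reading).
res_cereal = algo.residualize(
    get_np_columns(df, ['rating', 'fat', 'protein', 'carbo', 'sugars'], False))
model_cereal = sm.OLS(res_cereal[:, 0], add_intercept(res_cereal[:, [1, 2, 3, 4]]))
print(model_cereal.fit().summary())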