Example #1
    def __init__(self,
                 df,
                 target,
                 predictors,
                 ids,
                 cluster_ids=None,
                 drop_singletons=True):
        """
        Args:
            target (string): name of target variable
            predictors (list of strings): names of predictors
            ids (list of strings): names of variables to be absorbed
            df (pandas Dataframe): dataframe containing referenced data
                                    which includes target, predictors and ids
        """
        self.df = df
        self.algo = pyhdfe.create(ids=get_np_columns(df, ids),
                                  cluster_ids=get_np_columns(df, cluster_ids),
                                  drop_singletons=drop_singletons,
                                  degrees_method='pairwise')
        self.all_names = [target] + predictors
        # Partial out the absorbed fixed effects from every column at once.
        self.residualized = self.algo.residualize(
            get_np_columns(df, [target] + predictors + cluster_ids))
        # Build a patsy formula without an intercept ('-1'); the absorbed
        # fixed effects already span the constant.
        self.formula = target + '~' + '+'.join(predictors) + '-1'
        # Reassemble the residualized columns into a labeled DataFrame.
        df_residualized = pd.DataFrame(
            self.residualized[:, :len(self.all_names)],
            columns=self.all_names)

        y, X = dmatrices(self.formula,
                         data=df_residualized,
                         return_type='dataframe')
        self.model = sm.OLS(y, X)
        if not cluster_ids:
            # OLS never sees the absorbed fixed effects, so the degrees of
            # freedom they consume must be subtracted by hand.
            self.model.df_resid = self.residualized.shape[0] - len(
                predictors) - self.algo.degrees
        else:
            # With clustered errors, base df_resid on the smallest number
            # of clusters across the clustering variables.
            clusters = get_np_columns(
                df, cluster_ids)[~self.algo._singleton_indices]
            min_cluster_count = min(
                np.unique(clusters[:, i]).shape[0]
                for i in range(clusters.shape[1]))
            self.model.df_resid = min_cluster_count - len(predictors)
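
    # A minimal usage sketch (hypothetical: the enclosing class is not shown
    # above, so the name "AbsorbingOLS" below is assumed for illustration):
    #
    # model = AbsorbingOLS(df, target='ln_wage',
    #                      predictors=['hours_log'],
    #                      ids=['idcode', 'year'],
    #                      cluster_ids=['idcode'])
    # results = model.fit()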
Example #2
    def fit(self, **kwargs):
        if 'cov_type' in kwargs:
            # Cluster-robust covariance: drop the observations pyhdfe
            # removed as singletons so the group labels line up with the
            # residualized data.
            groups = get_np_columns(self.df, kwargs['groups'])
            cleaned_groups = groups[np.logical_not(
                self.algo._singleton_indices)]
            return self.model.fit(cov_type='cluster',
                                  cov_kwds={'groups': cleaned_groups})
        return self.model.fit(**kwargs)
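
    # For reference, the equivalent direct statsmodels call for
    # cluster-robust errors (a sketch; y, X and groups stand in for the
    # residualized design and the cleaned cluster labels):
    #
    # sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': groups})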
Example #3
#results = smf.ols(formula='wks_work~1', data=df).fit()
# y = np.asarray([[-100, 100, -100, 1234]]).T
# X = np.asarray([[1,2 ,3, 4], [1, 1, 1, 1]]).T
# results = sm.OLS(y, X).fit()
model = smf.ols(formula='ttl_exp~wks_ue', data=df)
results = model.fit()
print(results.predict()[-10:])
print(results.summary())
#results = smf.wls(formula='wks_work~ttl_exp', data=df, weights=np.linspace(1, 13452, 13452)).fit()
# freq_weights treats row i as if it occurred weight_i times, so the
# effective sample size is the sum of the weights rather than 13452.
model = smf.glm(formula='ttl_exp~wks_ue',
                freq_weights=np.linspace(1, 13452, 13452),
                data=df)
results = model.fit()
print(results.summary())
# statsmodels' GLM stores the fitted mean in model.mu after fit().
lin_pred = model.mu
target = get_np_columns(df, ['ttl_exp']).squeeze()
resid = target - lin_pred
# Weighted residual degrees of freedom: the sum of the frequency weights
# (13452 * 13453 / 2 = 90,484,878) minus the 2 estimated parameters.
mse_resid = np.sum(resid**2) / 90484876
explained_sum_of_squares = np.sum(
    (target - np.mean(target))**2) - np.sum(resid**2)
# The divisor 1 is df_model: a single predictor (wks_ue).
mse_model = explained_sum_of_squares / 1
F = mse_model / mse_resid
print(F)
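# For the unweighted OLS fit above these quantities are available directly
# (results.fvalue, results.mse_model, results.mse_resid); GLM results do
# not expose an F statistic, hence the manual computation here.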
#print("df_model", results.df_model)
#print("df_resid", results.df_resid)
Example #4
#
######################################################################### PYHDFE TEST

# for a in list(enumerate(list(df))):
#     print(a)

df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())


algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False),
                     degrees_method='pairwise')
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))

print(algo.degrees)

#model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, 1]))
results = model.fit()
print(results.summary())

#print(add_intercept(residualized[:,1])[:10])

ids = get_np_columns(df, ['idcode', 'year'], False)
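
# A possible cross-check with linearmodels (a sketch, assuming the absorbed
# variables are passed as categorical columns; not run here):
# from linearmodels.iv.absorbing import AbsorbingLS
# absorb = df[['idcode', 'year']].astype('category')
# mod = AbsorbingLS(df['ln_wage'],
#                   add_intercept(get_np_columns(df, ['hours_log'], False)),
#                   absorb=absorb)
# print(mod.fit().summary)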
Example #5
##print(res.summary())
#
######################################################################### PYHDFE TEST

# Print (index, column name) pairs for the dataframe.
for a in enumerate(df.columns):
    print(a)

df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False))
residualized = algo.residualize(
    get_np_columns(df, ['ln_wage', 'hours_log'], False))

#model = sm.OLS(residualized[:,0], add_intercept(residualized[:,1]))
#ln_wage ~ hours_log, absorb(year)
#                            OLS Regression Results
#==============================================================================
#Dep. Variable:                      y   R-squared:                       0.005
#Model:                            OLS   Adj. R-squared:                  0.005
#Method:                 Least Squares   F-statistic:                     69.95
#Date:                Sat, 05 Dec 2020   Prob (F-statistic):           6.67e-17
#Time:                        13:04:23   Log-Likelihood:                -8305.1
#No. Observations:               13452   AIC:                         1.661e+04
#Df Residuals:                   13450   BIC:                         1.663e+04
#Df Model:                           1
Example #6
# Method:                 Least Squares   F-statistic:                     22.33
# Date:                Sun, 06 Dec 2020   Prob (F-statistic):           3.14e-20
# Time:                        14:58:02   Log-Likelihood:                -1715.7
# No. Observations:                 506   AIC:                             3443.
# Df Residuals:                     500   BIC:                             3469.
# Df Model:                           5
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# x1            -0.2089      0.049     -4.294      0.000      -0.305      -0.113
# x2             0.0679      0.018      3.745      0.000       0.032       0.104
# x3            -0.2280      0.085     -2.672      0.008      -0.396      -0.060
# x4            -9.4248      5.506     -1.712      0.088     -20.242       1.392
# x5            -0.0141      0.018     -0.774      0.439      -0.050       0.022
# const      -6.592e-17      0.321  -2.05e-16      1.000      -0.631       0.631
# ==============================================================================
# Omnibus:                      172.457   Durbin-Watson:                   0.904
# Prob(Omnibus):                  0.000   Jarque-Bera (JB):              532.297
# Skew:                           1.621   Prob(JB):                    2.59e-116
# Kurtosis:                       6.839   Cond. No.                         480.
# ==============================================================================
# algo = pyhdfe.create(get_np_columns(df, ['CHAS', 'RAD'], False))
#
# residualized = algo.residualize(get_np_columns(df, ['target', 'CRIM', 'ZN', 'INDUS', 'NOX', 'AGE'], False))
# model = sm.OLS(residualized[:,0], add_intercept(residualized[:, [1, 2, 3, 4, 5]]))
# results = model.fit()
# print("target~CRIM + ZN + INDUS + NOX + AGE, absorb(CHAS, RAD)")
# print(results.summary())
print(np.mean(get_np_columns(df, ['target'], False)))
Example #7
import numpy as np
import pandas as pd
from linearmodels.iv.absorbing import AbsorbingLS
import pyhdfe
from utils import add_intercept, get_np_columns

from sklearn.datasets import load_boston

# details about dataset can be found at https://www.kaggle.com/crawford/80-cereals
df = pd.read_csv('/home/abom/Downloads/dataset_cereal/cereal.csv')

print(list(df))

#results = smf.ols(formula='rating ~ fat + protein + carbo + sugars', data=df).fit()
#print(results.summary())

print(get_np_columns(df, ['cups'], False)[:10])

algo = pyhdfe.create(get_np_columns(df, ['shelf'], False))

#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.759
# Model:                            OLS   Adj. R-squared:                  0.745
# Method:                 Least Squares   F-statistic:                     56.55
# Date:                Mon, 07 Dec 2020   Prob (F-statistic):           1.71e-21
# Time:                        09:15:25   Log-Likelihood:                -252.82
# No. Observations:                  77   AIC:                             515.6
# Df Residuals:                      72   BIC:                             527.4
# Df Model:                           4
# Covariance Type:            nonrobust
# ==============================================================================
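
# A sketch completing the pyhdfe flow above (an assumption: the goal is
# rating ~ fat + protein + carbo + sugars with shelf absorbed, mirroring
# the wage examples):
import statsmodels.api as sm

residualized = algo.residualize(
    get_np_columns(df, ['rating', 'fat', 'protein', 'carbo', 'sugars'],
                   False))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, 1:]))
results = model.fit()
print(results.summary())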