def ols_cluster_robust(formula, cluster, covs, coef):
    """Fit an OLS model with cluster-robust covariance.

    Shares its signature with :func:`~gee_cluster`.

    Parameters
    ----------
    formula : str
        Formula handed to ``OLS.from_formula``.
    cluster : iterable
        Objects exposing a ``.values`` attribute; their values are stacked
        into an array and passed to ``long_covs`` together with ``covs``.
    covs : object
        Covariates forwarded unchanged to ``long_covs``.
    coef : object
        Forwarded unchanged to ``get_ptc`` along with the fitted result.

    Returns
    -------
    Whatever ``get_ptc`` returns for the fitted result and ``coef``.
    """
    stacked_values = np.array([feature.values for feature in cluster])
    long_frame = long_covs(covs, stacked_values)

    # The 'id' column of the long-format frame defines the clusters used for
    # the robust covariance estimate.
    fit_options = {'groups': long_frame['id']}
    ols_model = OLS.from_formula(formula, data=long_frame)
    fitted = ols_model.fit(cov_type='cluster', cov_kwds=fit_options)
    return get_ptc(fitted, coef)
def capm(y: pd.Series, bases: pd.DataFrame, rf=0., fee=0.):
    """Regress the returns of ``y`` on the returns of ``bases`` (CAPM style).

    Parameters
    ----------
    y : pd.Series
        Series whose percentage changes are the dependent variable.
    bases : pd.DataFrame
        One column per base series; their percentage changes (minus ``rf``)
        are the regressors. Column names are inserted into the formula as-is.
    rf : float, default 0.
        Rate divided by ``_freq(y.index)`` before being subtracted from the
        returns (presumably an annual risk-free rate -- confirm).
    fee : float, default 0.
        Rate divided by ``_freq(y.index)`` and added to ``rf`` for the part
        of the proxy portfolio not allocated to the bases.

    Returns
    -------
    dict
        Keys ``'alpha'``, ``'betas'``, ``'cumproxy'``, ``'model'`` and
        ``'residual'``.
    """
    periods = _freq(y.index)
    rf = rf / periods
    fee = fee / periods

    raw_returns = y.pct_change()
    excess = raw_returns - rf
    # The subtraction may drop the name (e.g. if rf is a Series), so restore
    # it: the formula below refers to the column by y.name.
    excess.name = y.name
    base_excess = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    # R = alpha + rf + beta * (Rm - rf)
    regressors = '+'.join(bases.columns)
    formula = "Q('{}') ~ {}".format(y.name, regressors)
    model = OLS.from_formula(formula, base_excess.join(excess)).fit()

    coefficients = model.params
    alpha = coefficients['Intercept'] * periods
    betas = coefficients[bases.columns]

    # reconstruct artificial portfolio
    unallocated = 1 - betas.sum()
    proxy = base_excess @ betas + unallocated * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    leftover = raw_returns - cumproxy.pct_change()
    residual = (1 + leftover).cumprod()

    result = {}
    result['alpha'] = alpha
    result['betas'] = betas
    result['cumproxy'] = cumproxy
    result['model'] = model
    result['residual'] = residual
    return result
def from_r_object(cls, rsum, ci=None, debug=False):
    """
    Rebuild a model from an rpy2 summary object and, optionally, its
    confidence intervals.

    Such objects can be stored from R with
        save(objname, file=file_name)
    and read back in Python through rpy2 with
        r['load'](file_name)['objname']

    Parameters
    ----------
    rsum : R object
        R summary of a fitted model, typically the result of
        "summary(fitted)" (in R).
    ci : R object
        Confidence intervals of the fitted model, typically the result of
        "confint(fitted)" (in R).
    debug : bool, default False
        If True, print debug messages.

    Raises
    ------
    NotImplementedError
        If the summary, once converted to a dict, has no 'terms' entry.
    """
    summary_items = cls._r_as_dict(None, rsum)

    if 'terms' not in summary_items:
        raise NotImplementedError(
            "Interpreting r objects inside Python is only supported "
            "for few estimators. More will work using "
            "RModel.from_rdata() directly.")

    # The first line of the printed 'terms' entry is the model formula.
    formula = str(summary_items['terms']).splitlines()[0]

    # patsy parses the formula into terms (interactions and function calls
    # included); the leading term is skipped.
    # NOTE(review): the resulting list is not used further down -- confirm
    # whether it was meant to supply the columns of the fake dataset.
    description = ModelDesc.from_formula(formula)
    all_terms = description.rhs_termlist + description.lhs_termlist
    term_names = [term.name() for term in all_terms][1:]

    # from_formula() below requires some pd.DataFrame, although its content
    # does not seem to matter.
    placeholder = pd.DataFrame(-1, index=[0], columns=[0])

    # Build a regular OLS model first and swap its class afterwards, so that
    # statsmodels' machinery does the heavy lifting.
    model = OLS.from_formula(formula, placeholder)
    model.__class__ = RModel
    # From here on the object is an RModel.
    model._initialize(debug=debug)

    attributes = model._inspect_R(rsum, ci=ci)
    return model._package_attrs(attributes)
def mixed_model_cluster(formula, cluster, covs, coef):
    """Fit a mixed model grouped by ``'id'``.

    Shares its signature with :func:`~gee_cluster`.

    Parameters
    ----------
    formula : str
        Formula handed to both ``OLS.from_formula`` and
        ``MixedLM.from_formula``.
    cluster : iterable
        Objects exposing a ``.values`` attribute; their values are stacked
        into an array and passed to ``long_covs`` together with ``covs``.
    covs : object
        Covariates forwarded unchanged to ``long_covs``.
    coef : object
        Forwarded unchanged to ``get_ptc`` along with the fitted result.

    Returns
    -------
    Whatever ``get_ptc`` returns for the fitted result and ``coef``.
    """
    stacked_values = np.array([feature.values for feature in cluster])
    long_frame = long_covs(covs, stacked_values)

    # TODO: remove this once newer version of statsmodels is out.
    # Seeding the fixed effects with the plain OLS estimates speeds up
    # convergence of the mixed model.
    ols_params = OLS.from_formula(formula, data=long_frame).fit().params

    mixed_model = MixedLM.from_formula(formula, groups='id', data=long_frame)
    fitted = mixed_model.fit(
        start_params={'fe': ols_params},
        reml=False,
        method='bfgs',
    )
    return get_ptc(fitted, coef)
def from_formula(cls, formula, data, command='lm', libraries=None,
                 debug=False, **kwargs):
    """
    Estimate a model by passing a formula and data, in the spirit of
    statsmodels.api.OLS.from_formula().

    Additionally supports the following arguments:

    Parameters
    ----------
    command : string, default 'lm'
        R command used for the estimation.
    libraries : list-like of strings, default empty
        R libraries which should be loaded before the estimation.
        ``None`` (the default) is treated as an empty list.
    debug : bool, default False
        If True, print debug messages.
    **kwargs : additional arguments
        Arguments to be passed to the R command.

    Returns
    -------
    The model object created by ``OLS.from_formula`` with its class swapped
    to ``RModel`` and a ``_backstage`` dict attached.
    """
    # A fresh list per call: the value is stored on the model below, so a
    # shared mutable default would be visible to (and modifiable through)
    # every model created without an explicit ``libraries`` argument.
    if libraries is None:
        libraries = []

    # Creating the OLS object and only then hijacking it allows us to best
    # profit of statsmodels' machinery:
    mod = OLS.from_formula(formula, data)
    mod.__class__ = RModel
    # This is now an RModel:
    mod._initialize(debug=debug)

    # This holds stuff statsmodels is not aware of, and fit() needs:
    mod._backstage = {
        'libraries': libraries,
        'command': command,
        'full_data': data,
        'kwargs': kwargs,
    }

    return mod
def check_alter_models(pdag, df, score_method=ScoreMethod.BIC):
    """Score leave-one-regressor-out alternatives of every model in ``pdag``.

    For each entry of ``pdag.models`` and each of its independents, the model
    is refitted on ``df`` without that independent. The difference between the
    alternative's score and the stored score is written to ``delta_score`` on
    every edge returned by ``pdag.get_edge(regressor, dependent,
    get_all=True)``.

    Parameters
    ----------
    pdag : PDAG
        Graph whose ``parameterized`` attribute is truthy.
    df : pd.DataFrame
        Data used to fit the alternative models.
    score_method : ScoreMethod, default ScoreMethod.BIC
        BIC selects ``__bic``; any other member selects ``__aic``.

    Raises
    ------
    RuntimeError
        On an argument of the wrong type, a graph that is not parameterized,
        or a model entry whose ``"model"`` is neither ``"linear"`` nor
        ``"mnlogit"``.
    """
    if not isinstance(pdag, PDAG):
        raise RuntimeError("'pdag' must be an instance of PDAG class.")
    if not pdag.parameterized:
        raise RuntimeError("'pdag' has to be parameterized first.")
    if not isinstance(df, pd.DataFrame):
        raise RuntimeError("'df' must be a pandas DataFrame.")
    if not isinstance(score_method, ScoreMethod):
        raise RuntimeError("Invalid 'score_method'.")

    if score_method == ScoreMethod.BIC:
        score = __bic
    else:
        score = __aic

    # Supported model kinds ("logit" is not handled).
    estimators = {"linear": OLS, "mnlogit": MNLogit}

    for fitted in pdag.models:
        target = fitted["dependent"]
        predictors = fitted["independents"]

        for dropped, cause in zip(predictors, fitted["regressors"]):
            # Alternative model: every independent except the dropped one,
            # falling back to the intercept-only model when none remain.
            remaining = [term for term in predictors if term != dropped]
            rhs = " + ".join(remaining) if remaining else "1"
            formula = target + " ~ " + rhs

            estimator = estimators.get(fitted["model"])
            if estimator is None:
                raise RuntimeError("Invalid model '" + fitted["model"] + "'.")

            candidate = estimator.from_formula(formula=formula, data=df)
            delta = score(candidate.fit()) - fitted["score"]

            # Every edge from the dropped regressor to the target gets the
            # same score difference.
            for edge in pdag.get_edge(cause, target, get_all=True):
                edge.delta_score = delta
# Derived regressors: calendar month of each observation and the number of
# tests divided by (population / 1e6).
# NOTE(review): `obs`, `state`, `india_pop` and `state_code_lookup` are defined
# outside this fragment.
obs["month"] = obs.index.month
obs["tests_per_mill"] = obs["tested"] / (india_pop[state_code_lookup[state]] / 1e6)

# define regression formula as function of taylor approximation order for rate-control function
def scaling(order: int) -> str:
    """Return the formula with powers 1..``order`` of ``tests_per_mill``."""
    powers = " + ".join(
        f"np.power(tests_per_mill, {i + 1})"
        for i in range(order))  # test rate exponentiation terms
    return f"confirmed ~ -1 + tested + C(month)*({powers})"  # no intercept, regress on tests, interact month indicator with powers

# select order by minimizing AIC where coefficient on number of tests > 0
# One fit per order 1..9; models[i] corresponds to order i + 1.
models = [
    OLS.from_formula(scaling(order), data=obs).fit()
    for order in range(1, 10)
]
# NOTE(review): min() raises ValueError when no fit has a positive "tested"
# coefficient, because the filtered generator is then empty.
(model_idx, selected_model) = min(
    ((i, each) for (i, each) in enumerate(models) if each.params["tested"] > 0),
    key=lambda _: _[1].aic)

# Summary table; the selected order is marked with "*".
print(" i aic r2 beta")
for (i, model) in enumerate(models):
    print("*" if i == model_idx else " ", i + 1, model.aic.round(2),
          model.rsquared.round(2), model.params["tested"].round(2))

scale_factor = selected_model.params["tested"]
# NOTE(review): 0.2093 is a hard-coded factor whose origin is not shown here;
# the label suggests a national-level coefficient -- confirm.
plt.plot(0.2093 * df[state][:, "delta", "tested"], label="national test-scaled")
plt.plot(scale_factor * df[state][:, "delta", "tested"], label="state test-scaled")
'Cumulative total per thousand': "total_per_thousand",
'Daily change in cumulative total per thousand': "delta_per_thousand",
'7-day smoothed daily change': "smoothed_delta",
'7-day smoothed daily change per thousand': "smoothed_delta_per_thousand",
'Short-term positive rate': "positivity",
'Short-term tests per case': "tests_per_case"
}

# Load the testing observations and keep the rows whose ISO code is "IND".
# dropna() runs before the column selection, so a row with a missing value in
# any column of the CSV is discarded. The remaining columns are those named by
# the keys of `schema`, renamed to the corresponding values.
testing = pd.read_csv("data/covid-testing-all-observations.csv", parse_dates=["Date"])
testing = testing[testing["ISO code"] == "IND"]\
    .dropna()\
    [schema.keys()]\
    .rename(columns = schema)
# NOTE(review): the `date` column (and `daily_tests` used below) presumably
# come from `schema` entries above this fragment -- confirm.
testing["month"] = testing.date.dt.month

def formula(order: int) -> str:
    """Return the formula with powers 1..``order`` of ``delta_per_thousand``."""
    powers = " + ".join(f"np.power(delta_per_thousand, {i + 1})" for i in range(order))
    return f"smoothed_delta ~ -1 + daily_tests + C(month)*({powers})"

# Fit the third-order specification and print its summary table.
model = OLS.from_formula(formula(order=3), data=testing).fit()
print(summary_col(model, regressor_order=["daily_tests"], drop_omitted=True))

# NOTE(review): `df` is not defined in this fragment, and 0.2093 is a
# hard-coded factor whose origin is not shown here -- confirm both.
plt.plot(0.2093 * df["TT"][:, "delta", "tested"], label="test-scaled")
plt.plot(df["TT"][:, "delta", "confirmed"], label="confirmed")
plt.legend()
plt.show()
def _cause_terms(name, categorical_columns, levels):
    """Return ``(term, extra_attrs)`` pairs for the coefficients of one cause.

    A numeric cause contributes a single term named after the column. A
    categorical cause contributes one ``C(name)[T.level]`` term per
    non-reference level; the first (sorted) level is the reference.
    """
    if name not in categorical_columns:
        return [(name, {})]
    ref_level = levels[name][0]
    return [
        ("C(" + name + ")[T." + str(lvl) + "]",
         {"ref_level": ref_level, "level": lvl})
        for lvl in levels[name][1:]
    ]


def parameterize(pdag, df, categorical_columns=None, row_selection=None,
                 scale_method=None, score_method=ScoreMethod.BIC,
                 test_alter_models=False, verbose=False):
    """Fit one regression per vertex of ``pdag`` and return a new, parameterized PDAG.

    Each vertex is regressed on its causes (``causes_of(v.name, pdag.edges)``):
    OLS when the vertex is numeric, MNLogit when it is categorical. Vertices
    without causes are fitted against an intercept-only model, which only
    contributes to the total score. Coefficients are stored on new directed
    edges, model statistics on the new vertices, and a summary of each model
    with causes in ``new_pdag.models``.

    Parameters
    ----------
    pdag : PDAG
        Source graph. Returned unchanged if it has ``type == "correlation"``.
    df : pd.DataFrame
        Data with one column per vertex name. Rows containing missing values
        are dropped (with a warning). The caller's frame is never modified.
    categorical_columns : list of str, optional
        Columns to treat as categorical. Columns whose dtype is not one of
        float64/int64/float32/int32 are added automatically (with a warning).
        The caller's list is not modified.
    row_selection : sequence, optional
        Row indexer applied through ``df.loc`` after scaling; it is ignored
        unless its length equals the number of rows of the NA-free frame.
    scale_method : ScaleMethod, optional
        ``Normalize`` rescales numeric columns to (x - min) / (max - min);
        ``Standardize`` to (x - mean) / std.
    score_method : ScoreMethod, default ScoreMethod.BIC
        BIC selects ``__bic``; any other member selects ``__aic``.
    test_alter_models : bool, default False
        If True, run ``check_alter_models`` on the result.
    verbose : bool, default False
        If True, print each fitted formula.

    Returns
    -------
    PDAG
        The new parameterized graph, or ``pdag`` itself for a correlation
        graph.

    Raises
    ------
    RuntimeError
        On invalid argument types or when ``df`` has fewer columns than the
        graph has vertices.
    """
    if not isinstance(pdag, PDAG):
        raise RuntimeError("'pdag' must be an instance of PDAG class.")
    if not isinstance(df, pd.DataFrame):
        raise RuntimeError("'df' must be a pandas DataFrame.")
    if df.shape[1] < len(pdag.vertices):
        raise RuntimeError("Number of column is smaler than number of vertex.")
    if scale_method is not None and not isinstance(scale_method, ScaleMethod):
        raise RuntimeError("Invalid 'scale_method' value.")
    if not isinstance(score_method, ScoreMethod):
        raise RuntimeError("Invalid 'score_method'.")

    # Work on a private copy of the list: detected columns are appended
    # below, which must neither leak into the next call (shared default) nor
    # alter the caller's list.
    if categorical_columns is None:
        categorical_columns = []
    else:
        categorical_columns = list(categorical_columns)

    # drop NA values
    if df.isnull().values.any():
        warnings.warn("Records with missing values are removed.")
        df = df.dropna()

    # return pdag if it is a correlation graph
    if hasattr(pdag, 'type') and pdag.type == "correlation":
        return pdag

    # detect un-selected categorical columns
    numeric_dtypes = [np.float64, np.int64, np.float32, np.int32]
    for cn in list(df):
        if cn not in categorical_columns and df[cn].dtype not in numeric_dtypes:
            warnings.warn("'" + cn + "' is considered categorical.")
            categorical_columns.append(cn)

    # levels of categorical columns (sorted; the first one is the reference)
    levels = {cn: sorted(set(df[cn])) for cn in categorical_columns}

    # scale numeric columns if necessary; copy first so that the caller's
    # DataFrame is left untouched
    numeric_names = [nm for nm in df.columns.values
                     if nm not in categorical_columns]
    if scale_method == ScaleMethod.Normalize:
        df = df.copy()
        for nm in numeric_names:
            df[nm] = (df[nm] - df[nm].min()) / (df[nm].max() - df[nm].min())
    elif scale_method == ScaleMethod.Standardize:
        df = df.copy()
        for nm in numeric_names:
            df[nm] = (df[nm] - df[nm].mean()) / df[nm].std()

    # row selection
    if row_selection is not None and len(row_selection) == df.shape[0]:
        df = df.loc[row_selection, :]

    # create the result pdag initialized with all nodes and undirected edges
    new_pdag = PDAG(pdag.title)
    for v in pdag.vertices:
        new_pdag.add_vertex(Vertex(v.id, v.name, VertexDataType.Unset))
    for e in pdag.edges:
        if e.direct_type == EdgeType.Nondirected.name:
            new_pdag.add_edge(e.v1.id, e.v2.id, EdgeType.Nondirected)

    # model score
    score = __bic if score_method == ScoreMethod.BIC else __aic
    model_score = 0
    models = []

    for v in pdag.vertices:
        new_v = new_pdag.get_vertex_by_name(v.name)
        is_categorical = v.name in categorical_columns
        if is_categorical:
            new_v.data_type = VertexDataType.Categorical.name
        else:
            new_v.data_type = VertexDataType.Numeric.name

        cause_names = causes_of(v.name, pdag.edges)
        new_v.causes = cause_names
        estimator = MNLogit if is_categorical else OLS

        if len(cause_names) == 0:
            # null model: only its score contributes to the total
            formula = v.name + " ~ 1"
            if verbose:
                print(formula)
            res = estimator.from_formula(formula=formula, data=df).fit()
            model_score += score(res)
            continue

        # categorical causes enter the formula wrapped in C(...)
        indp = ["C(" + nm + ")" if nm in categorical_columns else nm
                for nm in cause_names]
        formula = v.name + " ~ " + " + ".join(indp) + " + 1"
        if verbose:
            print(formula)

        res = estimator.from_formula(formula=formula, data=df).fit()
        vertex_score = score(res)
        model_score += vertex_score
        new_v.model_score = vertex_score

        if not is_categorical:
            # linear regression w/ ols
            new_v.model_type = "linear"
            new_v.fvalue = res.fvalue
            new_v.f_pvalue = res.f_pvalue
            new_v.rsquared = res.rsquared
            new_v.rsquared_adj = res.rsquared_adj

            # coefficient tables keyed by term name
            tvalues = res.tvalues.to_dict()
            pvalues = res.pvalues.to_dict()
            bse = res.bse.to_dict()
            params = res.params.to_dict()
            conf_int = res.conf_int().to_dict(orient="index")

            new_v.const = {
                "beta": params["Intercept"],
                "se": bse["Intercept"],
                "tvalue": tvalues["Intercept"],
                "pvalue": pvalues["Intercept"],
                "conf_int": list(conf_int["Intercept"].values()),
            }

            # one directed edge per coefficient of every cause
            for nm in cause_names:
                new_v2 = new_pdag.get_vertex_by_name(nm)
                for term, extra in _cause_terms(nm, categorical_columns, levels):
                    new_edge = Edge(new_v2, new_v, EdgeType.Directed)
                    new_edge.beta = params[term]
                    new_edge.se = bse[term]
                    new_edge.tvalue = tvalues[term]
                    new_edge.pvalue = pvalues[term]
                    new_edge.conf_int = list(conf_int[term].values())
                    for attr, value in extra.items():
                        setattr(new_edge, attr, value)
                    new_pdag.edges.append(new_edge)

            models.append(
                build_model(dependent=v.name, independents=indp,
                            regressors=cause_names, model='linear',
                            rsquared=res.rsquared,
                            rsquared_adj=res.rsquared_adj,
                            fvalue=res.fvalue, f_pvalue=res.f_pvalue,
                            score=vertex_score, betas=params,
                            tvalues=tvalues, pvalues=pvalues, se=bse,
                            conf_int=conf_int))
        else:
            # multi-nomial logistic regression w/ mnlogit
            new_v.model_type = "mnlogit"
            target_levels = levels[v.name]
            new_v.levels = target_levels
            new_v.ll = res.llf
            new_v.llr = res.llr
            new_v.llr_pvalue = res.llr_pvalue
            new_v.rsquared_pseudo = res.prsquared

            # coefficient tables keyed by term name, then by equation index
            tvalues = res.tvalues.to_dict(orient='index')
            pvalues = res.pvalues.to_dict(orient='index')
            bse = res.bse.to_dict(orient='index')
            params = res.params.to_dict(orient='index')

            new_v.const = {}
            # Equation `eq` models the (eq + 1)-th level of the target
            # against its first (reference) level.
            for eq in range(len(target_levels) - 1):
                tar_level = target_levels[eq + 1]
                new_v.const[tar_level] = {
                    "beta": params["Intercept"][eq],
                    "se": bse["Intercept"][eq],
                    "tvalue": tvalues["Intercept"][eq],
                    "pvalue": pvalues["Intercept"][eq],
                }
                for nm in cause_names:
                    new_v2 = new_pdag.get_vertex_by_name(nm)
                    for term, extra in _cause_terms(nm, categorical_columns,
                                                    levels):
                        new_edge = Edge(new_v2, new_v, EdgeType.Directed)
                        new_edge.beta = params[term][eq]
                        new_edge.se = bse[term][eq]
                        new_edge.tvalue = tvalues[term][eq]
                        new_edge.pvalue = pvalues[term][eq]
                        for attr, value in extra.items():
                            setattr(new_edge, attr, value)
                        new_edge.tar_level = tar_level
                        new_pdag.edges.append(new_edge)

            models.append(
                build_model(dependent=v.name, independents=indp,
                            regressors=cause_names, model='mnlogit',
                            ll=res.llf, llr=res.llr,
                            llr_pvalue=res.llr_pvalue,
                            rsquared_pseudo=res.prsquared,
                            score=vertex_score, betas=params,
                            tvalues=tvalues, pvalues=pvalues, se=bse))

    new_pdag.models = models
    new_pdag.score_method = score_method.name
    new_pdag.score = model_score
    new_pdag.parameterized = True

    if test_alter_models:
        check_alter_models(new_pdag, df, score_method)

    return new_pdag
def from_rda(cls, filename, objname, debug=False):
    """
    Load the summary of an R model from a .rda file as statsmodels-like
    estimation result. Such file can be created with the "save()" command
    in R.

    Parameters
    ----------
    filename : str
        Path of the file to load from.
    objname : str
        Name of the object to load from the file.
    debug : bool, default False
        If True, print debug messages.

    Examples
    --------
    If an object named "regsum" was saved in R with the command
        save(regsum, file = "/home/pietro/r_results.rda")
    then it can be reloaded by calling this command as
        res = RModel.from_rda("/home/pietro/r_results.rda", "regsum")
    """
    r['load'](filename)
    d_res = cls._r_as_dict(None, r[objname])

    # Evaluated for its effect on the R session: it binds `ci` there.
    # NOTE(review): presumably read back by _inspect_R -- confirm. The call
    # is best-effort; objects without a confint() method are tolerated.
    try:
        r("ci <- confint({})".format(objname))
    except embedded.RRuntimeError:
        pass

    # FIXME: while this works differently from the code building the
    # coefficients matrix in _inspect_R (which does not retrieve from R),
    # there is clearly room for de-duplication.
    coefs = cls._get_coeffs_mat(None, objname)
    items = list(coefs.index)

    try:
        # E.g. mfx marginal effects
        target = str(d_res['call'][1][1])
    except IndexError:
        # E.g. OLS
        # This is ugly...
        formula = str(d_res['terms']).splitlines()[0]
        target = formula.split(' ')[0].split('~')[0]
    else:
        formula = " ~ ".join([target, " + ".join(items)])

    # Placeholder dataset with one column per variable, built after the
    # try/except so that it exists whichever path determined the formula.
    data = pd.DataFrame(-1, index=[0], columns=[target] + items)

    # Creating the OLS object and only then hijacking it allows us to best
    # profit of statsmodels' machinery:
    mod = OLS.from_formula(formula, data)
    mod.__class__ = RModel
    # This is now an RModel:
    mod._initialize(debug=debug)

    attrs = mod._inspect_R(objname)
    wrap = mod._package_attrs(attrs)

    return wrap