示例#1
0
def ols_cluster_robust(formula, cluster, covs, coef):
    """Fit an OLS model with cluster-robust standard errors.

    Accepts the same arguments as :func:`~gee_cluster`.  The ``.values`` of
    every element of ``cluster`` are stacked into an array and combined with
    ``covs`` by ``long_covs``; the resulting table is used both as the
    regression data and, through its ``'id'`` column, as the grouping for the
    clustered covariance.  The fitted result is handed to ``get_ptc``
    together with ``coef`` and that value is returned.
    """
    feature_values = np.array([feature.values for feature in cluster])
    long_data = long_covs(covs, feature_values)
    model = OLS.from_formula(formula, data=long_data)
    fitted = model.fit(cov_type='cluster',
                       cov_kwds={'groups': long_data['id']})
    return get_ptc(fitted, coef)
示例#2
0
def capm(y: pd.Series, bases: pd.DataFrame, rf=0., fee=0.):
    """Regress the returns of ``y`` on the returns of ``bases`` (CAPM style).

    ``rf`` and ``fee`` are divided by ``_freq(y.index)`` before use
    (presumably converting annual rates to per-period rates -- confirm
    against ``_freq``).  Percentage changes of ``y`` and of every column of
    ``bases`` have the per-period ``rf`` subtracted and are fed to an OLS fit
    of ``y`` on all base columns.

    Returns a dict with:

    * ``'alpha'``    -- regression intercept multiplied by the frequency,
    * ``'betas'``    -- fitted coefficients of the base columns,
    * ``'cumproxy'`` -- cumulative product of ``1 + proxy`` where the proxy is
      the beta-weighted base excess returns plus ``(1 - sum(betas))`` times
      ``rf + fee``,
    * ``'model'``    -- the fitted OLS result,
    * ``'residual'`` -- cumulative product of one plus the difference between
      the returns of ``y`` and the returns of ``cumproxy``.
    """
    periods = _freq(y.index)
    rf = rf / periods
    fee = fee / periods

    asset_returns = y.pct_change()
    R = asset_returns - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    # R = alpha + rf + beta * (Rm - rf)
    rhs = '+'.join(bases.columns)
    regression_data = R_base.join(R)
    model = OLS.from_formula(f"Q('{y.name}') ~ {rhs}",
                             regression_data).fit()

    coefficients = model.params
    alpha = coefficients['Intercept'] * periods
    betas = coefficients[bases.columns]

    # Artificial portfolio: beta-weighted bases, remainder earning rf + fee.
    cash_weight = 1 - betas.sum()
    proxy = R_base @ betas + cash_weight * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # Residual portfolio: what the proxy does not explain, compounded.
    r = asset_returns - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        'alpha': alpha,
        'betas': betas,
        'cumproxy': cumproxy,
        'model': model,
        'residual': residual,
    }
示例#3
0
文件: __init__.py 项目: toobaz/rmodel
    def from_r_object(cls, rsum, ci=None, debug=False):
        """
        Build a model from an rpy2 summary object and, optionally, its
        confidence intervals.

        Such objects can be saved in R with
            save(objname, file=file_name)
        and loaded in Python via rpy2 with
            r['load'](file_name)['objname']

        Parameters
        ----------
        rsum : R object
            R summary of a fitted model.
            Typically produced with "summary(fitted)" (in R).
        ci : R object
            Confidence intervals of the fitted model
            Typically produced with "confint(fitted)" (in R).
        debug : bool, default False
            If True, print debug messages.

        Returns
        -------
        The value of ``_package_attrs`` applied to the attributes that
        ``_inspect_R`` extracts from ``rsum`` and ``ci``.

        Raises
        ------
        NotImplementedError
            If the converted summary has no 'terms' entry.
        """

        summary = cls._r_as_dict(None, rsum)

        if 'terms' not in summary:
            raise NotImplementedError(
                "Interpreting r objects inside Python is only supported "
                "for few estimators. More will work using "
                "RModel.from_rdata() directly.")

        # The formula is the first line of the textual form of 'terms'.
        formula = str(summary['terms']).splitlines()[0]

        # patsy gives us the list of terms of the formula. Columns for
        # interactions and functions end up in the list too; telling them
        # apart would currently be overkill.
        # NOTE(review): varnames is computed but not used below -- confirm
        # whether it was meant to provide the columns of the fake dataset.
        description = ModelDesc.from_formula(formula)
        all_terms = description.rhs_termlist + description.lhs_termlist
        varnames = [term.name() for term in all_terms][1:]

        # from_formula() below needs some pd.DataFrame, although it does not
        # seem to actually use it.
        placeholder = pd.DataFrame(-1, index=[0], columns=[0])

        # Build a regular OLS object first and only then turn it into an
        # RModel, so that statsmodels' machinery does the heavy lifting.
        mod = OLS.from_formula(formula, placeholder)
        mod.__class__ = RModel
        mod._initialize(debug=debug)

        return mod._package_attrs(mod._inspect_R(rsum, ci=ci))
示例#4
0
def mixed_model_cluster(formula, cluster, covs, coef):
    """Fit a mixed-effects model to the clusters.

    Accepts the same arguments as :func:`~gee_cluster`.  The ``.values`` of
    every element of ``cluster`` are stacked and combined with ``covs`` by
    ``long_covs``; the resulting table is grouped by its ``'id'`` column.
    The fitted result is passed to ``get_ptc`` together with ``coef`` and
    that value is returned.
    """
    long_data = long_covs(covs, np.array([f.values for f in cluster]))

    # TODO: remove this once newer version of statsmodels is out.
    # Seeding the fixed effects with the plain OLS estimates speeds up
    # convergence of the mixed model.
    ols_estimates = OLS.from_formula(formula, data=long_data).fit().params

    mixed = MixedLM.from_formula(formula, groups='id', data=long_data)
    fitted = mixed.fit(start_params={'fe': ols_estimates},
                       reml=False,
                       method='bfgs')

    return get_ptc(fitted, coef)
示例#5
0
文件: __init__.py 项目: toobaz/rmodel
    def from_formula(cls,
                     formula,
                     data,
                     command='lm',
                     libraries=None,
                     debug=False,
                     **kwargs):
        """
        Estimate a model by passing a formula and data, in the spirit of
        statsmodels.api.OLS.from_formula().

        Additionally supports the following arguments:

        Parameters
        ----------
        command : string, default 'lm'
            R command used for the estimation.
        libraries : list-like of strings, default empty
            R libraries which should be loaded before the estimation.
            ``None`` (the default) means no libraries.
        debug : bool, default False
            If True, print debug messages.
        **kwargs : additional arguments
            Arguments to be passed to the R command.

        Returns
        -------
        The model object, built as an OLS model, re-classed to RModel and
        carrying the extra settings in its ``_backstage`` attribute.
        """

        # A fresh list per call: a mutable default would be stored by
        # reference below and therefore shared by every model built without
        # an explicit ``libraries`` argument.
        if libraries is None:
            libraries = []

        # Creating the OLS object and only then hijacking it allows us to best
        # profit of statsmodels' machinery:
        mod = OLS.from_formula(formula, data)
        mod.__class__ = RModel
        # This is now an RModel:
        mod._initialize(debug=debug)

        # This holds stuff statsmodels is not aware of, and fit() needs:
        mod._backstage = {
            'libraries': libraries,
            'command': command,
            'full_data': data,
            'kwargs': kwargs
        }

        return mod
示例#6
0
def check_alter_models(pdag, df, score_method=ScoreMethod.BIC):
    """Score leave-one-regressor-out alternatives of every model in ``pdag``.

    For each model stored in ``pdag.models`` and each of its independent
    terms, the model is refitted on ``df`` without that term (intercept only
    if nothing is left).  The difference between the alternative's score and
    the stored ``model["score"]`` is written to ``delta_score`` of every edge
    returned by ``pdag.get_edge(regressor, dependent, get_all=True)``.

    Parameters
    ----------
    pdag : PDAG
        A parameterized graph.
    df : pandas.DataFrame
        Data the alternative models are fitted on.
    score_method : ScoreMethod
        ``ScoreMethod.BIC`` selects the BIC scorer, anything else the AIC one.

    Raises
    ------
    RuntimeError
        On an argument of the wrong type, a non-parameterized ``pdag`` or a
        model kind other than "linear" / "mnlogit".
    """
    if not isinstance(pdag, PDAG):
        raise RuntimeError("'pdag' must be an instance of PDAG class.")
    if not pdag.parameterized:
        raise RuntimeError("'pdag' has to be parameterized first.")

    if not isinstance(df, pd.DataFrame):
        raise RuntimeError("'df' must be a pandas DataFrame.")

    if not isinstance(score_method, ScoreMethod):
        raise RuntimeError("Invalid 'score_method'.")

    if score_method == ScoreMethod.BIC:
        score = __bic
    else:
        score = __aic

    for model in pdag.models:
        dep = model["dependent"]
        indeps = model["independents"]
        regressors = model["regressors"]
        for dropped, regressor in zip(indeps, regressors):
            # The alternative model keeps every term except ``dropped``.
            remaining = [term for term in indeps if term != dropped]
            rhs = " + ".join(remaining) if remaining else "1"
            formula = dep + " ~ " + rhs

            # Fit the alternative with the same estimator as the original.
            kind = model["model"]
            if kind == "linear":
                mod = OLS.from_formula(formula=formula, data=df)
            elif kind == "mnlogit":
                mod = MNLogit.from_formula(formula=formula, data=df)
            else:
                raise RuntimeError("Invalid model '" + model["model"] + "'.")
            delta = score(mod.fit()) - model["score"]

            # Every edge between the regressor and the dependent gets the
            # same delta.
            for edge in pdag.get_edge(regressor, dep, get_all=True):
                edge.delta_score = delta
# Month component of each observation's index (assumes a datetime-like
# index -- confirm where obs is built).
obs["month"] = obs.index.month
# Tests per million: "tested" divided by (population / 1e6), where the
# population is looked up in india_pop via state_code_lookup[state].
obs["tests_per_mill"] = obs["tested"] / (india_pop[state_code_lookup[state]] /
                                         1e6)


# define regression formula as function of taylor approximation order for rate-control function
def scaling(order: int) -> str:
    """Return the regression formula for a rate-control term of ``order``.

    The formula has no intercept, regresses ``confirmed`` on ``tested`` and
    interacts the month indicator ``C(month)`` with the sum of
    ``np.power(tests_per_mill, k)`` for ``k = 1 .. order``.
    """
    # One exponentiation term of the test rate per power 1..order.
    terms = [f"np.power(tests_per_mill, {exponent})"
             for exponent in range(1, order + 1)]
    return "confirmed ~ -1 + tested + C(month)*(" + " + ".join(terms) + ")"


# select order by minimizing AIC where coefficient on number of tests > 0
# One fitted OLS model per order 1..9; models[i] corresponds to order i + 1.
models = [
    OLS.from_formula(scaling(order), data=obs).fit() for order in range(1, 10)
]
# Among the models with a positive "tested" coefficient, keep the one with
# the lowest AIC together with its index. min() raises ValueError if no
# model qualifies.
(model_idx, selected_model) = min(
    ((i, each)
     for (i, each) in enumerate(models) if each.params["tested"] > 0),
    key=lambda _: _[1].aic)
# Summary table, one row per order; "*" marks the selected model.
print("  i aic     r2   beta")
for (i, model) in enumerate(models):
    print("*" if i == model_idx else " ", i + 1, model.aic.round(2),
          model.rsquared.round(2), model.params["tested"].round(2))
# The "tested" coefficient of the selected model is the scale factor.
scale_factor = selected_model.params["tested"]

# Plot the same test series under two scalings.
# NOTE(review): 0.2093 is presumably a national-level scale factor estimated
# elsewhere -- confirm its origin.
plt.plot(0.2093 * df[state][:, "delta", "tested"],
         label="national test-scaled")
plt.plot(scale_factor * df[state][:, "delta", "tested"],
         label="state test-scaled")
示例#8
0
    'Cumulative total per thousand': "total_per_thousand",
    'Daily change in cumulative total per thousand': "delta_per_thousand",
    '7-day smoothed daily change': "smoothed_delta",
    '7-day smoothed daily change per thousand': "smoothed_delta_per_thousand",
    'Short-term positive rate': "positivity",
    'Short-term tests per case': "tests_per_case"
}

# Load the testing observations, parsing the "Date" column as dates.
testing = pd.read_csv("data/covid-testing-all-observations.csv",
                      parse_dates=["Date"])
# Keep the rows whose ISO code is "IND", drop rows with missing values,
# select the columns named by the keys of schema and rename them to the
# corresponding schema values.
testing = testing[testing["ISO code"] == "IND"]\
            .dropna()\
            [schema.keys()]\
            .rename(columns = schema)
# Month component of the (renamed) date column.
testing["month"] = testing.date.dt.month


def formula(order: int) -> str:
    """Return the regression formula with test-rate powers up to ``order``.

    The formula has no intercept, regresses ``smoothed_delta`` on
    ``daily_tests`` and interacts the month indicator ``C(month)`` with the
    sum of ``np.power(delta_per_thousand, k)`` for ``k = 1 .. order``.
    """
    terms = [f"np.power(delta_per_thousand, {exponent})"
             for exponent in range(1, order + 1)]
    joined = " + ".join(terms)
    return "smoothed_delta ~ -1 + daily_tests + C(month)*(" + joined + ")"


# Fit the order-3 formula on the testing data and print a summary table
# with "daily_tests" listed first.
model = OLS.from_formula(formula(order=3), data=testing).fit()
print(summary_col(model, regressor_order=["daily_tests"], drop_omitted=True))

# Compare the scaled test series with the confirmed series for "TT".
# NOTE(review): 0.2093 is a hard-coded scale factor, presumably taken from a
# fitted "daily_tests"-style coefficient -- confirm its origin.
plt.plot(0.2093 * df["TT"][:, "delta", "tested"], label="test-scaled")
plt.plot(df["TT"][:, "delta", "confirmed"], label="confirmed")
plt.legend()
plt.show()
示例#9
0
def parameterize(pdag,
                 df,
                 categorical_columns=None,
                 row_selection=None,
                 scale_method=None,
                 score_method=ScoreMethod.BIC,
                 test_alter_models=False,
                 verbose=False):
    """Fit one regression per vertex of ``pdag`` and return a new PDAG.

    Every vertex is regressed on its causes (``causes_of``): numeric vertices
    with OLS, categorical vertices with multinomial logit.  Vertices without
    causes are fitted with an intercept-only model that contributes to the
    total score only.  Coefficients are stored on newly created directed
    edges, per-vertex statistics on the new vertices and one summary per
    fitted model (``build_model``) in ``models`` of the result.

    Parameters
    ----------
    pdag : PDAG
        Graph whose structure is parameterized.
    df : pandas.DataFrame
        Data; rows with missing values are dropped (with a warning).  The
        caller's frame is never modified.
    categorical_columns : list-like of str, optional
        Columns to treat as categorical.  Columns whose dtype is not one of
        float64/int64/float32/int32 are added automatically (with a
        warning).  The caller's object is never modified.
    row_selection : optional
        Row selector passed to ``df.loc``; only applied when its length
        equals the number of rows of ``df`` at that point, otherwise it is
        ignored.
    scale_method : ScaleMethod, optional
        ``Normalize`` (min-max) or ``Standardize`` (mean/std) scaling of the
        non-categorical columns.
    score_method : ScoreMethod
        ``ScoreMethod.BIC`` selects the BIC scorer, anything else the AIC one.
    test_alter_models : bool
        If True, run ``check_alter_models`` on the result.
    verbose : bool
        If True, print every formula before it is fitted.

    Returns
    -------
    PDAG
        The parameterized graph; ``pdag`` itself, unchanged, when it has a
        ``type`` attribute equal to "correlation".

    Raises
    ------
    RuntimeError
        On arguments of the wrong type or when ``df`` has fewer columns than
        ``pdag`` has vertices.
    """
    if not isinstance(pdag, PDAG):
        raise RuntimeError("'pdag' must be an instance of PDAG class.")

    if not isinstance(df, pd.DataFrame):
        raise RuntimeError("'df' must be a pandas DataFrame.")

    if df.shape[1] < len(pdag.vertices):
        raise RuntimeError("Number of column is smaler than number of vertex.")

    if scale_method is not None and not isinstance(scale_method, ScaleMethod):
        raise RuntimeError("Invalid 'scale_method' value.")

    if not isinstance(score_method, ScoreMethod):
        raise RuntimeError("Invalid 'score_method'.")

    # drop NA values
    if df.isnull().values.any():
        warnings.warn("Records with missing values are removed.")
        df = df.dropna()

    # return pdag if it is a correlation graph
    if hasattr(pdag, 'type'):
        if pdag.type == "correlation":
            return pdag

    # Work on a private list: the auto-detection below appends to it, which
    # must neither leak into the caller's list nor accumulate in a shared
    # default across calls.
    if categorical_columns is None:
        categorical_columns = []
    else:
        categorical_columns = list(categorical_columns)

    # detect un-selected categorical columns
    for cn in list(df):
        if cn not in categorical_columns and \
                df[cn].dtype not in [np.float64, np.int64,
                                     np.float32, np.int32]:
            warnings.warn("'" + cn + "' is considered categorical.")
            categorical_columns.append(cn)
    # levels of categorical columns
    levels = {}
    for cn in categorical_columns:
        levels[cn] = sorted(list(set(df[cn])))

    # scale numeric columns if necessary
    if scale_method is not None:
        # Scaling assigns columns in place; copy first so the caller's
        # DataFrame is left untouched.
        df = df.copy()
    names = list(df.columns.values)
    if scale_method == ScaleMethod.Normalize:
        for nm in names:
            if nm not in categorical_columns:
                df[nm] = (df[nm] - df[nm].min()) / \
                    (df[nm].max() - df[nm].min())
    elif scale_method == ScaleMethod.Standardize:
        for nm in names:
            if nm not in categorical_columns:
                df[nm] = (df[nm] - df[nm].mean()) / df[nm].std()

    # row selection
    if row_selection is not None and len(row_selection) == df.shape[0]:
        df = df.loc[row_selection, :]

    # create the result pdag initilialized with all nodes and undirected edges
    new_pdag = PDAG(pdag.title)
    for v in pdag.vertices:
        new_pdag.add_vertex(Vertex(v.id, v.name, VertexDataType.Unset))
    for e in pdag.edges:
        if e.direct_type == EdgeType.Nondirected.name:
            new_pdag.add_edge(e.v1.id, e.v2.id, EdgeType.Nondirected)

    # model score
    score = __bic if score_method == ScoreMethod.BIC else __aic
    model_score = 0
    models = []

    # fit one model per vertex
    for v in pdag.vertices:
        new_v = new_pdag.get_vertex_by_name(v.name)
        if v.name in categorical_columns:
            new_v.data_type = VertexDataType.Categorical.name
        else:
            new_v.data_type = VertexDataType.Numeric.name

        cause_names = causes_of(v.name, pdag.edges)
        new_v.causes = cause_names  # adding causes attribute to new_v

        if len(cause_names) == 0:
            # fit to null model and calculate aic/bic
            formula = v.name + " ~ 1"
            if verbose:
                print(formula)
            if v.name not in categorical_columns:
                mod = OLS.from_formula(formula=formula, data=df)
                res = mod.fit()
                model_score += score(res)
            else:
                mod = MNLogit.from_formula(formula=formula, data=df)
                res = mod.fit()
                model_score += score(res)
        else:
            # fit model with ols or mnlogit
            # generate formula; categorical causes are wrapped in C(...)
            indp = []
            for nm in cause_names:
                if nm in categorical_columns:
                    indp.append("C(" + nm + ")")
                else:
                    indp.append(nm)
            formula = v.name + " ~ " + " + ".join(indp) + " + 1"
            if verbose:
                print(formula)

            if v.name not in categorical_columns:
                # linear regression w/ ols
                mod = OLS.from_formula(formula=formula, data=df)
                res = mod.fit()
                model_score += score(res)
                new_v.model_type = "linear"
                # assign model parameters to new vertex
                new_v.fvalue = res.fvalue
                new_v.f_pvalue = res.f_pvalue
                new_v.rsquared = res.rsquared
                new_v.rsquared_adj = res.rsquared_adj
                new_v.model_score = score(res)
                # assign model coefficients to edges
                tvalues = res.tvalues.to_dict()
                pvalues = res.pvalues.to_dict()
                bse = res.bse.to_dict()
                params = res.params.to_dict()
                conf_int = res.conf_int().to_dict(orient="index")
                new_v.const = {
                    "beta": params["Intercept"],
                    "se": bse["Intercept"],
                    "tvalue": tvalues["Intercept"],
                    "pvalue": pvalues["Intercept"],
                    "conf_int": list(conf_int["Intercept"].values())
                }
                for nm in cause_names:
                    new_v2 = new_pdag.get_vertex_by_name(nm)
                    if nm in categorical_columns:
                        # categorical regressor: one edge per non-reference
                        # level (the first sorted level is the reference)
                        lvls = levels[nm][1:]
                        ref_level = levels[nm][0]
                        for lvl in lvls:
                            lnm = "C(" + nm + ")[T." + str(lvl) + "]"
                            new_edge = Edge(new_v2, new_v, EdgeType.Directed)
                            new_edge.beta = params[lnm]
                            new_edge.se = bse[lnm]
                            new_edge.tvalue = tvalues[lnm]
                            new_edge.pvalue = pvalues[lnm]
                            new_edge.conf_int = list(conf_int[lnm].values())
                            new_edge.ref_level = ref_level
                            new_edge.level = lvl
                            new_pdag.edges.append(new_edge)
                    else:
                        # numeric regressor
                        new_edge = Edge(new_v2, new_v, EdgeType.Directed)
                        new_edge.beta = params[nm]
                        new_edge.se = bse[nm]
                        new_edge.tvalue = tvalues[nm]
                        new_edge.pvalue = pvalues[nm]
                        new_edge.conf_int = list(conf_int[nm].values())
                        new_pdag.edges.append(new_edge)
                models.append(
                    build_model(dependent=v.name,
                                independents=indp,
                                regressors=cause_names,
                                model='linear',
                                rsquared=res.rsquared,
                                rsquared_adj=res.rsquared_adj,
                                fvalue=res.fvalue,
                                f_pvalue=res.f_pvalue,
                                score=score(res),
                                betas=params,
                                tvalues=tvalues,
                                pvalues=pvalues,
                                se=bse,
                                conf_int=conf_int))
            else:
                # multi-nomial logistic regression w/ mnlogit
                mod = MNLogit.from_formula(formula=formula, data=df)
                res = mod.fit()
                model_score += score(res)
                new_v.model_type = "mnlogit"
                new_v.levels = levels[v.name]
                # assign model parameters to new vertex
                new_v.ll = res.llf
                new_v.llr = res.llr
                new_v.llr_pvalue = res.llr_pvalue
                new_v.rsquared_pseudo = res.prsquared
                new_v.model_score = score(res)
                # assign model coefficients to edges
                tvalues = res.tvalues.to_dict(orient='index')
                pvalues = res.pvalues.to_dict(orient='index')
                bse = res.bse.to_dict(orient='index')
                params = res.params.to_dict(orient='index')
                new_v.const = {}
                # one set of coefficients per non-reference target level;
                # position k is stored under target level k + 1
                for k in range(0, len(levels[v.name]) - 1):
                    tar_level = levels[v.name][k + 1]
                    new_v.const[tar_level] = {
                        "beta": params["Intercept"][k],
                        "se": bse["Intercept"][k],
                        "tvalue": tvalues["Intercept"][k],
                        "pvalue": pvalues["Intercept"][k]
                    }
                    # for each level of target variable v/_new
                    for nm in cause_names:
                        new_v2 = new_pdag.get_vertex_by_name(nm)
                        if nm in categorical_columns:
                            # categorical regression
                            lvls = levels[nm][1:]
                            ref_level = levels[nm][0]
                            for lvl in lvls:
                                lnm = "C(" + nm + ")[T." + str(lvl) + "]"
                                new_edge = Edge(new_v2, new_v,
                                                EdgeType.Directed)
                                new_edge.beta = params[lnm][k]
                                new_edge.se = bse[lnm][k]
                                new_edge.tvalue = tvalues[lnm][k]
                                new_edge.pvalue = pvalues[lnm][k]
                                new_edge.ref_level = ref_level
                                new_edge.level = lvl
                                new_edge.tar_level = tar_level
                                new_pdag.edges.append(new_edge)
                        else:
                            # numeric regression
                            new_edge = Edge(new_v2, new_v, EdgeType.Directed)
                            new_edge.beta = params[nm][k]
                            new_edge.se = bse[nm][k]
                            new_edge.tvalue = tvalues[nm][k]
                            new_edge.pvalue = pvalues[nm][k]
                            new_edge.tar_level = tar_level
                            new_pdag.edges.append(new_edge)
                models.append(
                    build_model(dependent=v.name,
                                independents=indp,
                                regressors=cause_names,
                                model='mnlogit',
                                ll=res.llf,
                                llr=res.llr,
                                llr_pvalue=res.llr_pvalue,
                                rsquared_pseudo=res.prsquared,
                                score=score(res),
                                betas=params,
                                tvalues=tvalues,
                                pvalues=pvalues,
                                se=bse))

    new_pdag.models = models
    new_pdag.score_method = score_method.name
    new_pdag.score = model_score

    new_pdag.parameterized = True

    if test_alter_models:
        check_alter_models(new_pdag, df, score_method)

    return new_pdag
示例#10
0
文件: __init__.py 项目: toobaz/rmodel
    def from_rda(cls, filename, objname, debug=False):
        """
        Load the summary of an R model from a .rda file and return it as a
        statsmodels-like estimation result.
        Such a file can be created with the "save()" command in R.

        Parameters
        ----------
        filename : str
            Path of the file to load from.
        objname : str
            Name of the object to load from the file.
        debug : bool, default False
            If True, print debug messages.

        Returns
        -------
        The value of ``_package_attrs`` applied to the attributes that
        ``_inspect_R`` extracts for ``objname``.

        Examples
        --------
        If an object named "regsum" was saved in R with the command
            save(regsum, file = "/home/pietro/r_results.rda")
        then it can be reloaded by calling this command as
            res = RModel.from_rda("/home/pietro/r_results.rda", "regsum")
        """

        r['load'](filename)

        summary = cls._r_as_dict(None, r[objname])

        # The R statement also assigns the intervals to ``ci`` inside the R
        # session; an R-side failure is tolerated.
        try:
            ci = r("ci <- confint({})".format(objname))
        except embedded.RRuntimeError:
            ci = None

        # FIXME: while this works differently from the code building the
        # coefficients matrix in _inspect_R (which does not retrieve from R),
        # there is clearly room for de-duplication.
        items = list(cls._get_coeffs_mat(None, objname).index)

        try:
            # E.g. mfx marginal effects
            target = str(summary['call'][1][1])
        except IndexError:
            # E.g. OLS
            # This is ugly: the target is whatever precedes the first space
            # or '~' on the first line of the textual form of 'terms'.
            formula = str(summary['terms']).splitlines()[0]
            target = formula.split(' ')[0].split('~')[0]
        else:
            regressors = " + ".join(items)
            formula = " ~ ".join([target, regressors])

        # Placeholder dataset with one column per name in the formula.
        data = pd.DataFrame(-1, index=[0], columns=[target] + items)

        # Build a regular OLS object first and only then turn it into an
        # RModel, so that statsmodels' machinery does the heavy lifting.
        mod = OLS.from_formula(formula, data)
        mod.__class__ = RModel
        mod._initialize(debug=debug)

        return mod._package_attrs(mod._inspect_R(objname))