Пример #1
0
def add_interaction_formula(interaction1, interaction2):
    """
    Build the formula fragment for all pairwise interactions of two groups.

    Every element of `interaction1` is combined with every element of
    `interaction2` (Cartesian product, `interaction1` varying slowest) into a
    term of the form ``"<left> * <right>"``.

    Parameters
    ----------
    interaction1 : iterable
        Names forming the left-hand side of each interaction term
    interaction2 : iterable
        Names forming the right-hand side of each interaction term

    Returns
    -------
    The result of ``paste(terms, collapse=" + ")``, where `paste` is the helper
    defined elsewhere in the project.
    """
    terms = []
    for left, right in itertools.product(interaction1, interaction2):
        terms.append(f"{left} * {right}")
    return paste(terms, collapse=" + ")
Пример #2
0
def add_poly(df, col, degree, inplace=False):
    """
    Add polynomial columns of `col` to a data frame and build the matching formula.

    For each power ``i`` from 2 up to `degree` a column named ``"<col>^<i>"``
    holding ``df[col] ** i`` is added. With ``degree < 2`` no column is added and
    the formula contains `col` only.

    Parameters
    ----------
    df : DataFrame
        Data containing the column `col`
    col : str
        Name of the column to raise to higher powers
    degree : int
        Highest power to add
    inplace : bool
        If False (default) a deep copy of `df` is modified and returned;
        if True `df` itself is modified.

    Returns
    -------
    tuple
        The data frame with the added columns, and the result of
        ``paste(names, collapse=" + ")`` over `col` and the new column names.
    """
    target = df if inplace else df.copy(deep=True)
    terms = [col]
    for power in range(2, degree + 1):
        powered_name = f"{col}^{power}"
        target[powered_name] = target[col] ** power
        terms.append(powered_name)
    return target, paste(terms, collapse=" + ")
Пример #3
0
def add_poly_formula(col, degree):
    """
    Build a polynomial formula fragment for `col` without touching any data.

    The fragment consists of `col` itself followed by one ``"I(<col> ** <i>)"``
    term for every power ``i`` from 2 up to `degree`.

    Parameters
    ----------
    col : str
        Name of the variable
    degree : int
        Highest power to include; values below 2 yield `col` only

    Returns
    -------
    The result of ``paste(terms, collapse=" + ")``, where `paste` is the helper
    defined elsewhere in the project.
    """
    higher_order = [f"I({col} ** {power})" for power in range(2, degree + 1)]
    terms = [col, *higher_order]
    return paste(terms, collapse=" + ")
def test_paste():
    """
    Check `paste` on arguments of different lengths.

    The expected list has five entries, so the scalar prefix and the
    three-element list are expected to be recycled up to the length of the
    longest argument (``range(5)``), with parts joined by ``sep``.
    """
    prefix = "beta"
    letters = ["a", "b", "c"]
    numbers = range(5)
    result = paste(prefix, letters, numbers, sep=".")
    assert result == [
        "beta.a.0",
        "beta.b.1",
        "beta.c.2",
        "beta.a.3",
        "beta.b.4",
    ]
def statistics_coefficient(all_N, all_T, nsims, df_sim_result, **beta_true):
    """
    Generate statistics of each N & T, take the mean of different simulations, and
    store them in a data frame. We include mean, bias, the RMSE, standard error and
    confidence interval in our statistical results.

    For position ``i`` the rows ``i * nsims`` through ``(i + 1) * nsims - 1`` of
    `df_sim_result` are read by position and summarized into row ``i`` of the output.

    Parameters
    ----------
    all_N : array-like
        Different sample sizes of entity
    all_T : array-like
        Different sample sizes of time. Indexed in lockstep with `all_N`.
    nsims : int
        Simulation times under the same N and T
    df_sim_result : DataFrame
        Simulation results from function `simulation_coefficient`. Columns are
        selected by the prefixes "beta_interactive", "beta_within",
        "sde_interactive" and "sde_within".
    **beta_true : float
        Coefficients of variables used in dgp_func. Keys in ("beta1", "beta2", "mu",
        "gamma", "delta"); only these keys are used, in this order.

    Returns
    -------
    DataFrame
        One row per element of `all_N` with the columns "T", "N" and, for each of
        the `p` coefficients, "mean", "bias", "sde", "ci_l", "ci_u" and "rmse"
        columns for both the "interactive" and the "within" estimator
        (named like "mean_interactive.1").

    """
    # vectorize startswith() to apply it in a string list
    startswith_vec = np.vectorize(str.startswith)
    # guess number of variables from column names
    p = sum(startswith_vec(df_sim_result.columns, "beta_interactive."))
    # NOTE(review): this counts every keyword argument, including keys outside the
    # recognized tuple below, so beta_true_list can still end up shorter than p.
    assert len(beta_true) >= p, "short of beta_true"
    # true values in the fixed key order, truncated to the first p entries
    beta_true_list = [
        beta_true[k]
        for k in ("beta1", "beta2", "mu", "gamma", "delta")
        if k in beta_true
    ][:p]
    # empty result frame: one row per (N, T) pair, one column per statistic and
    # coefficient index 1..p
    df_statistic = pd.DataFrame(
        index=range(len(all_N)),
        columns=[
            "T",
            "N",
            *paste("mean_interactive", range(1, p + 1), sep="."),
            *paste("bias_interactive", range(1, p + 1), sep="."),
            *paste("sde_interactive", range(1, p + 1), sep="."),
            *paste("ci_l_interactive", range(1, p + 1), sep="."),
            *paste("ci_u_interactive", range(1, p + 1), sep="."),
            *paste("rmse_interactive", range(1, p + 1), sep="."),
            *paste("mean_within", range(1, p + 1), sep="."),
            *paste("bias_within", range(1, p + 1), sep="."),
            *paste("sde_within", range(1, p + 1), sep="."),
            *paste("ci_l_within", range(1, p + 1), sep="."),
            *paste("ci_u_within", range(1, p + 1), sep="."),
            *paste("rmse_within", range(1, p + 1), sep="."),
        ],
    )

    # three quick functions to get and modify df_statistic and df_sim_result;
    # they read `i` and `row_range_df_sim` from the enclosing scope at call time,
    # so they must only be called inside the loop below

    # row i of df_statistic, restricted to the columns whose name starts with `col`
    def get_stat(col):
        return df_statistic.iloc[i, startswith_vec(df_statistic.columns, col)]

    # current block of simulation rows, restricted to the columns starting with `col`
    def get_sim(col):
        return df_sim_result.iloc[
            row_range_df_sim, startswith_vec(df_sim_result.columns, col)
        ]

    # write `value` into row i of df_statistic, columns starting with `col`
    def set_stat(col, value):
        df_statistic.iloc[i, startswith_vec(df_statistic.columns, col)] = value

    for i in range(len(all_N)):
        df_statistic.loc[i, "N"] = all_N[i]
        df_statistic.loc[i, "T"] = all_T[i]
        # positional rows of df_sim_result that belong to the i-th (N, T) pair
        row_range_df_sim = range(i * nsims, (i + 1) * nsims)
        # column-wise mean of the estimates over the block of simulations
        set_stat("mean_interactive", get_sim("beta_interactive").mean())
        set_stat("mean_within", get_sim("beta_within").mean())
        # bias = |mean - true| / true
        # NOTE(review): the divisor is the true value itself, not its absolute
        # value, so the sign follows beta_true and a true value of 0 divides by zero.
        set_stat(
            "bias_interactive",
            get_stat("mean_interactive").sub(beta_true_list).abs().div(beta_true_list),
        )
        set_stat(
            "bias_within",
            get_stat("mean_within").sub(beta_true_list).abs().div(beta_true_list),
        )
        # fill the interactive standard error and confidence interval only when at
        # least one sde_interactive value in the block is not NaN; otherwise these
        # columns keep their initial empty value
        if not np.isnan(get_sim("sde_interactive")).all(axis=None, skipna=False):
            set_stat("sde_interactive", get_sim("sde_interactive").mean())
            # interval bounds: mean -/+ norm.ppf(0.975) * sde (norm is presumably
            # scipy.stats.norm -- confirm against the file's imports); .values drops
            # the "sde_..." labels so the arithmetic is positional
            set_stat(
                "ci_l_interactive",
                get_stat("mean_interactive").sub(
                    get_stat("sde_interactive").mul(norm.ppf(0.975)).values
                ),
            )
            set_stat(
                "ci_u_interactive",
                get_stat("mean_interactive").add(
                    get_stat("sde_interactive").mul(norm.ppf(0.975)).values
                ),
            )
        # the within standard error and interval are always computed (no NaN guard)
        set_stat("sde_within", get_sim("sde_within").mean())
        set_stat(
            "ci_l_within",
            get_stat("mean_within").sub(
                get_stat("sde_within").mul(norm.ppf(0.975)).values
            ),
        )
        set_stat(
            "ci_u_within",
            get_stat("mean_within").add(
                get_stat("sde_within").mul(norm.ppf(0.975)).values
            ),
        )
        # RMSE of the estimates against the true values; caculate_rmse is defined
        # elsewhere in the project
        set_stat(
            "rmse_interactive",
            caculate_rmse(get_sim("beta_interactive"), beta_true_list),
        )
        set_stat("rmse_within", caculate_rmse(get_sim("beta_within"), beta_true_list))
    return df_statistic
        sde_interactive = np.sqrt(np.diag(sde_interactive))
    else:
        sde_interactive = np.full(shape=(p), fill_value=np.nan)
    one_sim_result = pd.Series(
        [
            *T_N_sim.loc[case, ["T", "N", "sim"]],
            *beta_hat_interactive,
            *beta_hat_within,
            *sde_interactive,
            *sde_within,
        ],
        index=[
            "T",
            "N",
            "sim",
            *paste("beta_interactive", range(1, p + 1), sep="."),
            *paste("beta_within", range(1, p + 1), sep="."),
            *paste("sde_interactive", range(1, p + 1), sep="."),
            *paste("sde_within", range(1, p + 1), sep="."),
        ],
    )
    return one_sim_result


def statistics_coefficient(all_N, all_T, nsims, df_sim_result, **beta_true):
    """
    Generate statistics of each N & T, take the mean of different simulations, and
    store them in a data frame. We include mean, bias, the RMSE, standard error and
    cofidence interval in our statistical results.

    Parameters