Example #1
def task_infected_with_compliance_binomial_regression(depends_on, produces):
    # Binomial regressions of infection status on demographics and the compliance
    # index, run separately per month and on the pooled sample.
    # Note that the hard-coded path below overrides the injected `depends_on`.
    depends_on = BLD / "data" / "infected_merge_data.pickle"
    merge_data = pd.read_pickle(depends_on)
    merge_data["Month"] = pd.Categorical(merge_data["Month"])

    months = ["April", "May", "September"]

    formulas_month = (
        "infected ~ age_cut  + male"
        # " + living_alone"
        "+ living_with_children"
        " + edu + employed + income_hh_cut + working_essential_worker"
        # " + avoid_cafe + avoid_theater + avoid_public_transport + avoid_gym"
        " + compliance_index"
        " + compliance_index*male"
        " + compliance_index*edu"
        # " + compliance_index*employed"
        " + compliance_index*living_with_children"
        # " + compliance_index*living_alone" # ???
        # " + compliance_index*working_essential_worker"
        # " + compliance_index*income_hh_cut"
        # " + compliance_index*age_cut"
    )

    # Same formula for each month; the last entry is used for the pooled model.
    formulas = [formulas_month] * 3 + [formulas_month]
    results = []
    odds_radios = []
    model_names = []
    # `month` is referenced inside the query string via @month, which flake8
    # cannot see, hence the noqa.
    for i, month in enumerate(months):  # noqa:B007
        merge_data_month = merge_data.query("Month == @month")
        result, summary, odds_radio = _infected_binomial_regression_formula(
            merge_data_month, formulas[i])
        results.append(result)
        odds_radios.append(odds_radio)
        model_names.append(month)

    result, summary, odds_radio = _infected_binomial_regression_formula(
        merge_data, formulas[-1])
    model_names.append("Pooled")
    results.append(result)
    odds_radios.append(odds_radio)

    formated_result = sm_results_format(
        results,
        model_names,
        order=results[-1].summary2().tables[1].index.tolist())
    formated_result = pd.DataFrame(formated_result.tables[0])
    formated_odds_radios = odds_radio_format(odds_radios, model_names)

    formed_ordinal_result = rename_index(formated_result)
    formed_ordinal_result.to_csv(produces["regression"],
                                 float_format="%.3f",
                                 index_label="",
                                 quoting=csv.QUOTE_NONNUMERIC)
    formated_odds_radios = rename_index(formated_odds_radios)
    formated_odds_radios.to_csv(produces["odds_radio"],
                                float_format="%.3f",
                                index_label="",
                                quoting=csv.QUOTE_NONNUMERIC)
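

# `_infected_binomial_regression_formula` is defined elsewhere in the project and
# is not shown in this example. Based purely on how it is called above (data and a
# formula in; fitted result, summary and odds ratios out), a minimal sketch could
# look like the following (an assumption about its shape, not the project's code):
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


def _infected_binomial_regression_formula(data, formula):
    # Binomial (logit-link) GLM via the statsmodels formula interface.
    result = smf.glm(formula, data=data, family=sm.families.Binomial()).fit()
    summary = result.summary()
    # Odds ratios are the exponentiated coefficients; the caller binds this value
    # to the name `odds_radio`.
    odds_ratio = np.exp(result.params)
    return result, summary, odds_ratio
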
def task_infected_panel_regression(depends_on, produces):
    # Pooled regression of infection status, adding monthly policy stringency and
    # temperature controls to the individual-level data.
    infected_merge_data = pd.read_pickle(depends_on["infected"])
    # infected_merge_data = infected_merge_data.drop(columns=['Month'])
    policy = pd.read_pickle(depends_on["policy"])
    # Monthly means of the policy indicators, summed into a single stringency score.
    policy_month = policy.groupby("month").mean()
    policy_month["policy_stringency"] = policy_month.sum(axis=1)
    temperature_data = pd.read_pickle(depends_on["temperature"])
    temperature_data = temperature_data.reset_index("City").drop(
        columns=["City", "Month"])
    # infected(i,s,t)~compliance(i,s) + Xi + Policy(s,t) + temperature + fixed effects + interactions
    merge_data = infected_merge_data.join(policy_month, on="month",
                                          how="left").join(temperature_data,
                                                           on="month",
                                                           how="left")
    merge_data.to_csv(produces["data"])
    results = []
    odds_radios = []
    model_names = []
    formula = (
        "infected ~ age_cut + living_alone + living_with_children + male"
        " + edu + employed + income_hh_cut + working_essential_worker"
        " + compliance_index"
        " + policy_stringency"
        # " + LowTemp"
        # " + compliance_index*C(Month)"
        # " + compliance_index*edu"
        # " + compliance_index*living_with_children"
    )
    # run regression
    # result, summary, odds_radio = conditional_logit_regression_formula(merge_data, formula, "Month", method="bfgs")
    result, summary, odds_radio = binomial_logit_regression_formula(
        merge_data, formula, method="bfgs")
    # result, summary, odds_radio = ordinal_regression_formula(merge_data, formula, "logit")
    model_names.append("Pooled")
    results.append(result)
    odds_radios.append(odds_radio)

    formated_result = sm_results_format(results, model_names)
    formated_odds_radios = odds_radio_format(odds_radios, model_names)

    formed_ordinal_result = rename_index(
        pd.DataFrame(formated_result.tables[0]))
    formed_ordinal_result.to_csv(produces["regression"],
                                 float_format="%.3f",
                                 index_label="",
                                 quoting=csv.QUOTE_NONNUMERIC)
    formated_odds_radios = rename_index(formated_odds_radios)
    formated_odds_radios.to_csv(produces["odds_radio"],
                                float_format="%.3f",
                                index_label="",
                                quoting=csv.QUOTE_NONNUMERIC)
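

# `odds_radio_format` (the project's own spelling of "ratio") is another helper
# that is not part of these examples. A minimal sketch, assuming it simply aligns
# the per-model odds-ratio Series side by side under the model names:
import pandas as pd


def odds_radio_format(odds_ratios, model_names):
    # One column per model, rows indexed by regression term.
    return pd.concat(odds_ratios, axis=1, keys=model_names)
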
Example #3
def task_stat_infected_x_var(depends_on, produces):
    # Descriptive statistics for the explanatory variables used in the infection
    # regressions.
    merge_data = pd.read_pickle(depends_on)
    x = merge_data[[
        "female",
        "living_alone",
        "living_with_children",
        "employed",
        "working_essential_worker",
    ]]

    # expand the categorical columns into dummy indicators
    x = x.join(
        pd.get_dummies(
            merge_data["edu"].cat.remove_unused_categories(),
            prefix="education",
            drop_first=False,
            prefix_sep=":",
        ))
    x = x.join(
        pd.get_dummies(merge_data["age_cut"],
                       prefix="age",
                       drop_first=False,
                       prefix_sep=":"))
    x = x.join(
        pd.get_dummies(
            merge_data["income_hh_cut"],
            prefix="income",
            drop_first=False,
            prefix_sep=":",
        ))

    # x_var = (
    #     x.reset_index(level="personal_id")
    #     .drop_duplicates(inplace=False)
    #     .drop(columns="personal_id")
    # )
    x_var = x
    df_desc_stat = pd.DataFrame({
        "Obs.": x_var.count(),
        "Mean": x_var.mean(),
        "Std.": x_var.std(),
        "Min": x_var.min(),
        "Max": x_var.max(),
        "Sum": x_var.sum(),
    })
    rename_index(df_desc_stat).to_csv(produces,
                                      float_format="%.3f",
                                      quoting=csv.QUOTE_NONNUMERIC)
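

# `rename_index`, used just above and throughout these tasks, is a project helper
# that is not shown here. A hypothetical sketch, assuming it only maps raw
# variable names to readable row labels (the mapping below is illustrative, not
# the project's actual label set):
def rename_index(df):
    label_map = {
        "female": "Female",
        "living_alone": "Living alone",
        "living_with_children": "Living with children",
        "employed": "Employed",
        "working_essential_worker": "Essential worker",
    }
    return df.rename(index=label_map)
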
def task_infected_regression(depends_on, produces):
    # Ordinal regressions of infection status, run separately per month and on the
    # pooled sample.
    merge_data = pd.read_pickle(depends_on)

    # Month names, in chronological order, taken from the "month" index level.
    months = (merge_data.index.get_level_values("month")
              .drop_duplicates()
              .sort_values()
              .month_name()
              .tolist())
    results = []
    odds_radios = []
    model_names = []
    # `month` is referenced inside the query string via @month, hence the noqa.
    for month in months:  # noqa:B007
        merge_data_month = merge_data.query("Month == @month")
        result, summary, odds_radio = _infected_ordinal_regression_formula(
            merge_data_month)
        results.append(result)
        odds_radios.append(odds_radio)
        model_names.append(month)

    result, summary, odds_radio = _infected_ordinal_regression_formula(
        merge_data)
    model_names.append("Pooled")
    results.append(result)
    odds_radios.append(odds_radio)

    formated_result = sm_results_format(results, model_names)
    formated_odds_radios = odds_radio_format(odds_radios, model_names)

    formed_ordinal_result = rename_index(
        pd.DataFrame(formated_result.tables[0]))
    formed_ordinal_result.to_csv(produces["regression"],
                                 float_format="%.3f",
                                 index_label="",
                                 quoting=csv.QUOTE_NONNUMERIC)
    formated_odds_radios = rename_index(formated_odds_radios)
    formated_odds_radios.to_csv(produces["odds_radio"],
                                float_format="%.3f",
                                index_label="",
                                quoting=csv.QUOTE_NONNUMERIC)
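

# `sm_results_format` returns an object whose `.tables[0]` is a DataFrame, which
# matches the Summary object produced by statsmodels' `summary_col`. A plausible
# sketch under that assumption (the real helper may add more formatting):
from statsmodels.iolib.summary2 import summary_col


def sm_results_format(results, model_names, order=()):
    # Side-by-side coefficient table with significance stars, one column per model.
    return summary_col(results,
                       model_names=model_names,
                       stars=True,
                       float_format="%.3f",
                       regressor_order=order)
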
def task_stat_compliance_x_var(depends_on, produces):
    # Descriptive statistics for the compliance-model covariates, keeping one row
    # per person.
    merge_data = pd.read_pickle(depends_on)

    x_var = pd.concat(
        [
            merge_data[[
                "female", "age", "living_alone", "living_with_children"
            ]],
            pd.get_dummies(merge_data["edu"], "education", ":"),
            merge_data[[
                "employed",
                "net_income_hh_eqv",
                "working_essential_worker",
                "extraversion",
                "openness",
                "conscientiousness",
                "agreeableness",
                "neuroticism",
                "ideology",
                "trust_gov",
            ]],
        ],
        axis=1,
    )

    x_var = (x_var.reset_index(level="personal_id")
             .drop_duplicates()
             .drop(columns="personal_id"))
    df_desc_stat = pd.DataFrame({
        "Obs.": x_var.count(),
        "Mean": x_var.mean(),
        "Std.": x_var.std(),
        "Min": x_var.min().astype(int),
        "Max": x_var.max(),
        "Sum": x_var.sum(),
    })
    df_desc_stat = rename_index(df_desc_stat)
    df_desc_stat.to_csv(produces,
                        float_format="%.3f",
                        index_label="",
                        quoting=csv.QUOTE_NONNUMERIC)
Example #6
def task_compliance_regression(depends_on, produces):
    # Compliance-index regressions for four nested specifications (A)-(D),
    # estimated as OLS, ordered logit and ordered probit.
    merge_data = pd.read_pickle(depends_on)

    # run regression
    models = ["(A)", "(B)", "(C)", "(D)"]
    formula1 = (  # I(net_income_hh_eqv/1000)
        "compliance_index ~ age_by100 + female + living_alone + living_with_children + "
        "edu + employed + I(net_income_hh_eqv/1000)"
        " + working_essential_worker")
    formula2 = ("compliance_index ~" + add_poly_formula("age_by100", 3) +
                "+ female + living_alone + living_with_children + "
                "edu + employed + I(net_income_hh_eqv/1000)"
                " + working_essential_worker")
    formula3 = (formula2 +
                " + extraversion + openness + conscientiousness"
                " + agreeableness + neuroticism")
    formula4 = formula3 + " + ideology + I(ideology ** 2) + trust_gov"
    formulas = [formula1, formula2, formula3, formula4]

    # ordinal logit model
    ordinal_logit_results, _, ordinal_logit_odds_radios = map(
        list,
        zip(*[
            ordinal_regression_formula(merge_data, formula, "logit")
            for formula in formulas
        ]),
    )

    formed_odds_radios_logit = odds_radio_format(ordinal_logit_odds_radios,
                                                 models)
    formed_odds_radios_logit = rename_index(formed_odds_radios_logit)

    formed_ordinal_logit_result = sm_results_format(ordinal_logit_results,
                                                    models, var_order)
    formed_ordinal_logit_result = pd.DataFrame(
        formed_ordinal_logit_result.tables[0])
    accuracy_logit = [
        ordinal_regression_accuracy(result) for result in ordinal_logit_results
    ]
    accuracy_logit = [f"{i*100:.2f}%" for i in accuracy_logit]
    # Append the accuracy row (pd.concat, since DataFrame.append is removed in
    # pandas >= 2.0).
    formed_ordinal_logit_result = pd.concat([
        formed_ordinal_logit_result,
        pd.DataFrame(
            [accuracy_logit],
            index=["accuracy"],
            columns=formed_ordinal_logit_result.columns,
        ),
    ])
    formed_ordinal_logit_result = rename_index(formed_ordinal_logit_result)

    formed_ordinal_logit_result.to_csv(produces["regression_ordered_logit"],
                                       float_format="%.3f",
                                       index_label="",
                                       quoting=csv.QUOTE_NONNUMERIC)
    formed_odds_radios_logit.to_csv(produces["odds_radio_logit"],
                                    float_format="%.3f",
                                    index_label="",
                                    quoting=csv.QUOTE_NONNUMERIC)

    # ordinal probit model
    ordinal_probit_results, _, ordinal_probit_odds_radios = map(
        list,
        zip(*[
            ordinal_regression_formula(merge_data, formula, "probit")
            for formula in formulas
        ]),
    )

    formed_odds_radios_probit = odds_radio_format(ordinal_probit_odds_radios,
                                                  models)
    formed_odds_radios_probit = rename_index(formed_odds_radios_probit)

    formed_ordinal_probit_result = sm_results_format(ordinal_probit_results,
                                                     models, var_order)
    formed_ordinal_probit_result = pd.DataFrame(
        formed_ordinal_probit_result.tables[0])
    accuracy_probit = [
        ordinal_regression_accuracy(result)
        for result in ordinal_probit_results
    ]
    accuracy_probit = [f"{i * 100:.2f}%" for i in accuracy_probit]
    # Append the accuracy row.
    formed_ordinal_probit_result = pd.concat([
        formed_ordinal_probit_result,
        pd.DataFrame(
            [accuracy_probit],
            index=["accuracy"],
            columns=formed_ordinal_probit_result.columns,
        ),
    ])
    formed_ordinal_probit_result = rename_index(formed_ordinal_probit_result)

    formed_ordinal_probit_result.to_csv(produces["regression_ordered_probit"],
                                        float_format="%.3f",
                                        index_label="",
                                        quoting=csv.QUOTE_NONNUMERIC)
    formed_odds_radios_probit.to_csv(produces["odds_radio_probit"],
                                     float_format="%.3f",
                                     index_label="",
                                     quoting=csv.QUOTE_NONNUMERIC)

    # ols model
    ols_results, _ = map(
        list,
        zip(*[
            ols_regression_formula(merge_data, formula) for formula in formulas
        ]),
    )
    formed_ols_result = sm_results_format(ols_results, models, var_order)
    formed_ols_result = pd.DataFrame(formed_ols_result.tables[0])
    accuracy_ols = [ols_regression_accuracy(result) for result in ols_results]
    accuracy_ols = [f"{i * 100:.2f}%" for i in accuracy_ols]
    # Append the accuracy row.
    formed_ols_result = pd.concat([
        formed_ols_result,
        pd.DataFrame([accuracy_ols],
                     index=["accuracy"],
                     columns=formed_ols_result.columns),
    ])
    formed_ols_result = rename_index(formed_ols_result)

    formed_ols_result.to_csv(produces["regression_ols"],
                             float_format="%.3f",
                             index_label="",
                             quoting=csv.QUOTE_NONNUMERIC)

    # merge all models
    formed_ols_result.columns = pd.MultiIndex.from_product(
        [["OLS"], formed_ols_result.columns])
    formed_ordinal_logit_result.columns = pd.MultiIndex.from_product(
        [["ORDERED LOGIT"], formed_ordinal_logit_result.columns])
    formed_ordinal_probit_result.columns = pd.MultiIndex.from_product(
        [["ORDERED PROBIT"], formed_ordinal_probit_result.columns])

    formed_ols_result = formed_ols_result.reset_index()
    # Offset the row index of the ordered-model tables by two, presumably so their
    # coefficient rows line up with the OLS table (which starts with an
    # Intercept/std-error pair) in the side-by-side concat below.
    formed_ordinal_logit_result = (
        formed_ordinal_logit_result.reset_index().set_index(
            pd.Index(range(2, formed_ordinal_logit_result.shape[0] + 2))))
    formed_ordinal_probit_result = (
        formed_ordinal_probit_result.reset_index().set_index(
            pd.Index(range(2, formed_ordinal_probit_result.shape[0] + 2))))

    merge_result = pd.concat(
        [
            formed_ols_result, formed_ordinal_logit_result,
            formed_ordinal_probit_result
        ],
        axis=1,
    )
    merge_result.to_csv(produces["merge_regression_result"],
                        float_format="%.3f",
                        index_label="",
                        index=False,
                        quoting=csv.QUOTE_NONNUMERIC)
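

# `add_poly_formula`, used in formula2 of task_compliance_regression above, is
# assumed to expand a variable into a polynomial patsy fragment; a minimal sketch
# under that assumption:
def add_poly_formula(var, degree):
    # e.g. add_poly_formula("age_by100", 3)
    #   -> "age_by100 + I(age_by100**2) + I(age_by100**3)"
    return " + ".join([var] + [f"I({var}**{d})" for d in range(2, degree + 1)])
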
def task_compliance_regression_did(depends_on, produces):
    # Difference-in-differences style OLS of the compliance index (pre/post
    # interactions via `post`), plus a baseline specification without them.
    merge_data = pd.read_pickle(depends_on)
    # run regression
    models = ["(B)", "(D)"]
    formula1 = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + living_with_children"
        " + age_cut"
        " + post*male"
        # " + Month*employed"
        # " + post*income_hh_cut"
        # " + post*working_essential_worker"
        " + post*living_with_children"
        " + post*edu"
        # " + post*age_cut"
        # " + post*living_alone"
    )
    formula2 = formula1
    formula3 = (formula2 +
                " + extraversion + openness + conscientiousness"
                " + agreeableness + neuroticism")
    formula4 = (
        formula3 +
        " + ideology + I(ideology ** 2) + trust_gov"
        # " + post*trust_gov"
    )
    formulas = [formula2, formula4]

    ols_results, _ = map(
        list,
        zip(*[
            ols_regression_formula(merge_data, formula) for formula in formulas
        ]),
    )
    formed_ols_result = sm_results_format(ols_results, models, order=var_order)

    formed_ols_result = rename_index(pd.DataFrame(formed_ols_result.tables[0]))

    formed_ols_result.to_csv(produces["regression_ols_did"],
                             float_format="%.3f",
                             index_label="",
                             quoting=csv.QUOTE_NONNUMERIC)

    # regression without post
    formula1 = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + age_cut"
        " + living_with_children"
        # " + living_alone"
    )
    formula2 = formula1
    formula3 = (formula2 +
                " + extraversion + openness + conscientiousness"
                " + agreeableness + neuroticism")
    formula4 = formula3 + " + ideology + I(ideology ** 2) + trust_gov"
    formulas_without_post = [formula2, formula4]

    ols_without_post_results, _ = map(
        list,
        zip(*[
            ols_regression_formula(merge_data, formula)
            for formula in formulas_without_post
        ]),
    )
    formed_ols_without_post_result = sm_results_format(
        ols_without_post_results, models, order=var_order)

    formed_ols_without_post_result = rename_index(
        pd.DataFrame(formed_ols_without_post_result.tables[0]))

    formed_ols_without_post_result.to_csv(
        produces["regression_ols_did_no_post"],
        float_format="%.3f",
        index_label="",
        quoting=csv.QUOTE_NONNUMERIC)

    # merge all models
    formed_ols_without_post_result.columns = pd.MultiIndex.from_product(
        [["Basic estimation"], formed_ols_without_post_result.columns])
    formed_ols_result.columns = pd.MultiIndex.from_product(
        [["Difference in Difference"], formed_ols_result.columns])

    formed_ols_result = formed_ols_result.reset_index()
    formed_ols_without_post_result = (
        formed_ols_without_post_result.reset_index())

    merge_result = pd.concat(
        [formed_ols_without_post_result, formed_ols_result], axis=1)
    merge_result.to_csv(
        produces["regression_ols_did_merge"],
        float_format="%.3f",
        index_label="",
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
    )
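

# `ordinal_regression_formula` (along with the OLS and accuracy helpers) is also
# project code that is not shown in these examples. A rough sketch of the ordinal
# helper used in task_compliance_regression, assuming it wraps statsmodels'
# OrderedModel; OrderedModel does not allow a constant, so this sketch drops
# patsy's implicit intercept, and the exponentiated parameters still include the
# threshold terms, which the real helper may strip:
import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel


def ordinal_regression_formula(data, formula, distr):
    # distr is "logit" or "probit", matching the calls in Example #6.
    no_intercept_formula = formula.replace("~", "~ 0 +", 1)
    model = OrderedModel.from_formula(no_intercept_formula, data=data,
                                      distr=distr)
    result = model.fit(method="bfgs", disp=False)
    summary = result.summary()
    odds_ratio = np.exp(result.params)
    return result, summary, odds_ratio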