def task_infected_with_compliance_binomial_regression(depends_on, produces):
    """Run monthly and pooled binomial logit regressions of infection on
    compliance, socio-demographic controls and compliance interactions,
    writing coefficient and odds-ratio tables to CSV.

    Args:
        depends_on: path to the pickled merged infection/compliance data.
        produces: mapping with output CSV paths under the keys
            "regression" and "odds_radio".
    """
    # BUGFIX: the injected `depends_on` argument was previously overwritten
    # with a hard-coded ``BLD / "data" / "infected_merge_data.pickle"`` path,
    # silently ignoring the declared task dependency.  Read the injected
    # path directly, as the sibling tasks in this module do.
    merge_data = pd.read_pickle(depends_on)
    merge_data["Month"] = pd.Categorical(merge_data["Month"])

    months = ["April", "May", "September"]
    # The same specification is used for every monthly model and the pooled
    # model (the original built a 4-element list of the identical formula).
    formula = (
        "infected ~ age_cut + male"
        "+ living_with_children"
        " + edu + employed + income_hh_cut + working_essential_worker"
        " + compliance_index"
        " + compliance_index*male"
        " + compliance_index*edu"
        " + compliance_index*living_with_children"
    )

    results = []
    odds_ratios = []  # sic: the output key stays "odds_radio" for compatibility
    model_names = []

    # One regression per month; `@month` in the query refers to the loop var.
    for month in months:
        subset = merge_data.query("Month == @month")
        fit, _, ratio = _infected_binomial_regression_formula(subset, formula)
        results.append(fit)
        odds_ratios.append(ratio)
        model_names.append(month)

    # Pooled regression over all months.
    fit, _, ratio = _infected_binomial_regression_formula(merge_data, formula)
    results.append(fit)
    odds_ratios.append(ratio)
    model_names.append("Pooled")

    # Order the rows like the pooled model's own coefficient table.
    row_order = results[-1].summary2().tables[1].index.tolist()
    coef_table = pd.DataFrame(
        sm_results_format(results, model_names, order=row_order).tables[0])
    rename_index(coef_table).to_csv(produces["regression"],
                                    float_format="%.3f",
                                    index_label="",
                                    quoting=csv.QUOTE_NONNUMERIC)

    ratio_table = rename_index(odds_radio_format(odds_ratios, model_names))
    ratio_table.to_csv(produces["odds_radio"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)
def task_infected_panel_regression(depends_on, produces):
    """Pooled binomial logit of infection on compliance, demographics and a
    monthly policy-stringency index, joined with temperature data.

    Writes the merged panel (``produces["data"]``) plus coefficient and
    odds-ratio tables (``produces["regression"]`` / ``produces["odds_radio"]``).
    """
    infected = pd.read_pickle(depends_on["infected"])

    # Monthly mean of each policy indicator; their row sum is the
    # stringency index used as a regressor below.
    policy_month = pd.read_pickle(depends_on["policy"]).groupby("month").mean()
    policy_month["policy_stringency"] = policy_month.sum(axis=1)

    temperature = pd.read_pickle(depends_on["temperature"])
    temperature = temperature.reset_index("City").drop(
        columns=["City", "Month"])

    # infected(i, s, t) ~ compliance(i, s) + X_i + policy(s, t) + temperature
    merge_data = (
        infected
        .join(policy_month, on="month", how="left")
        .join(temperature, on="month", how="left")
    )
    merge_data.to_csv(produces["data"])

    formula = (
        "infected ~ age_cut + living_alone + living_with_children + male"
        " + edu + employed + income_hh_cut + working_essential_worker"
        " + compliance_index"
        " + policy_stringency"
    )
    fit, _, ratio = binomial_logit_regression_formula(
        merge_data, formula, method="bfgs")

    model_names = ["Pooled"]
    results = [fit]
    odds_ratios = [ratio]

    coef_table = rename_index(
        pd.DataFrame(sm_results_format(results, model_names).tables[0]))
    coef_table.to_csv(produces["regression"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    ratio_table = rename_index(odds_radio_format(odds_ratios, model_names))
    ratio_table.to_csv(produces["odds_radio"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)
def task_stat_infected_x_var(depends_on, produces):
    """Write descriptive statistics (obs/mean/std/min/max/sum) of the
    infection-regression covariates to a CSV file."""
    merge_data = pd.read_pickle(depends_on)

    x_var = merge_data[[
        "female",
        "living_alone",
        "living_with_children",
        "employed",
        "working_essential_worker",
    ]]

    # Expand each categorical covariate into a full set of indicator
    # columns (no reference category dropped).
    dummy_specs = [
        (merge_data["edu"].cat.remove_unused_categories(), "education"),
        (merge_data["age_cut"], "age"),
        (merge_data["income_hh_cut"], "income"),
    ]
    for series, prefix in dummy_specs:
        x_var = x_var.join(
            pd.get_dummies(series,
                           prefix=prefix,
                           drop_first=False,
                           prefix_sep=":"))

    df_desc_stat = pd.DataFrame({
        "Obs.": x_var.count(),
        "Mean": x_var.mean(),
        "Std.": x_var.std(),
        "Min": x_var.min(),
        "Max": x_var.max(),
        "Sum": x_var.sum(),
    })
    rename_index(df_desc_stat).to_csv(produces,
                                      float_format="%.3f",
                                      quoting=csv.QUOTE_NONNUMERIC)
def task_infected_regression(depends_on, produces):
    """Run the ordinal infection regression once per survey month and once
    on the pooled sample, then write formatted coefficient and odds-ratio
    tables to CSV."""
    merge_data = pd.read_pickle(depends_on)

    # Month names, derived from the "month" index level in calendar order.
    month_values = merge_data.index.get_level_values("month")
    months = month_values.drop_duplicates().sort_values().month_name().tolist()

    results, odds_ratios, model_names = [], [], []
    for month in months:
        # `@month` inside the query string refers to the loop variable.
        subset = merge_data.query("Month == @month")
        fit, _, ratio = _infected_ordinal_regression_formula(subset)
        results.append(fit)
        odds_ratios.append(ratio)
        model_names.append(month)

    # Pooled model across all months.
    fit, _, ratio = _infected_ordinal_regression_formula(merge_data)
    results.append(fit)
    odds_ratios.append(ratio)
    model_names.append("Pooled")

    coef_table = rename_index(
        pd.DataFrame(sm_results_format(results, model_names).tables[0]))
    coef_table.to_csv(produces["regression"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    ratio_table = rename_index(odds_radio_format(odds_ratios, model_names))
    ratio_table.to_csv(produces["odds_radio"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)
def task_stat_compliance_x_var(depends_on, produces):
    """Write descriptive statistics of the compliance-regression covariates
    (one row per person) to a CSV file."""
    merge_data = pd.read_pickle(depends_on)

    demographics = merge_data[[
        "female", "age", "living_alone", "living_with_children"
    ]]
    education = pd.get_dummies(merge_data["edu"], "education", ":")
    controls = merge_data[[
        "employed",
        "net_income_hh_eqv",
        "working_essential_worker",
        "extraversion",
        "openness",
        "conscientiousness",
        "agreeableness",
        "neuroticism",
        "ideology",
        "trust_gov",
    ]]
    x_var = pd.concat([demographics, education, controls], axis=1)

    # Collapse repeated rows per personal_id — presumably one observation
    # per person across panel waves; TODO confirm against the data layout.
    x_var = (x_var.reset_index(level="personal_id")
             .drop_duplicates()
             .drop(columns="personal_id"))

    df_desc_stat = pd.DataFrame({
        "Obs.": x_var.count(),
        "Mean": x_var.mean(),
        "Std.": x_var.std(),
        # NOTE(review): only Min is cast to int (Max is not) — kept as-is;
        # presumably every column minimum is a whole number. Confirm.
        "Min": x_var.min().astype(int),
        "Max": x_var.max(),
        "Sum": x_var.sum(),
    })
    df_desc_stat = rename_index(df_desc_stat)
    df_desc_stat.to_csv(produces,
                        float_format="%.3f",
                        index_label="",
                        quoting=csv.QUOTE_NONNUMERIC)
def _with_accuracy_row(table, accuracies):
    """Return *table* with a trailing "accuracy" row of formatted percentages.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.
    """
    row = pd.DataFrame(
        [[f"{a * 100:.2f}%" for a in accuracies]],
        index=["accuracy"],
        columns=table.columns,
    )
    return pd.concat([table, row])


def _ordinal_model_tables(merge_data, formulas, models, link):
    """Fit one ordinal model per formula using *link* ("logit" or "probit").

    Returns ``(coef_table, odds_table)``: the formatted coefficient table
    (including the accuracy row) and the odds-ratio table, both with
    renamed indices, ready to be written to CSV.
    """
    results, _, odds_ratios = map(
        list,
        zip(*[
            ordinal_regression_formula(merge_data, formula, link)
            for formula in formulas
        ]),
    )
    coef_table = pd.DataFrame(
        sm_results_format(results, models, var_order).tables[0])
    coef_table = _with_accuracy_row(
        coef_table, [ordinal_regression_accuracy(r) for r in results])
    return (rename_index(coef_table),
            rename_index(odds_radio_format(odds_ratios, models)))


def task_compliance_regression(depends_on, produces):
    """Estimate the compliance regressions (ordered logit, ordered probit,
    OLS) on four nested specifications (A)-(D) and write each family's
    coefficient/odds-ratio tables plus a merged comparison table to CSV.

    BUGFIX: the original used ``DataFrame.append`` (removed in pandas 2.0);
    the accuracy row is now attached with ``pd.concat``.  The duplicated
    logit/probit pipeline is factored into ``_ordinal_model_tables``.

    Args:
        depends_on: path to the pickled merged compliance data.
        produces: mapping of output CSV paths (keys:
            regression_ordered_logit, odds_radio_logit,
            regression_ordered_probit, odds_radio_probit,
            regression_ols, merge_regression_result).
    """
    merge_data = pd.read_pickle(depends_on)

    models = ["(A)", "(B)", "(C)", "(D)"]
    # (A): linear age plus socio-demographic controls.
    formula1 = (
        "compliance_index ~ age_by100 + female + living_alone + living_with_children + "
        "edu + employed + I(net_income_hh_eqv/1000)"
        " + working_essential_worker")
    # (B): replaces linear age with a cubic age polynomial.
    formula2 = ("compliance_index ~" + add_poly_formula("age_by100", 3) +
                "+ female + living_alone + living_with_children + "
                "edu + employed + I(net_income_hh_eqv/1000)"
                " + working_essential_worker")
    # (C): adds the Big Five personality traits.
    formula3 = (
        formula2 +
        " + extraversion + openness + conscientiousness + agreeableness + neuroticism"
    )
    # (D): adds ideology (with quadratic term) and trust in government.
    formula4 = formula3 + " + ideology" " + I(ideology ** 2)" " + trust_gov"
    formulas = [formula1, formula2, formula3, formula4]

    # --- ordered logit ----------------------------------------------------
    logit_table, logit_odds = _ordinal_model_tables(
        merge_data, formulas, models, "logit")
    logit_table.to_csv(produces["regression_ordered_logit"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)
    logit_odds.to_csv(produces["odds_radio_logit"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    # --- ordered probit ---------------------------------------------------
    probit_table, probit_odds = _ordinal_model_tables(
        merge_data, formulas, models, "probit")
    probit_table.to_csv(produces["regression_ordered_probit"],
                        float_format="%.3f",
                        index_label="",
                        quoting=csv.QUOTE_NONNUMERIC)
    probit_odds.to_csv(produces["odds_radio_probit"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)

    # --- OLS --------------------------------------------------------------
    ols_results, _ = map(
        list,
        zip(*[
            ols_regression_formula(merge_data, formula)
            for formula in formulas
        ]),
    )
    ols_table = pd.DataFrame(
        sm_results_format(ols_results, models, var_order).tables[0])
    ols_table = rename_index(
        _with_accuracy_row(
            ols_table,
            [ols_regression_accuracy(r) for r in ols_results]))
    ols_table.to_csv(produces["regression_ols"],
                     float_format="%.3f",
                     index_label="",
                     quoting=csv.QUOTE_NONNUMERIC)

    # --- merged comparison table ------------------------------------------
    ols_table.columns = pd.MultiIndex.from_product(
        [["OLS"], ols_table.columns])
    logit_table.columns = pd.MultiIndex.from_product(
        [["ORDERED LOGIT"], logit_table.columns])
    probit_table.columns = pd.MultiIndex.from_product(
        [["ORDERED PROBIT"], probit_table.columns])
    ols_table = ols_table.reset_index()
    # NOTE(review): the ordinal tables are re-indexed starting at 2 before
    # the column-wise concat (preserved from the original) — presumably to
    # align their rows with the OLS rows; confirm against the output CSV.
    logit_table = logit_table.reset_index().set_index(
        pd.Index(range(2, logit_table.shape[0] + 2)))
    probit_table = probit_table.reset_index().set_index(
        pd.Index(range(2, probit_table.shape[0] + 2)))
    merge_result = pd.concat([ols_table, logit_table, probit_table], axis=1)
    merge_result.to_csv(produces["merge_regression_result"],
                        float_format="%.3f",
                        index_label="",
                        index=False,
                        quoting=csv.QUOTE_NONNUMERIC)
def task_compliance_regression_did(depends_on, produces):
    """Estimate OLS compliance regressions with and without the post-period
    difference-in-difference interactions, writing both tables and a merged
    side-by-side comparison to CSV."""
    merge_data = pd.read_pickle(depends_on)
    models = ["(B)", "(D)"]

    def _formatted_ols_table(formulas):
        # Fit one OLS model per formula and return the formatted,
        # index-renamed coefficient table.
        fitted, _ = map(
            list,
            zip(*[ols_regression_formula(merge_data, f) for f in formulas]),
        )
        summary = sm_results_format(fitted, models, order=var_order)
        return rename_index(pd.DataFrame(summary.tables[0]))

    big_five = (
        " + extraversion + openness + conscientiousness + agreeableness + neuroticism"
    )

    # Specification (B) with post-period interactions (DiD).
    did_small = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + living_with_children"
        " + age_cut"
        " + post*male"
        " + post*living_with_children"
        " + post*edu"
    )
    # Specification (D): adds Big Five, ideology (quadratic) and trust.
    did_full = (
        did_small + big_five
        + " + ideology" + " + I(ideology ** 2)" + " + trust_gov"
    )

    did_table = _formatted_ols_table([did_small, did_full])
    did_table.to_csv(produces["regression_ols_did"],
                     float_format="%.3f",
                     index_label="",
                     quoting=csv.QUOTE_NONNUMERIC)

    # Same specifications without the post-period terms.
    plain_small = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + age_cut"
        " + living_with_children"
    )
    plain_full = (
        plain_small + big_five
        + " + ideology" + " + I(ideology ** 2)" + " + trust_gov"
    )

    plain_table = _formatted_ols_table([plain_small, plain_full])
    plain_table.to_csv(produces["regression_ols_did_no_post"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)

    # Merge both tables side by side under labelled column groups.
    plain_table.columns = pd.MultiIndex.from_product(
        [["Basic estimation"], plain_table.columns])
    did_table.columns = pd.MultiIndex.from_product(
        [["Difference in Difference"], did_table.columns])
    merged = pd.concat(
        [plain_table.reset_index(), did_table.reset_index()], axis=1)
    merged.to_csv(
        produces["regression_ols_did_merge"],
        float_format="%.3f",
        index_label="",
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
    )