def compare_input_data(
    input_h5="./Simulation_engine/dummy_data.h5",
    input_h5_b="./Simulation_engine/dummy_data.h5",
    name_variables=("rfr", "irpp", "nbptr"),
    PERIOD=None,
):
    """Check that two input HDF5 population files yield the same simulation results.

    Runs the simulation on ``input_h5`` to build baseline columns, then on
    ``input_h5_b``, and compares each variable of ``name_variables`` row by
    row with an absolute tolerance of 0.01.

    Returns True when no value differs by more than the tolerance,
    False otherwise.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul  # module-level default simulation year
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles, dictionnaire_datagrouped = simulation(
        PERIOD, DUMMY_DATA, TBS)
    # Start from the weights only; the computed variables are added below.
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = simulation_base_deciles.calculate(
            nv, PERIOD)
    isdif = False  # becomes a nonzero int as soon as one difference is found
    data2 = pandas.read_hdf(input_h5_b)
    col = "b"  # suffix used for the columns computed from the second file
    newsim, ddg2 = simulation(PERIOD, data2, TBS)
    for nv in name_variables:
        df["{}_{}".format(nv, col)] = newsim.calculate(nv, PERIOD)
        # Count rows differing by more than +/-0.01 in either direction.
        isdif |= len(df[df["{}_{}".format(nv, col)] -
                        df["{}_base".format(nv)] > 0.01]) + len(
            df[df["{}_{}".format(nv, col)] - df["{}_base".format(nv)] < -0.01])
    return not isdif
def test_homemade_nbptr_function(
    reform_config_base_2020, nbptr_parametres_par_defaut, various_cas_types
):
    """Check that nbptr and irpp results are identical with and without the
    explicit default nbptr parameters (i.e. the homemade nbptr function with
    its default parameters reproduces the stock computation)."""
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)
    # Reform built without explicit nbptr parameters.
    tbs_reforme_sans_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )
    # Same reform, with the default nbptr parameters merged in explicitly.
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(),
        {
            "impot_revenu": {
                **(reform_config_base_2020["impot_revenu"]),
                **nbptr_parametres_par_defaut,
            }
        },
        period,
    )
    sim_sans_nbptr, _ = simulation(period, data, tbs_reforme_sans_nbptr)
    sim_avec_nbptr, _ = simulation(period, data, tbs_reforme_avec_nbptr)
    print("sans", sim_sans_nbptr.calculate("nbptr", period))
    print("avec", sim_avec_nbptr.calculate("nbptr", period))
    assert array_equal(
        sim_sans_nbptr.calculate("nbptr", period),
        sim_avec_nbptr.calculate("nbptr", period),
    )
    assert array_equal(
        sim_sans_nbptr.calculate("irpp", period),
        sim_avec_nbptr.calculate("irpp", period),
    )
def generate_default_results():
    """Precompute baseline ("avant") and PLF irpp results for the population.

    Uses the module-level PERIOD, DUMMY_DATA, TBS and TBS_PLF.  Writes one row
    per foyer fiscal to ``base_results.csv`` and returns the DataFrame.
    """
    # Keeping computations short with option to keep file under 1000 FF
    # DUMMY_DATA = DUMMY_DATA[(DUMMY_DATA["idmen"] > 2500) & (DUMMY_DATA["idmen"] < 7500)]
    bulk_data_simulation, data_by_entity = simulation(PERIOD, DUMMY_DATA, TBS)
    # Precompute the base case over the population for the cache.
    base_results = data_by_entity["foyer_fiscal"][["wprm", "idfoy"]]
    base_results["avant"] = bulk_data_simulation.calculate("irpp", PERIOD)
    # simulation(...) returns a (simulation, grouped-data) pair; only the
    # simulation object (index 0) is needed for the PLF run.
    simulation_plf_deciles = simulation(PERIOD, DUMMY_DATA, TBS_PLF)
    base_results["plf"] = simulation_plf_deciles[0].calculate("irpp", PERIOD)
    base_results[["idfoy", "avant", "plf", "wprm"]].to_csv(
        "base_results.csv", index=False
    )
    return base_results
def test_sim_pop_dict_content(reform):
    """The population comparison dict exposes totals, deciles, decile
    boundaries and per-transition impact counts for avant/apres/plf."""
    sim_after = simulation(PERIOD, DUMMY_DATA, reform)
    result = compare(PERIOD, {"apres": sim_after})
    for section in ("total", "deciles", "frontieres_deciles"):
        assert section in result
    assert len(result["frontieres_deciles"]) == len(result["deciles"])
    assert "foyers_fiscaux_touches" in result
    for scenario in ("avant", "apres", "plf"):
        assert scenario in result["total"]
        assert scenario in result["deciles"][0]
    allowed_categories = (
        "gagnant",
        "neutre",
        "perdant",
        "perdant_zero",
        "neutre_zero",
    )
    for transition in ("avant_to_apres", "avant_to_plf", "plf_to_apres"):
        assert transition in result["foyers_fiscaux_touches"]
        per_category = result["foyers_fiscaux_touches"][transition]
        for category, headcount in per_category.items():
            assert category in allowed_categories
            assert isinstance(headcount, int)
def test_sim_pop_dict_content(reform, requested_simulations):
    """The population comparison dict exposes totals, deciles, decile
    boundaries and an impact entry for every ordered pair of requested
    simulations (e.g. "avant_to_apres", "avant_to_plf", "plf_to_apres")."""
    sim_after = simulation(PERIOD, DUMMY_DATA, reform)
    result = compare(PERIOD, {"apres": sim_after})
    for section in ("total", "deciles", "frontieres_deciles"):
        assert section in result
    assert len(result["frontieres_deciles"]) == len(result["deciles"])
    assert "foyers_fiscaux_touches" in result
    for name in requested_simulations:
        assert name in result["total"]
        assert name in result["deciles"][0]
    allowed_categories = (
        "gagnant",
        "neutre",
        "perdant",
        "perdant_zero",
        "neutre_zero",
    )
    for position, origin in enumerate(requested_simulations):
        for destination in requested_simulations[position + 1:]:
            key = origin + "_to_" + destination
            assert key in result["foyers_fiscaux_touches"]
            per_category = result["foyers_fiscaux_touches"][key]
            for category, headcount in per_category.items():
                assert category in allowed_categories
                assert isinstance(headcount, int)
def test_sim_base_cas_types_dict_content_ok(reform, requested_simulations):
    """Cas-types comparison (deciles disabled) exposes totals and raw results,
    with six cas types per requested simulation."""
    reform_sim = simulation(PERIOD, CAS_TYPE, reform)
    # Shared module-level dict; adding "apres" to it mirrors the original.
    scenarios = simulations_reformes_par_defaut_castypes
    scenarios["apres"] = reform_sim
    outcome = compare(PERIOD, scenarios, compute_deciles=False)
    assert "total" in outcome
    assert "res_brut" in outcome
    for name in requested_simulations:
        assert name in outcome["total"]
        assert name in outcome["res_brut"]
        assert len(outcome["res_brut"][name]) == 6
def generate_default_results():
    """Precompute irpp under every default reform for the population cache.

    Writes one row per foyer fiscal to ``base_results.csv`` (idfoy, one column
    per reform in TBS_DEFAULT, wprm) and returns the DataFrame.
    """
    base_results = None
    reform_names = []
    for name, tax_benefit_system in TBS_DEFAULT.items():
        reform_names.append(name)
        sim, entity_data = simulation(PERIOD, DUMMY_DATA, tax_benefit_system)
        if base_results is None:
            # Initialize the frame from the first simulation's grouped data.
            base_results = entity_data["foyer_fiscal"][["wprm", "idfoy"]]
        base_results[name] = sim.calculate("irpp", PERIOD)
    ordered_columns = ["idfoy"] + reform_names + ["wprm"]
    base_results[ordered_columns].to_csv("base_results.csv", index=False)
    return base_results
def test_zero_nbptr(reform_config_base_2020, nbptr_zero, various_cas_types):
    """Check that nbptr is zero for every foyer when all nbptr parameters
    are set to zero."""
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)
    # Base 2020 income-tax reform, overridden with the all-zero nbptr params.
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(),
        {"impot_revenu": {**(reform_config_base_2020["impot_revenu"]), **nbptr_zero}},
        period,
    )
    sim_avec_nbptr, _ = simulation(period, data, tbs_reforme_avec_nbptr)
    resultats_nbptr = sim_avec_nbptr.calculate("nbptr", period)
    # .any() is falsy only when every computed nbptr is exactly 0.
    assert not resultats_nbptr.any()
def test_deux_adultes_ancien_combattants_deux_enfants(reform_config_base_2020):
    """Two veteran ("ancien combattant") adults with two dependent children
    must be granted 3.5 family-quotient parts (nbptr)."""
    # Input data: a single foyer fiscal described as a "cas type".
    foyer = {
        "declarants": [
            {
                "ancienCombattant": True,
                "invalide": False,
                "parentIsole": False,
                "retraite": False,
                "veuf": False
            },
            {
                "ancienCombattant": True,
                "invalide": False,
                "parentIsole": False,
                "retraite": False,
                "veuf": False
            }
        ],
        "personnesACharge": [
            {
                "chargePartagee": False,
                "invalide": False
            },
            {
                "chargePartagee": False,
                "invalide": False
            }
        ],
        "residence": "metropole",
        "revenuImposable": 120000
    }
    data = dataframe_from_cas_types_description([foyer])
    period = "2020"
    # French law + income-tax reform parameters.
    tbs_reforme_impot_revenu = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )
    built_simulation, _dict_data_by_entity = simulation(
        period, data, tbs_reforme_impot_revenu
    )
    nbptr = built_simulation.calculate("nbptr", period)
    # Presumably 2 (couple) + 2*0.5 (children) + 0.5 (veteran half-part);
    # verify the breakdown against the tax code if this expectation changes.
    assert nbptr == [3.5]
def test_sim_base_cas_types_dict_content_ok(reform):
    """Cas-types comparison (deciles disabled) exposes totals and raw results
    for the three scenarios, with six cas types each."""
    reform_sim = simulation(PERIOD, CAS_TYPE, reform)
    scenarios = {
        "avant": simulation_base_castypes,
        "plf": simulation_plf_castypes,
        "apres": reform_sim,
    }
    outcome = compare(PERIOD, scenarios, compute_deciles=False)
    assert "total" in outcome
    assert "res_brut" in outcome
    for name in ("avant", "apres", "plf"):
        assert name in outcome["total"]
        assert name in outcome["res_brut"]
        assert len(outcome["res_brut"][name]) == 6
def ajustement_h5(
    input_h5="./Simulation_engine/dummy_data.h5",
    output_h5="./Simulation_engine/dummy_data_ajuste.h5",
    distribution_rfr_population="./Simulation_engine/Calib/ResFinalCalibSenat.csv",
    PERIOD=None,
):
    """Calibrate the survey data so its RFR distribution matches official stats.

    Steps:
      1. Reweight zero-income foyers fiscaux to a 6% target share, then rescale
         all weights to the official total number of foyers fiscaux.
      2. "Pareto by parts" income redistribution: scaled ERFS values in the
         first decile, interval-wise Pareto laws in the middle brackets, and a
         mean-matching Pareto tail for the top bracket.
    The resulting per-row income and weight adjustment factors are applied to
    the original HDF5 file and written to ``output_h5``.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    ajuste_h5 = output_h5
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    # Keeping computations short with option to keep file under 1000 FF
    # DUMMY_DATA = DUMMY_DATA[DUMMY_DATA["idmen"] < 1000]
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles).sort_values(by="rfr")
    print("{} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle ".format(
        len(df[df["rfr"] > 0.01]),
        len(df),
        100 - 100 * len(df[df["rfr"] > 0.01]) / len(df),
    ))
    # Step 1: adjust the share of zero-income foyers.
    # Current weighted share of zero-RFR rows vs. the 6% target.
    oldweight = 1 - df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum()
    targetweight = 0.06
    # Multiplicative factor applied to zero-RFR weights so that their weighted
    # share becomes exactly `targetweight`.
    redweightifrfr0 = targetweight * (1 - oldweight) / oldweight / (
        1 - targetweight)
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle. Je vais les ajuster."
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum(),
        ))
    print("old : {} new : {} adj : {}".format(oldweight, targetweight,
                                              redweightifrfr0))
    # Weight-reduction adjustment for zero-income rows.
    df["adjwstep0"] = 1
    df["realwprm"] = df["wprm"]
    df.loc[df["rfr"] < 0.01, "adjwstep0"] = redweightifrfr0
    df.loc[df["rfr"] < 0.01, "realwprm"] = df["wprm"] * redweightifrfr0
    # Calibrate the total number of foyers fiscaux.
    # src : https://www.impots.gouv.fr/portail/statistiques (2018)
    target_foyers_fiscaux = 38_332_977
    adjust_wprm = target_foyers_fiscaux / df["realwprm"].sum()
    df["realwprm"] = df["realwprm"] * adjust_wprm
    # NOTE(review): this print mixes `> 0.01` and `> 0` filters; they are
    # presumably meant to be the same threshold — confirm.
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle "
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0]["wprm"].sum() / df["wprm"].sum(),
        ))
    # Step 1.1: adjust the first decile (currently a no-op, little impact).
    # Step 2: PBP (Pareto by parts).
    # Official statistics.  The CSV must contain:
    #   Rk : reference fiscal income (RFR) threshold
    #   Nk : share of foyers fiscaux with RFR >= Rk
    #   Ark: mean RFR of foyers with RFR >= Rk (only used for the law of the
    #        top decile)
    so = pandas.read_csv(distribution_rfr_population)
    # Determine everyone's position in the weighted distribution:
    # 2.0 - associate a running (cumulative) weight to each row.
    totw = df["realwprm"].sum()
    df = df.sort_values(by="rfr")
    df["nw"] = df["realwprm"] / totw  # normalized weight (total = 1)
    df["rsnw"] = df["nw"].cumsum(
    ) - df["nw"] / 2  # cumulative sum of nw, taken at the middle of each row
    # 2.1 - first decile: keep the exact ERFS values times a scalar factor
    # that maps the observed decile boundary onto the official one.
    targetFirstDec = so["Rk"][1]
    limWeightFirstDec = so["Nk"][1]
    limOrigFirstDec = max(df[df["rsnw"] <= 1 - limWeightFirstDec]["rfr"])
    df["adjrevstep2"] = 1
    df.loc[df["rsnw"] <= 1 - limWeightFirstDec,
           "adjrevstep2"] = (targetFirstDec / limOrigFirstDec)
    # 2.2 - every other bracket (except the last): within each interval the
    # distribution is a Pareto law whose first parameter is the interval start
    # and whose second parameter matches the official head-count of the
    # interval.
    sonk = so["Nk"].values  # plain arrays: simplest row-wise iteration
    sork = so["Rk"].values
    paramsPareto = [-1]  # placeholder for the first bracket (handled in 2.1)
    for i in range(1, len(sonk) - 1):
        n0 = sonk[i]
        n1 = sonk[i + 1]
        r0 = sork[i]
        r1 = sork[i + 1]
        newparam = math.log(n1 / n0) / math.log(r0 / r1)
        paramsPareto += [newparam]
    # 2.3 - last bracket: pick the Pareto parameter that matches the official
    # mean of the top bracket.  Mean of a Pareto: esp = (1 + 1/(k-1)) * xm,
    # hence k = 1/(esp/xm - 1) + 1.
    lastaverage = so["dArk"].values[-1] * 1000
    lastthresh = sork[-1]
    paramsPareto += [1 / (lastaverage / lastthresh - 1) + 1]
    so["paramPareto"] = paramsPareto
    # Map each row's cumulative weight back to an income via the inverse CDF.
    df["realrfr"] = df.apply(reverseCDF(so), axis=1)
    df["realrfrw"] = df["realrfr"] * df["realwprm"]
    # End of step 2: df now holds the calibrated income (realrfr) and weights.
    testerrorvalues(df, "rfr", "wprm")
    aa = testerrorvalues(df, "realrfr", "realwprm")
    print("Aggregated Error % after calibration :", aa)
    # Derive per-row multiplicative adjustments for income and weight.
    df["total_ajust_revenu"] = 1
    df.loc[df["rfr"] > 0, "total_ajust_revenu"] = df["realrfr"] / df["rfr"]
    df["total_ajust_poids"] = df["realwprm"] / df["wprm"]
    # Apply the adjustments to the input .h5 file.
    to_transform = pandas.read_hdf(input_h5)
    tt_colonnes = to_transform.columns
    df_changes = df[["idfoy", "total_ajust_revenu", "total_ajust_poids"]]
    to_transform = to_transform.merge(df_changes, on="idfoy")
    colspoids = ["wprm"]
    colsrevenus = [
        "chomage_brut",
        "pensions_alimentaires_percues",
        "rag",
        "ric",
        "rnc",
        "salaire_de_base",
        "f4ba",
        # "loyer",
        # "taxe_habitation",
    ]
    colsrevenus = [col for col in colsrevenus if col in to_transform.columns]
    for cp in colspoids:
        to_transform[cp] = to_transform[cp] * to_transform["total_ajust_poids"]
    for cp in colsrevenus:
        to_transform[
            cp] = to_transform[cp] * to_transform["total_ajust_revenu"]
    to_transform = to_transform[tt_colonnes]
    to_transform.to_hdf(ajuste_h5, key="input")
def test_h5_input(
    input_h5="./Simulation_engine/dummy_data.h5",
    name_variables=("rfr", "irpp", "nbptr"),
    aggfunc="sum",
    compdic=None,
    is_plf=False,
    PERIOD=None,
):
    """Print aggregates of simulated variables, optionally vs. target values.

    Parameters
    ----------
    input_h5 : path of the HDF5 population file to simulate.
    name_variables : variables to aggregate in addition to wprm,
        salaire_de_base and retraite_brute.
    aggfunc : "sum" (weighted totals) or "countnonzero" (weighted count of
        non-zero values; plain count for wprm itself).
    compdic : optional mapping of aggregate name -> reference value; when
        provided, the relative error is printed next to each aggregate.
    is_plf : use the PLF tax-benefit system instead of the baseline one.
    PERIOD : simulation year; defaults to ``annee_de_calcul``.

    Raises
    ------
    ValueError
        If ``aggfunc`` is neither "sum" nor "countnonzero".
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    TBS = TBS_DEFAULT["plf"] if is_plf else FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles,
                      name_variables).sort_values(by="rfr")
    if aggfunc == "sum":
        # For sums, also report the distribution error percentages.
        testerrorvalues(df)
    aggs_to_compute = ["wprm", "salaire_de_base", "retraite_brute"
                       ] + list(name_variables)
    # Cumulated dependent-children aggregates (nbF..nbR), when reference
    # values are available in compdic.
    val_donnees_pac_agg = 0
    trpac_agg = [
        compdic[ag] for ag in ["nbF", "nbG", "nbH", "nbJ", "nbR"]
        if compdic is not None and ag in compdic
    ]
    val_reelle_pac_agg = sum(trpac_agg) if len(trpac_agg) else None
    for ag in aggs_to_compute:
        if aggfunc == "sum":
            nom_a_afficher = "Total aggrégé"
            if ag != "wprm":
                # Weighted total of the variable.
                val_donnees = (df[ag] * df["wprm"]).sum()
            else:
                val_donnees = (df[ag]).sum()
        elif aggfunc == "countnonzero":
            if ag != "wprm":
                nom_a_afficher = "Non nuls"
                # Weighted number of foyers with a non-zero value.
                val_donnees = (df[df[ag] != 0]["wprm"]).sum()
            else:
                nom_a_afficher = "Nombre FF (c'est comme ça le count sur wprm)"
                val_donnees = df[ag].count()
        else:
            # BUG FIX: the original did `raise ("...")`, which raises
            # TypeError("exceptions must derive from BaseException") instead
            # of the intended message; raise a proper exception type.
            raise ValueError(
                "Only aggregation functions supported are sum and countnonzero. The rest is not very good if you want my opinion"
            )
        val_reelle = compdic[
            ag] if compdic is not None and ag in compdic else None
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            ag,
            val_donnees,
            val_reelle if val_reelle is not None else "",
            "{:.2f}%".format((val_donnees / val_reelle - 1) * 100)
            if val_reelle is not None else "",
        ))
        if ag in ["nbF", "nbG", "nbH", "nbJ", "nbR"]:
            val_donnees_pac_agg += val_donnees
    if val_reelle_pac_agg is not None:
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            "Enfants cumules",
            val_donnees_pac_agg,
            val_reelle_pac_agg if val_reelle_pac_agg is not None else "",
            "{:.2f}%".format((val_donnees_pac_agg / val_reelle_pac_agg - 1) *
                             100) if val_reelle_pac_agg is not None else "",
        ))
def test_useless_variables(
    input_h5="./Simulation_engine/dummy_data.h5",
    outfile_path=None,
    name_variables=("rfr", "irpp", "nbptr"),
    PERIOD=None,
):
    """Identify input columns that have no effect on the computed variables.

    Removes each column of the input data in turn, reruns the simulation and
    compares ``name_variables`` against the baseline.  Columns whose removal
    changes nothing are collected, then removed all at once as a sanity
    check; if the combined removal still reproduces the baseline, the
    trimmed dataset is written to ``outfile_path`` (default: ``input_h5``
    with a ``_useful`` suffix).

    Returns the list of column names found to be useless.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    pandas.options.mode.chained_assignment = None
    list_useless_variables = []
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    # Baseline: simulate with the full dataset.
    simulation_base_deciles, dictionnaire_datagrouped = simulation(
        PERIOD, DUMMY_DATA, TBS)
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = simulation_base_deciles.calculate(
            nv, PERIOD)
    # Remove one column at a time and compare against the baseline.
    for col in DUMMY_DATA.columns:
        if col == "wprm":  # we don't want to remove this one
            continue
        isdif = False
        data_wo_column = DUMMY_DATA[[
            k for k in DUMMY_DATA.columns if k != col
        ]]
        try:
            newsim, ddg2 = simulation(PERIOD, data_wo_column, TBS)
            resvar = {nv: {} for nv in name_variables}
            for nv in name_variables:
                df["{}_{}".format(nv, col)] = newsim.calculate(nv, PERIOD)
                resvar[nv]["countdif"] = len(df[
                    df["{}_{}".format(nv, col)] != df["{}_base".format(nv)]])
                isdif |= resvar[nv]["countdif"]
            if not isdif:
                list_useless_variables += [col]
            print(
                col,
                "is",
                "not" if isdif else "",
                "useless",
                "{}".format([resvar[nv]["countdif"] for nv in name_variables])
                if isdif else "",
            )
        except Exception:
            # Best-effort probing: a failing simulation means the column is
            # required as an input.
            print(col, "is definitely not useless")
    # Sanity check: removing ALL the useless columns at once should still
    # reproduce the baseline results.
    data_wo_useless = DUMMY_DATA[[
        k for k in DUMMY_DATA.columns if k not in list_useless_variables
    ]]
    # BUG FIX: the original resimulated with `data_wo_column` (the dataset of
    # the LAST single-column iteration) and compared columns keyed by the
    # stale loop variable `col`; we now simulate with `data_wo_useless` and
    # compare its own freshly computed results against the baseline.
    newsim, ddg2 = simulation(PERIOD, data_wo_useless, TBS)
    isdif = False
    for nv in name_variables:
        df["{}_wo_useless".format(nv)] = newsim.calculate(nv, PERIOD)
        isdif |= len(
            df[df["{}_wo_useless".format(nv)] != df["{}_base".format(nv)]])
    if isdif:
        print("Removing all variables at once didn't work, good luck with that")
    else:
        if outfile_path is None:
            outfile_path = input_h5.replace(".h5", "_useful.h5")
        data_wo_useless.to_hdf(outfile_path, key="input")
        print(
            "It seems lots of columns don't do anything. Data with only useful columns was exported to {}"
            .format(outfile_path))
    return list_useless_variables