def build_aggregates(): # writer = None years = range(2009, 2010) for year in years: yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() simu.set_survey() simu.compute() variables = ["garext", "ci_garext", "inthab", "ppe_brute", "rni"] x = simu.aggregated_by_entity("men", variables, all_output_vars=False) # df = x[0] # print df["ci_garext"].describe() agg = Aggregates() agg.set_simulation(simu) agg.show_default = False agg.show_real = False agg.show_diff = False agg.varlist = var_list # ERROR: var_list is undefined. agg.compute() cols = agg.aggr_frame.columns[:4] print agg.aggr_frame[cols].to_string() # if writer is None: # writer = ExcelWriter(str(fname_all)) # agg.aggr_frame.to_excel(writer, yr, index= False, header= True) del simu del agg import gc gc.collect()
def test(): from openfisca_core.simulations import SurveySimulation from .aggregates import Aggregates yr = 2006 simulation = SurveySimulation() simulation.set_config(year = yr) simulation.set_param() simulation.set_survey() calibration = Calibration() calibration.set_simulation(simulation) filename = "../../countries/france/calibrations/calib_2006.csv" calibration.set_inputs_margins_from_file(filename, 2006) calibration.set_param('invlo', 3) calibration.set_param('up', 3) calibration.set_param('method', 'logit') aggregates = Aggregates() aggregates.set_simulation(simulation) simulation.compute() aggregates.compute() print aggregates.aggr_frame.to_string() calibration.set_calibrated_weights() simulation.compute() aggregates.compute() print aggregates.aggr_frame.to_string()
def build_aggregates(): writer = None years = range(2006, 2010) for year in years: yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() simu.set_survey() inflator = get_loyer_inflator(year) simu.inflate_survey({'loyer': inflator}) simu.compute() agg = Aggregates() agg.set_simulation(simu) agg.compute() if writer is None: writer = ExcelWriter(str(fname_all)) agg.aggr_frame.to_excel(writer, yr, index=False, header=True, float_format="%.2f") print agg.aggr_frame.to_string() del simu del agg import gc gc.collect() writer.save()
def get_common_dataframe(variables, year=2006):
    """Return a household-level dataframe merging openfisca aggregates
    with the matching ERF values.

    Parameters:
        variables: list of variable names to compare.
        year: survey year.

    Returns a dataframe with one row per `idmen`, ERF columns suffixed
    with "_erf".
    """
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.set_survey()
    simulation.compute()
    # BUGFIX: the ERF table year was hard-coded to 2006, ignoring the
    # `year` argument; honour the parameter.
    erf = ErfsDataTable(year=year)
    # Make sure the identifier and weight columns are fetched too
    # (copy first so the caller's list is not mutated).
    erf_variables = list(variables)
    if "ident" not in erf_variables:
        erf_variables.append("ident")
    if "wprm" not in erf_variables:
        erf_variables.append("wprm")
    erf_dataframe = erf.get_values(erf_variables, table="menage")
    erf_dataframe.rename(columns={'ident': 'idmen'}, inplace=True)
    for col in erf_dataframe.columns:
        # BUGFIX: the original used `col is not "idmen"` -- an identity
        # test against a string literal, which is implementation
        # dependent; use a proper inequality.
        if col != "idmen":
            erf_dataframe.rename(columns={col: col + "_erf"}, inplace=True)

    of_dataframe, of_dataframe_default = simulation.aggregated_by_entity(
        "men", variables, all_output_vars=False, force_sum=True)
    del of_dataframe_default
    merged_df = of_dataframe.merge(erf_dataframe, on="idmen")
    del of_dataframe, erf_dataframe
    return merged_df
def survey_case(year): # fname = "Agg_%s.%s" %(str(yr), "xls") simulation = SurveySimulation() simulation.set_config(year=year, num_table=1, reforme=True) simulation.set_param() simulation.P.ir.autre.charge_loyer.plaf = 500 simulation.P.ir.autre.charge_loyer.active = 1 simulation.P.ir.autre.charge_loyer.plaf_nbp = 0 # plaf=1000 plaf_nbp=0: -42160, =1: -41292 # plaf=500 plaf_nbp=0: -43033, =1: -42292 # Bareme threshold reduction in pct reduc = .1 print simulation.P.ir.bareme print simulation.P.ir.bareme.nb for i in range(2, simulation.P.ir.bareme.nb): simulation.P.ir.bareme.setSeuil( i, simulation.P.ir.bareme.seuils[i] * (1 - reduc)) print simulation.P.ir.bareme print simulation.P.ir.bareme.nb simulation.compute() # Compute aggregates agg = Aggregates() agg.set_simulation(simulation) agg.compute() df1 = agg.aggr_frame print df1.to_string() return
def test_chunk(): print "debut" writer = None years = range(2011, 2012) filename = destination_dir + 'output3.h5' store = HDFStore(filename) for year in years: yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() import time tps = {} for nb_chunk in range(1, 5): deb_chunk = time.clock() simu.set_config(survey_filename='C:\\Til\\output\\to_run_leg.h5', num_table=3, chunks_count=nb_chunk, print_missing=False) simu.compute() tps[nb_chunk] = time.clock() - deb_chunk voir = simu.output_table.table3['foy'] print len(voir) pdb.set_trace() agg3 = Aggregates() agg3.set_simulation(simu) agg3.compute() df1 = agg3.aggr_frame print df1.to_string() print tps store.close()
def convert_to_3_tables(year=2006, survey_file=None, output_file=None): if survey_file is None: raise Exception( 'You need a .h5 file with the survey to extract the variables from' ) if output_file is None: output_file = survey_file raise Warning( 'the survey file will be used to store the created tables') store = HDFStore(survey_file) output = HDFStore(output_file) print output simulation = SurveySimulation() simulation.set_config(year=year) table1 = store['survey_' + str(year)] for entity in ['ind', 'foy', 'men', 'fam']: key = 'survey_' + str(year) + '/' + str(entity) vars_matching_entity = vars_matching_entity_from_table( table1, simulation, entity) print entity, vars_matching_entity_from_table print 'table1 enum' if entity == 'ind': print 'INDIVIDUALS' print table1['noindiv'] table_entity = table1.loc[:, vars_matching_entity] # we take care have all ident and selecting qui==0 else: # print ' entity :', entity # print table1['noindiv'].head() position = 'qui' + entity # print table1[position] table_entity = table1.ix[table1[position] == 0, [ 'noi', 'idmen', 'idfoy', 'idfam', 'quifoy', 'quimen', 'quifam' ] + vars_matching_entity] # print table_entity.noi.head() table_entity = table_entity.rename_axis(table_entity['id' + entity], axis=1) # print ' APRES' # print table_entity.noi.head() print key output.put(key, table_entity) del table1 import gc gc.collect() store.close() output.close()
def test3():
    """Check input-table consistency against the EEC menage table."""
    survey_year = 2006
    collection = DataCollection(year=survey_year)
    menage_df = collection.get_of_values(table="eec_menage")

    from openfisca_core.simulations import SurveySimulation
    simulation = SurveySimulation()
    simulation.set_config(year=survey_year)
    simulation.set_param()
    simulation.compute()
    # TODO: this should not be mandatory
    check_consistency(simulation.input_table, menage_df)
def test(): yr = 2006 simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() filename = os.path.join(model.DATA_DIR, 'survey_psl.h5') simu.set_survey(filename=filename) simu.compute() df = get_structure(simu, 'br_al') print df.to_string()
def toto(): year = 2006 simulation = SurveySimulation() simulation.set_config(year=year) simulation.set_param() simulation.set_survey() simulation.compute() for name, col in simulation.output_table.column_by_name.iteritems(): print col.name print col._dtype print col.entity
def test(): for year in range(2006, 2010): yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() simu.set_survey() for var in ["f4ga", "f4gb", "f4gc", "f4ge", "f4gf", "f4gg"]: print var df = simu.survey.get_value(var) print df.max() print df.min()
def test(): country = "france" for year in range(2006, 2010): yr = str(year) simu = SurveySimulation() simu.set_config(year=yr, country=country) simu.set_param() agg = Aggregates() agg.set_simulation(simu) for col in agg.varlist: print col
def survey_case(year=2006): yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simulation = SurveySimulation() survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5') simulation.set_config(year=yr, survey_filename=survey_filename) simulation.set_param() # Ignore this # inflator = get_loyer_inflator(year) # simulation.inflate_survey({'loyer' : inflator}) simulation.compute() simul_out_df = simulation.output_table.table simul_in_df = simulation.input_table.table print simul_out_df.loc[:, ['af', 'af_base', 'af_forf', 'af_majo', 'af_nbenf' ]].describe() print 'input vars' print simul_in_df.columns print 'output vars' print simul_out_df.columns # check_inputs_enumcols(simulation) # Compute aggregates agg = Aggregates() agg.set_simulation(simulation) agg.compute() df1 = agg.aggr_frame print df1.columns print df1.to_string() # Saving aggregates # if writer is None: # writer = ExcelWriter(str(fname) # agg.aggr_frame.to_excel(writer, yr, index= False, header= True) # Displaying a pivot table from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable pivot_table = OpenfiscaPivotTable() pivot_table.set_simulation(simulation) df2 = pivot_table.get_table(by='so', vars=['nivvie']) print df2.to_string() return df1
def survey_case(year): yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simulation = SurveySimulation() simulation.set_config(year=yr, num_table=1) simulation.set_param() simulation.compute() df = simulation.get_variables_dataframe(variables=["rsa_act"], entity='ind') print df["rsa_act"].describe() del simulation import gc gc.collect()
def _generate_aggregates(self):
    """Build OF and ERFS aggregates dataframes for every year in
    ``self.years`` and store them on the instance."""
    of_frames = list()
    erfs_frames = list()
    for year in self.years:
        # Running a standard SurveySimulation to get OF aggregates
        simulation = SurveySimulation()
        simulation.set_config(year=year,
                              survey_filename=self.survey_filename)
        simulation.set_param()
        simulation.compute()

        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()
        frame = agg.aggr_frame
        frame['year'] = year
        label_by_name = dict(
            (name, column.label)
            for name, column
            in simulation.output_table.column_by_name.iteritems())
        #colonnes = simulation.output_table.table.columns
        of_frames.append(frame)
        variables = agg.varlist
        labels_variables = [label_by_name[variable]
                            for variable in variables]
        del simulation, agg, frame
        # simulation.save_content(name, filename)
        gc.collect()

        # ERFS side: same aggregates computed from the raw survey data.
        erfs_frame = build_erf_aggregates(variables=variables, year=year)
        erfs_frame.rename(columns=label_by_name, inplace=True)
        erfs_frame = erfs_frame.T
        erfs_frame.reset_index(inplace=True)
        erfs_frame['year'] = year
        erfs_frames.append(erfs_frame)
        del erfs_frame
        gc.collect()

    self.labels_variables = labels_variables
    self.aggregates_of_dataframe = of_frames
    self.aggregates_erfs_dataframe = erfs_frames
def build_erf_aggregates(): """ Fetch the relevant aggregates from erf data """ # Uses rpy2. # On MS Windows, The environment variable R_HOME and R_USER should be set import pandas.rpy.common as com import rpy2.rpy_classic as rpy rpy.set_default_mode(rpy.NO_CONVERSION) country = 'france' for year in range(2006, 2008): menageXX = "menage" + str(year)[2:] menageRdata = menageXX + ".Rdata" filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf', str(year), menageRdata) yr = str(year) simu = SurveySimulation() simu.set_config(year=yr, country=country) simu.set_param() agg = Aggregates() agg.set_simulation(simu) # print agg.varlist rpy.r.load(filename) menage = com.load_data(menageXX) cols = [] print year for col in agg.varlist: #print col erf_var = "m_" + col + "m" if erf_var in menage.columns: cols += [erf_var] df = menage[cols] wprm = menage["wprm"] for col in df.columns: tot = (df[col] * wprm).sum() / 1e9 print col, tot
def test_gini(): """ Compute Gini coefficients """ years = range(2006, 2010) for year in years: yr = str(year) # fname = "Agg_%s.%s" %(str(yr), "xls") simu = SurveySimulation() simu.set_config(year=yr) simu.set_param() inflator = get_loyer_inflator(year) simu.inflate_survey({'loyer': inflator}) simu.compute() inequality = Inequality() inequality.set_simulation(simu) inequality.compute() print inequality.inequality_dataframe print inequality.poverty del simu
def survey_case_3_tables(): year = 2006 yr = str(year) simulation = SurveySimulation() survey_input = HDFStore(survey3_test) # convert_to_3_tables(year=year, survey_file=survey_file, output_file=survey3_file) df_men = survey_input['survey_2006/men'] df_foy = survey_input['survey_2006/foy'] df_fam = survey_input['survey_2006/fam'] df_fam['alr'] = 0 survey_input['survey_2006/fam'] = df_fam simulation.num_table = 3 simulation.set_config(year=yr, survey_filename=survey3_test) simulation.set_param() simulation.compute() # Compute aggregates agg = Aggregates() agg.set_simulation(simulation) agg.compute() df1 = agg.aggr_frame print df1.to_string() # # Saving aggregates # if writer is None: # writer = ExcelWriter(str(fname) # agg.aggr_frame.to_excel(writer, yr, index= False, header= True) # Displaying a pivot table from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable pivot_table = OpenfiscaPivotTable() pivot_table.set_simulation(simulation) df2 = pivot_table.get_table(by='so', vars=['nivvie']) print df2.to_string()
def build_aggregates3():
    """Benchmark the 3-table vs 1-table survey pipelines.

    For each year, runs the simulation with ``num_table=3`` then with
    ``num_table=1``, exports the aggregates and per-entity CSV extracts,
    and prints the elapsed time of each variant.
    """
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        import time

        # --- 3-table run -------------------------------------------------
        deb3 = time.clock()
        simu.set_survey(num_table=3)
        simu.compute()
        fin3 = time.clock()
        print "coucou"
        col = simu.survey.column_by_name.get("so")
        print col.entity
        agg3 = Aggregates3()
        agg3.set_simulation(simu)
        agg3.compute()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)
        # NOTE(review): `num_output` is not defined in this function --
        # presumably a module-level constant capping the exported rows;
        # confirm it exists at module scope.
        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            ## simu.survey.table3[ent].to_csv(dir_name)
            # import pdb
            # pdb.set_trace()
            ## com.convert_to_r_dataframe
            simu.output_table.table3[ent][:num_output].to_csv(dir_name)

        # --- 1-table run -------------------------------------------------
        deb1 = time.clock()
        simu.set_survey(num_table=1)
        print "prob compute"
        simu.compute()
        fin1 = time.clock()
        dir_name = destination_dir + 'en1' + '.csv'
        print "prob output"
        simu.output_table.table[:num_output].to_csv(dir_name)
        agg = Aggregates()
        print "prob set"
        agg.set_simulation(simu)
        print "prob compute"
        agg.compute()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        print "prob ind"
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        del simu
        del agg
        import gc
        gc.collect()
        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
def compar_num_table():
    """Compare the 1-table and 3-table survey pipelines on (optionally)
    a household subset.

    Exports matching per-entity CSV extracts (to be compared in R) for
    both variants and prints the elapsed time of each.
    """
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    filename = destination_dir + 'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        import time

        deb3 = time.clock()
        # Successive candidate household subsets kept for reference; only
        # the last assignment (None == full sample) is effective.
        sous_ech = [6000080, 6000080, 6000195, 6000195, 6000288, 6000288,
                    6000499, 6000499, 6000531, 6000531, 6000542, 6000542]
        sous_ech = [6000191, 6000191, 6000531, 6000614, 6000195, 6000195,
                    6000499, 6000499, 6000531, 6000614, 6000531, 6000614,
                    6000531, 6000531, 6000195, 6000195, 6000288, 6000288,
                    6000499, 6000499, 6000531, 6000542, 6000542, 6000614,
                    6000191]
        #al
        sous_ech = [6000122, 6000865, 6001256]
        # typ_men
        sous_ech = [6006630, 6006753, 6008508]
        # foy
        sous_ech = [6036028, 6028397, 6019248]
        sous_ech = None

        # --- 3-table run -------------------------------------------------
        simu.set_survey(num_table=3, subset=sous_ech)
        simu.compute()

        agg3 = Aggregates()
        # NOTE(review): the renamed `tab` is discarded each iteration --
        # this loop has no lasting effect; kept as-is.
        for ent in ['ind', 'men', 'foy', 'fam']:
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            tab = tab.rename(columns=renam)
        agg3.set_simulation(simu)
        agg3.compute()
        fin3 = time.clock()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        # export to csv to run compar in R
        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            if ent == 'ind':
                ident = ["idmen", "quimen", "idfam", "quifam",
                         "idfoy", "quifoy"]
            else:
                ident = ["idmen", "idfam", "idfoy"]
            for nom in ident:
                renam[nom + '_' + ent] = nom
            tab = tab.rename(columns=renam)
            order_var = ident + list(tab.columns - ident)
            tab.sort(['idmen', 'idfam', 'idfoy']).ix[
                :num_output, order_var].to_csv(dir_name)

        # --- 1-table run -------------------------------------------------
        deb1 = time.clock()
        simu.set_survey(num_table=1, subset=sous_ech)
        simu.compute()
        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()
        fin1 = time.clock()

        # export to csv to run compar in R
        dir_name = destination_dir + 'en1' + '.csv'
        tab = simu.output_table.table
        tab.drop(['idfam_fam', 'idfam_foy', 'idfam_men', 'idfoy_fam',
                  'idfoy_foy', 'idfoy_men', 'idmen_men', 'idmen_fam',
                  'idmen_foy', 'wprm_foy', 'wprm_fam'],
                 axis=1, inplace=True)
        renam = {}
        ent = 'ind'
        renam['wprm_' + ent] = 'wprm'
        ident = ["noi", "idmen", "quimen", "idfam", "quifam",
                 "idfoy", "quifoy"]
        for nom in ident:
            renam[nom + '_' + ent] = nom
        tab = tab.rename(columns=renam)
        order_var = ident + list(tab.columns - ident)
        tab.sort(['idmen', 'idfam', 'idfoy']).ix[
            :num_output, order_var].to_csv(dir_name)
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        del simu
        del agg
        import gc
        gc.collect()
        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
def test(year=2006, variables=['af']):
    """Debug a set of OF variables (default: 'af') against ERF data.

    Builds household-level comparison tables between the simulation
    output and the ERF survey, prints the largest discrepancies and,
    for each compared prestation, inspects its children and (recursively)
    its parents together with the relevant individual-level options.

    NOTE(review): the mutable default `variables=['af']` is kept as-is;
    the list is rebound (not mutated) below, so it is harmless here.
    """
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    # of_aggregates = Aggregates()
    # of_aggregates.set_simulation(simulation)
    # of_aggregates.compute()
    # print of_aggregates.aggr_frame
    #
    # from openfisca_france.data.erf.aggregates import build_erf_aggregates
    # temp = (build_erf_aggregates(variables=variables, year= year))
    # print temp
    # return
    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        # Depth-first collection of the given columns plus all of their
        # `_parents`, recursively.
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]] + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]]
                        + get_all_ancestors(list(varlist[0]._parents))
                        + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're
    # going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x),
                  variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name,
                                          varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        # Household-level aggregation of a single variable.
        variables = [variable]
        return simulation.aggregated_by_entity(entity="men",
                                               variables=variables,
                                               all_output_vars=False,
                                               force_sum=True)[0]

    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(
            get_var(var)[['idmen', var]], on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom et parfois sont les variables
    utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables
    principales ( au moins les aggrégats que l'on compare )
    Si les variables ne sont pas directement dans la table, elles ont été
    calculées à partir d'autres variables de données erf/eec donc chercher
    dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/
    chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''

    fetch_eec = ['statut', 'titc', 'chpub', 'encadr', 'prosa', 'age',
                 'naim', 'naia', 'noindiv']
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, \
        "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage
    # ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")
    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:
    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = \
        100 * simulation.output_table.table.idmen_ind + \
        simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = \
        simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in
          set(options).intersection(
              set(simulation.output_table.table.columns))] + \
        ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen',
                                        'quimen_ind': 'quimen'},
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, \
            "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in
              (set(options).intersection(
                  set(simulation.input_table.table.columns)) - set(s1))] + \
            ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on='noindiv',
                                                        how='inner',
                                                        sort=False)
        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[
        list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, \
        'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)
    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), \
        "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), \
        "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(
        set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        temp = set(erf_menage['idmen'][erf_menage[col] !=
                                       simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp)))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on='idmen',
                     how='inner', suffixes=('_erf', '_of'))
    print 'Length of new dataframe is %s' % str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []
    options_met = []
    for col in colcom:
        bigtemp = None
        table = bigtable[and_(bigtable[col + '_erf'] != 0,
                              bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']) / \
            table[col + '_erf']  #Difference relative
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' % (
            col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' % (
            col, str(table[col].max()))
        print table[col].describe()
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1, 11),
                              table['wprm_of'], 2, return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col], np.arange(1, 101),
                              table['wprm_erf'], 2, return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {'table': table[[col, col + '_of', col + '_erf',
                                       'idmen']][0:10],
                       'options': None}
        bigtemp['table'][col + 'div'] = \
            bigtemp['table'][col + '_of'] / bigtemp['table'][col + '_erf']
        print bigtemp['table'].to_string()

        '''
        bigtemp is the table which will get filled little by little by
        the relevant variables. Up to the last rows of code 'table'
        refers to a table of aggregated values, while 'options is a
        table of individual variables. The reason we call it in a
        dictionnary is also because we modify it inside the recursive
        function 'iter_on parents', and it causes an error in Python
        unless for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []
                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(
                        simu_aggr_tables.columns).union(erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched

            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                # Recursively walk up the `_parents` chain, merging each
                # ancestor's relevant columns/options into bigtemp.
                if (varcol._parents == set() and varcol._option == {}) \
                        or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []
                    # NOTE(review): these conditions test and iterate
                    # `par_fetched` (still empty here) instead of
                    # `par_to_fetch` -- looks like a latent bug; kept
                    # byte-identical.
                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(
                            simu_aggr_tables.columns).union(
                                erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                                elif var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                        if len(par_fetched) > 0:
                            #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                            pass
                        else:
                            #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                            pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp, bigtemp['table'],
                                                    how='inner')
                        #print temp.to_string(), "\n"

                    if varcol._option != {} and not set(
                            varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(set(varcol._option.keys()) -
                                             set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0, 10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[
                            ['idmen', 'quimen', 'noindiv'] + vars_to_fetch][
                                simu_nonaggr_tables['idmen'].isin(
                                    table['idmen'][0:10])]
                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1, 10):
                            temp_sorted = temp_sorted.append(
                                temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(
                                erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(
                                temp_sorted,
                                on=['noindiv', 'idmen', 'quimen'],
                                how='outer')
                        # temp_sorted.set_index(['idmen', 'quimen'],
                        #                       drop = True, inplace = True)
                        # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)

            # We merge the aggregate table with the option table
            # ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(
                bigtemp['options'], how='left', on='idmen',
                suffixes=('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col,
            # common entities
            bigtemp['table'].sort(columns=['af', 'quimen'],
                                  ascending=[False, True], inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(
                ['idmen', 'quimen'], sort=False).sum()
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
# NOTE(review): the four statements below duplicate the tail of the
# debugging loop in test() above and reference its loop-local names
# (`col`, `bigtemp`); executed at module level they would raise a
# NameError -- this looks like a stray copy/paste fragment to remove.
print "Table of values for %s dependencies : \n" % col
print bigtemp['table'].to_string()
del bigtemp['table'], bigtemp['options']
gc.collect()


if __name__ == '__main__':
    # Script entry point: run (or reload) a debug simulation and show
    # the aggregates of 'af'.
    restart = True
    survey = 'survey.h5'
    save_path = os.path.join(model.DATA_DIR, 'erf')
    saved_simulation_filename = os.path.join(save_path,
                                             'debugger_' + survey[:-3])
    if restart:
        year = 2006
        simulation = SurveySimulation()
        if survey == 'survey.h5':
            survey_filename = os.path.join(model.DATA_DIR, survey)
        else:
            survey_filename = os.path.join(model.DATA_DIR, 'sources', survey)
        simulation.set_config(year=year, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()
        simulation.save_content('debug', saved_simulation_filename)
    deb = Debugger()
    deb.set_simulation(name='debug', filename=saved_simulation_filename)
    deb.set_variable('af')
    deb.show_aggregates()
    deb.preproc()
def check_survey(year=2013):
    """Smoke-test a SurveySimulation run for the given year.

    Builds a simulation with the default legislation parameters and the
    default survey, then runs the full computation.

    Parameters
    ----------
    year : int, default 2013
        Legislation/survey year passed to ``set_config``.

    Returns
    -------
    SurveySimulation
        The computed simulation, so callers can inspect its outputs.
        (Previously the simulation was discarded, which made the check
        useless beyond "it did not raise"; returning it is backward
        compatible since the old return value was ``None`` and ignored.)
    """
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.compute()
    return simulation
def test_laurence():
    '''
    Computes the openfisca/real numbers comparaison table in excel worksheet.
    Warning: To add more years you'll have to twitch the code manually.
    Default is years 2006 to 2009 included.
    '''
    def save_as_xls(df, alter_method=True):
        # Saves a datatable under Excel table using XLtable
        if alter_method:
            # NOTE(review): "C:\desindexation.xls" contains an unescaped
            # backslash; "\d" is not a recognised escape so it survives as-is
            # in Python 2, but a raw string would be safer — TODO confirm.
            filename = "C:\desindexation.xls"
            print filename
            writer = ExcelWriter(str(filename))
            df.to_excel(writer)
            writer.save()
        else:
            # XLtable is useful for colouring/styling the table, unnecessary otherwise
            stxl = XLtable(df)
            # <========== HERE TO CHANGE OVERLAY ======>
            wb = xlwt.Workbook()
            ws = wb.add_sheet('resultatstest')
            erfxcel = stxl.place_table(ws)
            try:  # I dunno more clever commands
                wb.save("C:\outputtest.xls")
            except:
                # Fall back to a randomised filename (e.g. when the target
                # file is locked by an open Excel instance).
                n = random.randint(0, 100)
                wb.save("C:\outputtest_" + str(n) + ".xls")

    #===============================================================================
    # from numpy.random import randn
    # mesures = ['cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
    #            'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
    #            'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add']
    # sources = ['of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
    #            'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
    #            'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel']
    # year = ['2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006',
    #         '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007',
    #         '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008']
    # ind = zip(*[mesures,sources, year])
    # # print ind
    # from pandas.core.index import MultiIndex
    # ind = MultiIndex.from_tuples(ind, names = ['mesure', 'source', 'year'])
    # # print ind
    # d = pd.DataFrame(randn(27,2), columns = ['Depenses', 'Recettes'], index = ind)
    # d.reset_index(inplace = True, drop = False)
    # d = d.groupby(by = ['mesure', 'source', 'year'], sort = False).sum()
    # print d
    # d_unstacked = d.unstack()
    # print d
    # indtemp1 = d.index.get_level_values(0)
    # indtemp2 = d.index.get_level_values(1)
    # indexi = zip(*[indtemp1, indtemp2])
    # print indexi
    # indexi_bis = []
    # for i in xrange(len(indexi)):
    #     if indexi[i] not in indexi_bis:
    #         indexi_bis.append(indexi[i])
    # indexi = indexi_bis
    # indexi = MultiIndex.from_tuples(indexi, names = ['Mesure', 'source'])
    # print indexi
    # d_unstacked = d_unstacked.reindex_axis(indexi, axis = 0)
    # print d_unstacked.to_string()
    # save_as_xls(d_unstacked)
    # return
    #===============================================================================

    def reshape_tables(dfs, dfs_erf):
        # Merges the per-year openfisca tables (dfs) and ERF tables (dfs_erf)
        # into a single frame indexed by (Mesure, source) with years as columns.
        # NOTE(review): relies on `labels_variables` from the enclosing scope,
        # which is (re)bound inside the data-fetching loop below — so this must
        # only be called after that loop has run.
        agg = Aggregates()  # We need this for the columns labels to work
        print 'Resetting index to avoid later trouble on manipulation'
        for d in dfs:
            d.reset_index(inplace=True)
            d.set_index('Mesure', inplace=True, drop=False)
            # NOTE(review): reindex_axis is not in-place and its result is
            # discarded here — looks like a no-op; confirm intent.
            d.reindex_axis(labels_variables, axis=0)
            d.reset_index(inplace=True, drop=True)
            # print d.to_string()
        for d in dfs_erf:
            d.reset_index(inplace=True)
            d['Mesure'] = agg.labels['dep']
            d.set_index('index', inplace=True, drop=False)
            # NOTE(review): same discarded reindex_axis result as above.
            d.reindex_axis(agg.labels.values(), axis=0)
            d.reset_index(inplace=True, drop=True)
            # print d.to_string()

        # Concatenating the openfisca tables for =/= years
        temp = pd.concat([dfs[0], dfs[1]], ignore_index=True)
        temp = pd.concat([temp, dfs[2]], ignore_index=True)
        temp = pd.concat([temp, dfs[3]], ignore_index=True)
        del temp[agg.labels['entity']], temp['index']
        gc.collect()

        print 'We split the real aggregates from the of table'
        # temp keeps the openfisca figures; temp2 gets the "real" ones,
        # renamed to the plain benef/dep labels so both stack into the
        # same columns, distinguished by the 'source' column.
        temp2 = temp[[
            agg.labels['var'], agg.labels['benef_real'], agg.labels['dep_real'],
            'year'
        ]]
        del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
        temp['source'] = 'of'
        temp2['source'] = 'reel'
        temp2.rename(columns={
            agg.labels['benef_real']: agg.labels['benef'],
            agg.labels['dep_real']: agg.labels['dep']
        }, inplace=True)
        temp = pd.concat([temp, temp2], ignore_index=True)

        print 'We add the erf data to the table'
        for df in dfs_erf:
            # 'level_0' is the artefact column left by the earlier reset_index.
            del df['level_0'], df['Mesure']
            df.rename(columns={
                'index': agg.labels['var'],
                1: agg.labels['dep']
            }, inplace=True)
        temp3 = pd.concat([dfs_erf[0], dfs_erf[1]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[2]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[3]], ignore_index=True)
        temp3['source'] = 'erfs'
        gc.collect()
        temp = pd.concat([temp, temp3], ignore_index=True)
        # print temp.to_string()

        print 'Index manipulation to reshape the output'
        temp.reset_index(drop=True, inplace=True)
        # We set the new index
        # temp.set_index('Mesure', drop = True, inplace = True)
        # temp.set_index('source', drop = True, append = True, inplace = True)
        # temp.set_index('year', drop = False, append = True, inplace = True)
        temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum()
        # Tricky, the [mesure, source, year] index is unique so sum() will return the only value
        # Groupby automatically deleted the source, mesure... columns and added them to index
        assert (isinstance(temp, pd.DataFrame))
        # print temp.to_string()

        # We want the years to be in columns, so we use unstack
        temp_unstacked = temp.unstack()
        # Unfortunately, unstack automatically sorts rows and columns, we have to reindex the table :
        ## Reindexing rows
        from pandas.core.index import MultiIndex
        indtemp1 = temp.index.get_level_values(0)
        indtemp2 = temp.index.get_level_values(1)
        indexi = zip(*[indtemp1, indtemp2])
        # Deduplicate (Mesure, source) pairs while preserving first-seen order,
        # so the unstacked rows come back in the original (unsorted) order.
        indexi_bis = []
        for i in xrange(0, len(indexi)):
            if indexi[i] not in indexi_bis:
                indexi_bis.append(indexi[i])
        indexi = indexi_bis
        del indexi_bis
        indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
        # import pdb
        # pdb.set_trace()
        temp_unstacked = temp_unstacked.reindex_axis(
            indexi, axis=0)  # axis = 0 for rows, 1 for columns

        ## Reindexing columns
        # TODO : still not working
        col_indexi = []
        for col in temp.columns.get_level_values(0).unique():
            for yr in range(2006, 2010):
                col_indexi.append((col, str(yr)))
        col_indexi = MultiIndex.from_tuples(col_indexi)
        # print col_indexi
        # print temp_unstacked.columns
        print col_indexi
        # temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis = 1)

        # Our table is ready to be turned to Excel worksheet !
        print temp_unstacked.to_string()
        temp_unstacked.fillna(0, inplace=True)
        return temp_unstacked

    # Fetch one openfisca aggregate table and one ERF aggregate table per year.
    dfs = []
    dfs_erf = []
    for i in range(2006, 2010):
        year = i
        yr = str(i)
        # Running a standard SurveySim to get aggregates
        simulation = SurveySimulation()
        survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
        simulation.set_config(year=yr, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()
        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()

        df = agg.aggr_frame
        df['year'] = year
        # Map internal variable names to their human-readable labels.
        label_by_name = dict(
            (name, column.label)
            for name, column in simulation.output_table.column_by_name.iteritems())
        #colonnes = simulation.output_table.table.columns
        dfs.append(df)
        variables = agg.varlist
        # NOTE(review): labels_variables is rebound every iteration; only the
        # last year's value is seen by reshape_tables — confirm that variable
        # labels are identical across years.
        labels_variables = [label_by_name[variable] for variable in variables]
        del simulation, agg, df
        gc.collect()

        #Getting ERF aggregates from ERF table
        temp = build_erf_aggregates(variables=variables, year=year)
        temp.rename(columns=label_by_name, inplace=True)
        temp = temp.T
        temp.reset_index(inplace=True)
        temp['year'] = year
        dfs_erf.append(temp)
        del temp
        gc.collect()
        print 'Out of data fetching for year ' + str(year)
    print 'Out of data fetching'

    datatest = reshape_tables(dfs, dfs_erf)
    save_as_xls(datatest, alter_method=False)