def build_aggregates():
    # Assumption: fname_all is not defined anywhere in the original function;
    # an output path under destination_dir is supplied here so the code runs.
    fname_all = os.path.join(destination_dir, 'aggregates.xls')
    writer = None
    years = range(2006, 2010)
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" % (str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer': inflator})
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        if writer is None:
            writer = ExcelWriter(str(fname_all))
        agg.aggr_frame.to_excel(writer, yr, index=False, header=True,
                                float_format="%.2f")
        print agg.aggr_frame.to_string()

        del simu
        del agg
        import gc
        gc.collect()
    writer.save()
def build_aggregates():
    # writer = None
    years = range(2009, 2010)
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" % (str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        simu.compute()

        variables = ["garext", "ci_garext", "inthab", "ppe_brute", "rni"]
        x = simu.aggregated_by_entity("men", variables, all_output_vars=False)
        # df = x[0]
        # print df["ci_garext"].describe()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.show_default = False
        agg.show_real = False
        agg.show_diff = False
        # The original assigned the undefined name var_list here; the
        # `variables` list defined above is the likely intent.
        agg.varlist = variables
        agg.compute()

        cols = agg.aggr_frame.columns[:4]
        print agg.aggr_frame[cols].to_string()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        # agg.aggr_frame.to_excel(writer, yr, index=False, header=True)

        del simu
        del agg
        import gc
        gc.collect()
def test_chunk():
    print "debut"
    writer = None
    years = range(2011, 2012)
    filename = destination_dir + 'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" % (str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()

        import time
        tps = {}
        for nb_chunk in range(1, 5):
            deb_chunk = time.clock()
            simu.set_config(survey_filename='C:\\Til\\output\\to_run_leg.h5',
                            num_table=3, chunks_count=nb_chunk,
                            print_missing=False)
            simu.compute()
            tps[nb_chunk] = time.clock() - deb_chunk

        voir = simu.output_table.table3['foy']
        print len(voir)
        pdb.set_trace()

        agg3 = Aggregates()
        agg3.set_simulation(simu)
        agg3.compute()
        df1 = agg3.aggr_frame
        print df1.to_string()
        print tps
    store.close()
def survey_case(year):
    # fname = "Agg_%s.%s" % (str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()

    # Ignore this
    # inflator = get_loyer_inflator(year)
    # simulation.inflate_survey({'loyer': inflator})

    simulation.compute()

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()
    df1 = agg.aggr_frame
    print df1.to_string()

    # Saving aggregates
    # if writer is None:
    #     writer = ExcelWriter(str(fname))
    # agg.aggr_frame.to_excel(writer, yr, index=False, header=True)

    # Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by='so', vars=['nivvie'])
    print df2.to_string()
def survey_case(year=2006):
    yr = str(year)
    # fname = "Agg_%s.%s" % (str(yr), "xls")
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=yr, survey_filename=survey_filename)
    simulation.set_param()

    # Ignore this
    # inflator = get_loyer_inflator(year)
    # simulation.inflate_survey({'loyer': inflator})

    simulation.compute()

    simul_out_df = simulation.output_table.table
    simul_in_df = simulation.input_table.table
    print simul_out_df.loc[:, ['af', 'af_base', 'af_forf', 'af_majo', 'af_nbenf']].describe()
    print 'input vars'
    print simul_in_df.columns
    print 'output vars'
    print simul_out_df.columns
    # check_inputs_enumcols(simulation)

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()
    df1 = agg.aggr_frame
    print df1.columns
    print df1.to_string()

    # Saving aggregates
    # if writer is None:
    #     writer = ExcelWriter(str(fname))
    # agg.aggr_frame.to_excel(writer, yr, index=False, header=True)

    # Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by='so', vars=['nivvie'])
    print df2.to_string()
    return df1
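# Usage sketch (not part of the original file): run the 2006 survey case and
# keep the aggregates frame it returns. This assumes the test survey file
# referenced above exists under model.DATA_DIR.
if __name__ == '__main__':
    aggregates_2006 = survey_case(year=2006)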
def _generate_aggregates(self):
    dfs = list()
    dfs_erf = list()
    years = self.years
    for year in years:
        # Running a standard SurveySimulation to get OF aggregates
        simulation = SurveySimulation()
        survey_filename = self.survey_filename
        simulation.set_config(year=year, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()

        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()
        df = agg.aggr_frame
        df['year'] = year
        label_by_name = dict(
            (name, column.label)
            for name, column in simulation.output_table.column_by_name.iteritems()
            )
        # colonnes = simulation.output_table.table.columns
        dfs.append(df)
        variables = agg.varlist
        labels_variables = [label_by_name[variable] for variable in variables]

        del simulation, agg, df
        # simulation.save_content(name, filename)
        gc.collect()

        # ERFS
        temp = build_erf_aggregates(variables=variables, year=year)
        temp.rename(columns=label_by_name, inplace=True)
        temp = temp.T
        temp.reset_index(inplace=True)
        temp['year'] = year
        dfs_erf.append(temp)

        del temp
        gc.collect()

    self.labels_variables = labels_variables
    self.aggregates_of_dataframe = dfs
    self.aggregates_erfs_dataframe = dfs_erf
def show_aggregates(self):
    from openfisca_france.data.erf.aggregates import build_erf_aggregates
    assert self.simulation is not None, 'simulation attribute is None'
    assert self.variable is not None, 'variable attribute is None'
    variable = self.variable

    of_aggregates = Aggregates()
    of_aggregates.set_simulation(self.simulation)
    of_aggregates.compute()

    temp = build_erf_aggregates(variables=[variable],
                                year=self.simulation.datesim.year)
    selection = of_aggregates.aggr_frame["Mesure"] == self.simulation.io_column_by_name[variable].label
    print of_aggregates.aggr_frame[selection]
    print temp
    # TODO: clean this
    return
def survey_case_3_tables():
    year = 2006
    yr = str(year)
    simulation = SurveySimulation()
    survey_input = HDFStore(survey3_test)
    # convert_to_3_tables(year=year, survey_file=survey_file, output_file=survey3_file)
    df_men = survey_input['survey_2006/men']
    df_foy = survey_input['survey_2006/foy']
    df_fam = survey_input['survey_2006/fam']
    df_fam['alr'] = 0
    survey_input['survey_2006/fam'] = df_fam

    simulation.num_table = 3
    simulation.set_config(year=yr, survey_filename=survey3_test)
    simulation.set_param()
    simulation.compute()

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()
    df1 = agg.aggr_frame
    print df1.to_string()

    # Saving aggregates
    # if writer is None:
    #     writer = ExcelWriter(str(fname))
    # agg.aggr_frame.to_excel(writer, yr, index=False, header=True)

    # Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by='so', vars=['nivvie'])
    print df2.to_string()
def test_laurence():
    '''
    Computes the openfisca/real numbers comparison table in an Excel worksheet.
    Warning: to add more years you will have to tweak the code manually.
    Default is years 2006 to 2009 included.
    '''
    def save_as_xls(df, alter_method=True):
        # Saves a datatable as an Excel table, using XLtable in the alternate branch
        if alter_method:
            filename = "C:\\desindexation.xls"
            print filename
            writer = ExcelWriter(str(filename))
            df.to_excel(writer)
            writer.save()
        else:
            # XLtable is useful for adding colours, borders, etc. to the table; unnecessary otherwise
            stxl = XLtable(df)
            # <========== HERE TO CHANGE OVERLAY ======>
            wb = xlwt.Workbook()
            ws = wb.add_sheet('resultatstest')
            erfxcel = stxl.place_table(ws)
            try:
                wb.save("C:\\outputtest.xls")
            except:
                # Crude fallback: if the save fails (e.g. the file is open), retry with a random suffix
                n = random.randint(0, 100)
                wb.save("C:\\outputtest_" + str(n) + ".xls")

    #===============================================================================
    # from numpy.random import randn
    # mesures = ['cotsoc', 'af', 'add', 'cotsoc', 'af', 'add', 'cotsoc', 'af', 'add',
    #            'cotsoc', 'af', 'add', 'cotsoc', 'af', 'add', 'cotsoc', 'af', 'add',
    #            'cotsoc', 'af', 'add', 'cotsoc', 'af', 'add', 'cotsoc', 'af', 'add']
    # sources = ['of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
    #            'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
    #            'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel']
    # year = ['2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006',
    #         '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007',
    #         '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008']
    # ind = zip(*[mesures, sources, year])
    # # print ind
    # from pandas.core.index import MultiIndex
    # ind = MultiIndex.from_tuples(ind, names=['mesure', 'source', 'year'])
    # # print ind
    # d = pd.DataFrame(randn(27, 2), columns=['Depenses', 'Recettes'], index=ind)
    # d.reset_index(inplace=True, drop=False)
    # d = d.groupby(by=['mesure', 'source', 'year'], sort=False).sum()
    # print d
    # d_unstacked = d.unstack()
    # print d
    # indtemp1 = d.index.get_level_values(0)
    # indtemp2 = d.index.get_level_values(1)
    # indexi = zip(*[indtemp1, indtemp2])
    # print indexi
    # indexi_bis = []
    # for i in xrange(len(indexi)):
    #     if indexi[i] not in indexi_bis:
    #         indexi_bis.append(indexi[i])
    # indexi = indexi_bis
    # indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
    # print indexi
    # d_unstacked = d_unstacked.reindex_axis(indexi, axis=0)
    # print d_unstacked.to_string()
    # save_as_xls(d_unstacked)
    # return
    #===============================================================================

    def reshape_tables(dfs, dfs_erf):
        agg = Aggregates()  # We need this for the columns labels to work
        print 'Resetting index to avoid later trouble on manipulation'
        for d in dfs:
            d.reset_index(inplace=True)
            d.set_index('Mesure', inplace=True, drop=False)
            d.reindex_axis(labels_variables, axis=0)
            d.reset_index(inplace=True, drop=True)
            # print d.to_string()
        for d in dfs_erf:
            d.reset_index(inplace=True)
            d['Mesure'] = agg.labels['dep']
            d.set_index('index', inplace=True, drop=False)
            d.reindex_axis(agg.labels.values(), axis=0)
            d.reset_index(inplace=True, drop=True)
            # print d.to_string()

        # Concatenating the openfisca tables for the different years
        temp = pd.concat([dfs[0], dfs[1]], ignore_index=True)
        temp = pd.concat([temp, dfs[2]], ignore_index=True)
        temp = pd.concat([temp, dfs[3]], ignore_index=True)
        del temp[agg.labels['entity']], temp['index']
        gc.collect()

        print 'We split the real aggregates from the of table'
        temp2 = temp[[agg.labels['var'], agg.labels['benef_real'],
                      agg.labels['dep_real'], 'year']]
        del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
        temp['source'] = 'of'
        temp2['source'] = 'reel'
        temp2.rename(columns={agg.labels['benef_real']: agg.labels['benef'],
                              agg.labels['dep_real']: agg.labels['dep']},
                     inplace=True)
        temp = pd.concat([temp, temp2], ignore_index=True)

        print 'We add the erf data to the table'
        for df in dfs_erf:
            del df['level_0'], df['Mesure']
            df.rename(columns={'index': agg.labels['var'], 1: agg.labels['dep']},
                      inplace=True)
        temp3 = pd.concat([dfs_erf[0], dfs_erf[1]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[2]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[3]], ignore_index=True)
        temp3['source'] = 'erfs'
        gc.collect()
        temp = pd.concat([temp, temp3], ignore_index=True)
        # print temp.to_string()

        print 'Index manipulation to reshape the output'
        temp.reset_index(drop=True, inplace=True)
        # We set the new index
        # temp.set_index('Mesure', drop=True, inplace=True)
        # temp.set_index('source', drop=True, append=True, inplace=True)
        # temp.set_index('year', drop=False, append=True, inplace=True)
        temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum()
        # Tricky: the [mesure, source, year] index is unique, so sum() just returns the single value.
        # Groupby automatically removed the source, mesure... columns and added them to the index.
        assert isinstance(temp, pd.DataFrame)
        # print temp.to_string()

        # We want the years to be in columns, so we use unstack
        temp_unstacked = temp.unstack()
        # Unfortunately, unstack automatically sorts rows and columns, so we have to reindex the table:

        # Reindexing rows
        from pandas.core.index import MultiIndex
        indtemp1 = temp.index.get_level_values(0)
        indtemp2 = temp.index.get_level_values(1)
        indexi = zip(*[indtemp1, indtemp2])
        indexi_bis = []
        for i in xrange(0, len(indexi)):
            if indexi[i] not in indexi_bis:
                indexi_bis.append(indexi[i])
        indexi = indexi_bis
        del indexi_bis
        indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
        # import pdb
        # pdb.set_trace()
        temp_unstacked = temp_unstacked.reindex_axis(indexi, axis=0)  # axis=0 for rows, 1 for columns

        # Reindexing columns
        # TODO: still not working
        col_indexi = []
        for col in temp.columns.get_level_values(0).unique():
            for yr in range(2006, 2010):
                col_indexi.append((col, str(yr)))
        col_indexi = MultiIndex.from_tuples(col_indexi)
        # print col_indexi
        # print temp_unstacked.columns
        print col_indexi
        # temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis=1)

        # Our table is ready to be turned into an Excel worksheet!
        print temp_unstacked.to_string()
        temp_unstacked.fillna(0, inplace=True)
        return temp_unstacked

    dfs = []
    dfs_erf = []
    for i in range(2006, 2010):
        year = i
        yr = str(i)

        # Running a standard SurveySim to get aggregates
        simulation = SurveySimulation()
        survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
        simulation.set_config(year=yr, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()

        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()
        df = agg.aggr_frame
        df['year'] = year
        label_by_name = dict(
            (name, column.label)
            for name, column in simulation.output_table.column_by_name.iteritems()
            )
        # colonnes = simulation.output_table.table.columns
        dfs.append(df)
        variables = agg.varlist
        labels_variables = [label_by_name[variable] for variable in variables]

        del simulation, agg, df
        gc.collect()

        # Getting ERF aggregates from the ERF table
        temp = build_erf_aggregates(variables=variables, year=year)
        temp.rename(columns=label_by_name, inplace=True)
        temp = temp.T
        temp.reset_index(inplace=True)
        temp['year'] = year
        dfs_erf.append(temp)

        del temp
        gc.collect()
        print 'Out of data fetching for year ' + str(year)

    print 'Out of data fetching'
    datatest = reshape_tables(dfs, dfs_erf)
    save_as_xls(datatest, alter_method=False)
def build_aggregates3():
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" % (str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()

        import time
        deb3 = time.clock()
        simu.set_survey(num_table=3)
        simu.compute()
        fin3 = time.clock()

        print "coucou"
        col = simu.survey.column_by_name.get("so")
        print col.entity

        agg3 = Aggregates3()
        agg3.set_simulation(simu)
        agg3.compute()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            # simu.survey.table3[ent].to_csv(dir_name)
            # import pdb
            # pdb.set_trace()
            # com.convert_to_r_dataframe
            simu.output_table.table3[ent][:num_output].to_csv(dir_name)

        deb1 = time.clock()
        simu.set_survey(num_table=1)
        print "prob compute"
        simu.compute()
        fin1 = time.clock()

        dir_name = destination_dir + 'en1' + '.csv'
        print "prob output"
        simu.output_table.table[:num_output].to_csv(dir_name)

        agg = Aggregates()
        print "prob set"
        agg.set_simulation(simu)
        print "prob compute"
        agg.compute()
        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        print "prob ind"
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        del simu
        del agg
        import gc
        gc.collect()

        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
def compar_num_table():
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    filename = destination_dir + 'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
        # fname = "Agg_%s.%s" % (str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()

        import time
        deb3 = time.clock()
        # Successive test subsets; only the last assignment (None, i.e. the full
        # sample) is actually used.
        sous_ech = [6000080, 6000080, 6000195, 6000195, 6000288, 6000288,
                    6000499, 6000499, 6000531, 6000531, 6000542, 6000542]
        sous_ech = [6000191, 6000191, 6000531, 6000614, 6000195, 6000195,
                    6000499, 6000499, 6000531, 6000614, 6000531, 6000614,
                    6000531, 6000531, 6000195, 6000195, 6000288, 6000288,
                    6000499, 6000499, 6000531, 6000542, 6000542, 6000614,
                    6000191]
        # al
        sous_ech = [6000122, 6000865, 6001256]
        # typ_men
        sous_ech = [6006630, 6006753, 6008508]
        # foy
        sous_ech = [6036028, 6028397, 6019248]
        sous_ech = None
        simu.set_survey(num_table=3, subset=sous_ech)
        simu.compute()

        agg3 = Aggregates()
        for ent in ['ind', 'men', 'foy', 'fam']:
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            tab = tab.rename(columns=renam)
        agg3.set_simulation(simu)
        agg3.compute()
        fin3 = time.clock()

        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        # Export to csv to run the comparison in R
        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            if ent == 'ind':
                ident = ["idmen", "quimen", "idfam", "quifam", "idfoy", "quifoy"]
            else:
                ident = ["idmen", "idfam", "idfoy"]
            for nom in ident:
                renam[nom + '_' + ent] = nom
            tab = tab.rename(columns=renam)
            order_var = ident + list(tab.columns - ident)
            tab.sort(['idmen', 'idfam', 'idfoy']).ix[:num_output, order_var].to_csv(dir_name)

        deb1 = time.clock()
        simu.set_survey(num_table=1, subset=sous_ech)
        simu.compute()
        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()
        fin1 = time.clock()

        # Export to csv to run the comparison in R
        dir_name = destination_dir + 'en1' + '.csv'
        tab = simu.output_table.table
        tab.drop(['idfam_fam', 'idfam_foy', 'idfam_men', 'idfoy_fam',
                  'idfoy_foy', 'idfoy_men', 'idmen_men', 'idmen_fam',
                  'idmen_foy', 'wprm_foy', 'wprm_fam'],
                 axis=1, inplace=True)
        renam = {}
        ent = 'ind'
        renam['wprm_' + ent] = 'wprm'
        ident = ["noi", "idmen", "quimen", "idfam", "quifam", "idfoy", "quifoy"]
        for nom in ident:
            renam[nom + '_' + ent] = nom
        tab = tab.rename(columns=renam)
        order_var = ident + list(tab.columns - ident)
        tab.sort(['idmen', 'idfam', 'idfoy']).ix[:num_output, order_var].to_csv(dir_name)

        # if writer is None:
        #     writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        del simu
        del agg
        import gc
        gc.collect()

        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
def _reshape_tables(self):
    """
    TODO: _reshape_tables should be cleaned!!!
    """
    dfs = self.aggregates_of_dataframe
    dfs_erf = self.aggregates_erfs_dataframe
    labels_variables = self.labels_variables

    agg = Aggregates()  # We need this for the columns labels to work
    print 'Resetting index to avoid later trouble on manipulation'
    for d in dfs:
        d.reset_index(inplace=True)
        d.set_index('Mesure', inplace=True, drop=False)
        d.reindex_axis(labels_variables, axis=0)
        d.reset_index(inplace=True, drop=True)
        # print d.to_string()
    for d in dfs_erf:
        d.reset_index(inplace=True)
        d['Mesure'] = agg.labels['dep']
        d.set_index('index', inplace=True, drop=False)
        d.reindex_axis(agg.labels.values(), axis=0)
        d.reset_index(inplace=True, drop=True)
        # print d.to_string()

    # Concatenating the openfisca tables for the different years
    temp = dfs[0]
    if len(dfs) != 1:
        for d in dfs[1:]:
            temp = pd.concat([temp, d], ignore_index=True)
    del temp[agg.labels['entity']], temp['index']
    gc.collect()

    print 'We split the real aggregates from the of table'
    temp2 = temp[[agg.labels['var'], agg.labels['benef_real'],
                  agg.labels['dep_real'], 'year']]
    del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
    sauvegarde = temp.columns.get_level_values(0).unique()
    temp['source'] = 'of'
    temp2['source'] = 'reel'
    temp2.rename(columns={agg.labels['benef_real']: agg.labels['benef'],
                          agg.labels['dep_real']: agg.labels['dep']},
                 inplace=True)
    temp = pd.concat([temp, temp2], ignore_index=True)

    print 'We add the erf data to the table'
    for df in dfs_erf:
        del df['level_0'], df['Mesure']
        df.rename(columns={'index': agg.labels['var'], 1: agg.labels['dep']},
                  inplace=True)
    temp3 = dfs_erf[0]
    if len(dfs) != 1:
        for d3 in dfs_erf[1:]:
            temp3 = pd.concat([temp3, d3], ignore_index=True)
    temp3['source'] = 'erfs'
    gc.collect()
    temp = pd.concat([temp, temp3], ignore_index=True)
    # print temp.to_string()

    print 'Index manipulation to reshape the output'
    temp.reset_index(drop=True, inplace=True)
    # We set the new index
    # temp.set_index('Mesure', drop=True, inplace=True)
    # temp.set_index('source', drop=True, append=True, inplace=True)
    # temp.set_index('year', drop=False, append=True, inplace=True)
    temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum()
    # Tricky: the [mesure, source, year] index is unique, so sum() just returns the single value.
    # Groupby automatically removed the source, mesure... columns and added them to the index.
    assert isinstance(temp, pd.DataFrame)
    # print temp.to_string()

    # We want the years to be in columns, so we use unstack
    temp_unstacked = temp.unstack()
    # Unfortunately, unstack automatically sorts rows and columns, so we have to reindex the table:

    # Reindexing rows
    from pandas.core.index import MultiIndex
    indtemp1 = temp.index.get_level_values(0)
    indtemp2 = temp.index.get_level_values(1)
    indexi = zip(*[indtemp1, indtemp2])
    indexi_bis = []
    for i in xrange(0, len(indexi)):
        if indexi[i] not in indexi_bis:
            indexi_bis.append(indexi[i])
    indexi = indexi_bis
    del indexi_bis
    indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
    # import pdb
    # pdb.set_trace()
    temp_unstacked = temp_unstacked.reindex_axis(indexi, axis=0)  # axis=0 for rows, 1 for columns

    # Reindexing columns
    # TODO: still not working
    col_indexi = []
    print temp.columns
    for i in xrange(len(sauvegarde)):
        # for col in temp.columns.get_level_values(0).unique():
        col = sauvegarde[i]
        for yr in self.years:
            col_indexi.append((col, yr))
    col_indexi = MultiIndex.from_tuples(col_indexi)
    # print col_indexi
    # print temp_unstacked.columns
    print col_indexi
    print temp_unstacked.columns
    temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis=1)

    # Our table is ready to be turned into an Excel worksheet!
    # print temp_unstacked.to_string()
    del temp_unstacked['Mesure'], temp_unstacked['year']
    temp_unstacked.fillna(0, inplace=True)
    return temp_unstacked
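# Illustrative sketch (not part of the original module): the reshaping above
# boils down to a groupby-sum on a (Mesure, source, year) key, an unstack() to
# move the years into columns, and a manual reindex to undo the sorting that
# unstack() applies. The toy DataFrame and its labels below are made up for
# the example.
def _reshape_sketch():
    import pandas as pd
    toy = pd.DataFrame({
        'Mesure': ['af', 'af', 'cotsoc', 'cotsoc'],
        'source': ['of', 'reel', 'of', 'reel'],
        'year': [2006, 2006, 2006, 2006],
        'Depenses': [1.0, 1.1, 2.0, 2.2],
        })
    # One row per (Mesure, source, year) key, so sum() only reshapes, it does not aggregate.
    grouped = toy.groupby(['Mesure', 'source', 'year'], sort=False).sum()
    # Move the years into columns.
    unstacked = grouped.unstack('year')
    # unstack() sorts the index; rebuild the original (Mesure, source) order by hand.
    seen = []
    for key in zip(grouped.index.get_level_values(0), grouped.index.get_level_values(1)):
        if key not in seen:
            seen.append(key)
    row_index = pd.MultiIndex.from_tuples(seen, names=['Mesure', 'source'])
    return unstacked.reindex(row_index)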