def test_vat_file( filename , code_column_name , fraction_that_should_be_non_null ):
    """Sanity-check one VAT stage file.

    Verifies the column set, dtypes, value ranges, and that at least
    `fraction_that_should_be_non_null` (minus `tolerance`) of the "vat"
    values are present. Relies on module-level `pd`, `oio`, `cl`,
    `unique` and `tolerance`.
    """

    def drop_nulls(column):
        # Keep only the non-missing entries of a Series.
        return column[pd.notnull(column)]

    table = oio.readStage(cl.subsample,
                          filename + "." + cl.strategy_suffix)
    assert unique(table.columns)

    # The special motorcycle tax, abusively lumped into the VAT table,
    # means the max "vat" is 0.27 rather than 0.19.
    assert table["vat"].min() >= 0
    assert table["vat"].max() < 0.3

    assert set(table.columns) == {code_column_name, 'vat', 'vat frac'}
    for col in table.columns:
        # The code column holds integer ids; everything else is a rate.
        expected_dtype = "int64" if col == code_column_name else "float64"
        assert table[col].dtype == expected_dtype

    # The "vat" and "vat frac" columns might have a few missing values.
    # The others should have none.
    vat_present = len(drop_nulls(table["vat"]))
    assert (vat_present / len(table)
            > fraction_that_should_be_non_null - tolerance)
    assert vat_present == len(drop_nulls(table["vat frac"]))

    table = table.drop(columns=["vat", "vat frac"])
    for col in table.columns:
        assert len(table[pd.notnull(table[col])]) == len(table)
# Aggregate person-level data to the household level:
# summing "one" (a constant 1 per person) counts household members,
# and summing "used savings" totals that flag per household.
if True:
    import pandas as pd
    import python.build.output_io as oio
    import python.common.common as com

ppl = oio.readStage(com.subsample, 'people_1')
dp = ppl[["household", "used savings"]].copy()
dp["one"] = 1  # so that sum("one") == number of people in the household
dh = (dp.groupby("household").agg("sum"))
# Load household- and person-level stages and derive a few report columns.
if True:
    import os
    import pandas as pd
    import sys
    from typing import List, Tuple
    #
    import python.build.output_io as oio
    import python.common.common as com
    import python.common.describe as desc
    import python.common.misc as c
    import python.draw.util as draw
    import python.report.defs as defs

if True:  # load data
    households = oio.readStage(
        com.subsample,
        "households_2_purchases." + com.strategy_year_suffix)
    earners = oio.readStage(
        com.subsample,
        "people_4_post_households." + com.strategy_year_suffix)

if True:  # Create a few columns missing in the input data.
    # TODO ? Move upstream.
    for df in [households, earners]:
        # Combined labor + cesantia income.
        df["income, labor + cesantia"] = (df["income, labor"]
                                          + df["income, cesantia"])
        # Boolean flag: income percentile falls in [90, 97].
        df["income-percentile-in[90,97]"] = ((df["income-percentile"] >= 90)
                                             & (df["income-percentile"] <= 97))
        # Boolean flag: income below the minimum wage constant.
        df["income < min wage"] = (df["income"] < c.min_wage)
# Merge the building data innto the person-level data. if True: import sys import pandas as pd # import python.build.output_io as oio import python.common.util as util import python.common.common as common if True: # merge people, buildings buildings = oio.readStage( 1 # PITFALL: For buildings, we always use the full sample. , 'buildings', dtype={"estrato": 'float64'}) people = oio.readStage(common.subsample, 'people_1') people = pd.merge(people, buildings, how="left", on="household") if True: # make some new variables people["age-decile"] = pd.qcut(people["age"], 10, labels=False, duplicates='drop') people["income-decile"] = ( # PITFALL: there's a different such variable at the household level util.noisyQuantile(10, 0, 1, people["income"])) people["female head"] = people["female"] * (people["household-member"] == 1) # PITFALL: As noted earlier, the buildings data is always drawn from the full
# Draw CDF plots of household-level variables into the VAT pics folder.
# NOTE(review): `plt` is used below but not imported in this chunk —
# presumably `matplotlib.pyplot` is imported earlier in the file; confirm.
from matplotlib.ticker import EngFormatter

if True:  # more imports
    import sys
    import os
    import numpy as np
    from functools import reduce
    #
    import python.common.util as util
    import python.draw.util as draw
    import python.build.output_io as oio
    import python.build.common as c

# Output folder is keyed by subsample size and strategy suffix.
vat_pics_dir = ("output/vat/pics/recip-" + str(c.subsample) + "/"
                + c.strategy_suffix + "/")
if not os.path.exists(vat_pics_dir):
    os.makedirs(vat_pics_dir)

households = oio.readStage(c.subsample, 'households.' + c.strategy_suffix)
households_decile_summary = oio.readStage(
    c.subsample, 'households_decile_summary.' + c.strategy_suffix)

if True:  # single series
    plt.close()
    draw.single_cdf(households["members"], "Household size", xmax=10)
    draw.savefig(vat_pics_dir + "households", "size")
    plt.close()
    draw.single_cdf(households["transactions"],
                    "Transactions per month", xmax=150)
    draw.savefig(vat_pics_dir + "households", "transactions-per-month")
    plt.close()
# This creates a single key from a collection of keys, # so that a dataset can be compared to a previous one using csv-diff, e.g.: # csv-diff old.csv new.csv --key="id" if True: import sys import pandas as pd # import python.build.output_io as oio import python.common.common as cl p4 = oio.readStage(cl.subsample, "people_3_income_taxish." + cl.strategy_year_suffix) p4["id"] = (p4["household"].astype(str) + ":" + p4["household-member"].astype(str)) p4.to_csv("old.csv")
# Load the inputs and output of the households_2_purchases stage,
# to compare their columns.
if True:
    import pandas as pd
    import numpy as np
    #
    import python.build.classes as cl
    import python.build.output_io as oio
    import python.common.common as com
    import python.common.util as util

if True:
    # Only the header row — used for the input's column names.
    hh_cols = oio.readStage(
        com.subsample,
        "households_1_agg_plus." + com.strategy_year_suffix,
        nrows = 1 )
    # Only the household column — used for the input's row count.
    hh_rows = oio.readStage(
        com.subsample,
        "households_1_agg_plus." + com.strategy_year_suffix,
        usecols = ["household"] )
    pur = oio.readStage(
        com.subsample,
        "purchase_sums." + com.strategy_suffix )
    merge = oio.readStage(
        com.subsample,
        "households_2_purchases." + com.strategy_year_suffix )

if True:
    # See people_2_buildings_test for how to use these definitions.
    assert util.unique( merge.columns )
    # Columns the merge stage is expected to have added.
    new_cols = [ "vat / purchase value",
                 "vat / income",
                 "purchase value / income" ]
# enable the previous line if calling from the (non-gui) shell import matplotlib.pyplot as plt from matplotlib.ticker import EngFormatter if True: # more imports import sys import os import numpy as np # import python.build.output_io as oio import python.draw.util as draw import python.build.common as c vat_pics_dir = "output/vat/pics/recip-" + str(c.subsample) + "/" + c.strategy_suffix + "/" if not os.path.exists(vat_pics_dir): os.makedirs(vat_pics_dir) purchases = oio.readStage( c.subsample, 'purchases_2_vat.' + c.strategy_suffix ) if True: # purchase quantity, logx and linear plt.close() draw.single_cdf( purchases["quantity"], "CDF of quantity per purchase", xmin = 1, xmax = 1e3) plt.gca().xaxis.set_major_formatter(EngFormatter(places=2)) draw.savefig( vat_pics_dir + "purchases" , "quantity" ) plt.close() draw.single_cdf( purchases["quantity"], "CDF of quantity per purchase", xmin = 1, logx = True) draw.savefig( vat_pics_dir + "purchases/logx" , "quantity" )
# people["dependent"] # r2018 . income_taxes( ppl ) # # # ### how to test # Restrict the file to dependents. # Mark everyone as having a dependent. # Compute everyone's taxes. # Restrict to people with "tax, income" > 0. # Is the set empty? if True: import pandas as pd # import python.common.common as com import python.build.output_io as oio import python.regime.r2018 as reg import python.build.ss_functions as ss ppl = oio.readStage(com.subsample, "people_3_purchases." + com.strategy_suffix) if True: ppl = ppl[ppl["dependent"]] ppl = ss.mk_ss_contribs(ppl) ppl["claims dependent (labor income tax)"] = False ppl = reg.income_taxes(ppl) len(ppl[ppl["tax, income"] > 0]) rich_deps = ppl[ppl["tax, income"] > 0].copy() rich_deps["tax, income"].describe()
# Merge household columns into the person data and keep the earners.
if True:
    import pandas as pd
    import numpy as np
    #
    import python.common.util as util
    import python.build.output_io as oio
    import python.common.common as com
    import python.build.people_4_post_households_defs as defs

if True:  # input
    hs = oio.readStage(com.subsample,
                       "households_2_purchases." + com.strategy_year_suffix)
    ps = oio.readStage(com.subsample,
                       'people_3_income_taxish.' + com.strategy_year_suffix)

if True:  # Prepare to merge.
    # Disambiguate from the person-level "income" column.
    hs = hs.rename(columns={"income": "income, household"})

if True:  # Merge people and households.
    m = pd.merge(left=ps,
                 right=hs[defs.columns_to_pull_from_hs],
                 on="household")
    # An "earner" is an adult in the labor force, or anyone with income.
    earners = m[((m["in labor force"] == 1) & (m["age"] >= 18))
                | (m["income"] > 0)]
    del (m)  # free the full merge

if True:  # Make new variables, esp. create person-level purchase-like
    earners["share"] = np.where(
        # The fraction of purchaselike variables
        # attributed to this household adult.
        earners["income, household"] <= 0,  # the condition
# Load person-, household-, and purchase-sum-level stages and translate
# the "edu" codes into labels.
#
# FIX: `python.common.misc` and `python.common.common` were both imported
# under the alias `c`, so the second import silently clobbered the first
# and `python.common.misc` was unreachable. The misc import now gets its
# own alias; `c` still names `python.common.common`, which is what every
# existing `c.` reference below already resolved to.
import sys
import pandas as pd
import python.common.util as util
import python.build.output_io as oio
from python.build.people.files import edu_key
import python.common.misc as misc
import python.common.common as c

people = oio.readStage(c.subsample, "people_3_purchases")
households = oio.readStage(c.subsample, "households")
purchase_sums = oio.readStage(c.subsample, "purchase_sums")

if False:  # disabled alternative: encode "edu" as an ordered categorical
    people["edu"] = pd.Categorical(people["edu"],
                                   categories=list(edu_key.values()),
                                   ordered=True)

# NOTE(review): presumably maps "edu" codes onto edu_key's labels —
# confirm against python.common.util.interpretCategorical.
people["edu"] = util.interpretCategorical(people["edu"], edu_key.values())
# Apply a list of Correction objects to the raw purchases data.
if True:
    import numpy as np
    from itertools import chain
    #
    from python.build.classes import Correction
    import python.build.output_io as oio
    import python.build.purchases.correct_defs as defs
    import python.common.common as cl
    import python.common.misc as com
    #
    # input files
    import python.build.purchases.nice_purchases as nice_purchases
    import python.build.purchases.articulos as articulos
    import python.build.purchases.capitulo_c as capitulo_c

purchases = oio.readStage(cl.subsample, 'purchases_0')

for c in (
        # PITFALL: Any correction reliant on a column's being a number
        # cannot be trusted to work here. Put it later in the program,
        # after running `all_columns_to_numbers`.
        [ Correction.Replace_Substring_In_Column("quantity", ",", "."),
          Correction.Replace_Missing_Values("quantity", 1),
          Correction.Replace_Missing_Values("per month", 1),
          Correction.Change_Column_Type("coicop", str),
          # An "inv" substring in coicop marks the value as invalid.
          Correction.Replace_Entirely_If_Substring_Is_In_Column(
              "coicop", "inv", np.nan) ]
        + list(
            chain.from_iterable([
                # chain.from_iterable concatenates its argument's members
# In a previous incarnation of tax.co, # the ORDEN variable was assumed to mean the same thing in the purchase data that it means in the person data: # a unique-within-household identifier of persons. # This code explores the effect that has on estimated household spending. if True: import numpy as np import pandas as pd # import python.build.classes as cla import python.build.purchases.legends as legends import python.build.output_io as oio import python.common.common as com import python.common.util as util pur = oio.readStage( # the last purchases-level data set com.subsample, "purchases_2_vat." + com.strategy_suffix) ppl = oio.readStage( # the first person-level data set com.subsample, 'people_1', usecols=["household", "household-member"]) hh = (ppl.groupby("household").agg({ "household-member": "max" }).reset_index().rename(columns={"household-member": "max member"})) pur["n purchases"] = 1 hh_pur = (pur.groupby("household").agg({ "household-member": "max", "n purchases": "sum" }).reset_index().rename(columns={"household-member": "max orden"}))
    # NOTE(review): this loop appears to be the tail of a test function whose
    # `def` lies above this chunk; `bs` and `cla` come from that context.
    # Each (column, test-object) pair is checked against the buildings data.
    for (c, t) in [("recently bought this house", cla.InSet({True, False})),
                   ("recently bought this house", cla.CoversRange(0, 1)),
                   ("recently bought this house", cla.MeanBounds(0, 0.01)),
                   ("recently bought this house", cla.MissingAtMost(0)),
                   ("estrato", cla.InRange(0, 6)),
                   ("estrato", cla.CoversRange(0, 3)),
                   ("estrato", cla.MeanBounds(1.5, 2.5)),
                   ("estrato", cla.MissingAtMost(0.02))]:
        assert t.test(bs[c])


if True:  # run tests
    log = "starting\n"
    bs = oio.readStage(
        1  # PITFALL: For buildings, we always use the full sample.
        , 'buildings',
        dtype={"estrato": 'float64'}
        # If subsample is so small that there are no missing values,
        # "estrato" will by default be read as "int64".
    )
    test_types(bs)
    test_nullity(bs)
    test_ranges(bs)
    assert (unique(bs.columns))
    for ss in com.valid_subsamples:
        # PITFALL: Looping over subsample sizes because this program
        # always uses the full sample.
        # If it works, it works for all subsamples.
        oio.test_write(ss, "build_buildings", log)
# Build the people_3_income_taxish stage: social-security contributions,
# the has-dependent flag, and income taxes under the configured regime.
import sys
import pandas as pd
#
import python.build.ss_functions as ss
import python.build.output_io as oio
import python.common.util as util
import python.common.common as com
#
import python.build.people_3_income_taxish_functions as f4

# Choose the tax-regime implementation matching the configured year.
if com.regime_year == 2016:
    import python.regime.r2016 as regime
elif com.regime_year == 2018:
    import python.regime.r2018 as regime
else:
    import python.regime.r2019 as regime

ppl = oio.readStage( com.subsample , "people_2_buildings" )
ppl = ss.mk_ss_contribs(ppl)
ppl = f4.insert_has_dependent_column(ppl)
ppl = regime.income_taxes( ppl )

oio.saveStage( com.subsample , ppl ,
               'people_3_income_taxish.' + com.strategy_year_suffix )
# But it's surprising, because for subsample = 10, # the reality is much less than the expectation. assert (set(df.columns) == set(Purchase_2_Columns_missing.all_columns())) # coicop and 25-broad-categs are each individually missing substantially, # but exactly one of them is always present assert len(df[(~pd.isnull(df["coicop"])) & (~pd.isnull(df["25-broad-categs"]))]) == 0 assert len(df[(pd.isnull(df["coicop"])) | (pd.isnull(df["25-broad-categs"]))]) == len(df) for c in Purchase_2_Columns_missing.never: assert (len(df[pd.isnull(df[c])]) == 0) for c in Purchase_2_Columns_missing.slightly: assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.03) for c in Purchase_2_Columns_missing.very: assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.25) return log if True: # IO log = "starting\n" ps = oio.readStage(com.subsample, "purchases_2_vat." + com.strategy_suffix) log += test_ranges(ps) log += test_output(ps) oio.test_write(com.subsample, "build_purchases_2_vat", log)
# TODO: divide into sub-modules if True: import numpy as np import pandas as pd import re as regex # import python.build.classes as cla import python.build.output_io as oio import python.build.people.main_defs as defs from python.build.people.empleados import generar_empleados import python.build.people.files as files import python.common.common as cl import python.common.misc as c ppl = oio.readStage(cl.subsample, 'people_0') ppl = ppl.drop( # drop non-members of household ppl[ppl["relationship"].isin([6, 7, 8])].index) if True: # make independiente a 0 or a 1 ppl["independiente"] = ppl["independiente"].apply(lambda x: 1 if x in [4, 5] else 0) if True: # remap some boolean integers for cn in ( ["female"] + # originally 1=male, 2=female [included for (_, included) in files.inclusion_pairs] # Originally, 1 = included, 2 = omitted. # Now 0 = included, 1 = omitted. ):
# Summarize spending and VAT rates per coicop code and per VAT rate.
import pandas as pd
import numpy as np
import python.build.output_io as oio

subsample = 10  # hard-coded: this exploration uses the 1-in-10 subsample
purchases = oio.readStage( subsample, "purchases_2_vat" )
purchases["purchases"] = 1  # so that sum("purchases") counts rows

# When I check purchases[ purchases["coicop"] == x ] for these x,
# the results are consistent with the coicop-vat bridge.
# 11110103, 11110104, 11110105
# 1119807, 1119808, 1119809
# 1180103, 1180201, 1180301

## vat per coicop
# Total value per coicop, and the (constant-within-coicop) minimum VAT rate.
p_sum = purchases.groupby( 'coicop' )[ "value" ] . agg( 'sum' )
p_first = purchases.groupby( 'coicop' )[ "vat, min" ] . agg( 'mean' )
p = pd.concat( [p_sum, p_first] , axis = 1 )
oio.saveStage( subsample, p, "vat-and-spending-per-coicop" , index = True )

## vat per rate
q_sum = purchases.groupby( 'vat, min' )[ "value" ] . agg( 'sum' )
    # NOTE(review): these checks appear to be the tail of a test function
    # whose `def` lies above this chunk (hence the `return log`);
    # `df` and `log` come from that context.
    log += "Very few missing quantity values."
    assert ((1e-5) > (len(df[pd.isnull(df["quantity"])]) / len(df)))
    log += "Very few negative quantity values."
    assert ((1e-5) > (len(df[df["quantity"] <= 0]) / len(df)))
    log += "Negative quantity purchases are for very little money."
    assert (df[df["quantity"] < 0]["value"] < 1e4).all()
    log += "Very few purchases with a frequency of \"never\"."
    # NOTE(review): "per month" > 10 presumably encodes the survey's
    # "never" frequency code — confirm against the purchases legend.
    assert ((1e-5) > (len(df[df["per month"] > 10]) / len(df)))
    log += "Those few frequency=\"never\" purchases are for very little money."
    assert (df[df["per month"] > 10]["value"] < 1e4).all()
    return log


if True:  # run the tests
    log = "starting\n"
    # unit tests
    log += test_drop_if_coicop_or_value_invalid()
    log += test_drop_absurdly_big_expenditures()
    # integration test
    df = oio.readStage(com.subsample, 'purchases_1')
    log += test_output(df)
    oio.test_write(com.subsample, "purchases_correct", log)
("has-child", cla.MeanBounds(0.4, 0.8)), ("has-elderly", cla.MeanBounds(0.1, 0.3)), ("used savings", cla.MeanBounds(0.03, 0.12)), # PITFALL: # Bigger than the mean from the people data, # because it varies within household. ("recently bought this house", cla.MeanBounds(0, 0.01)), ("female head", cla.MeanBounds(0.25, 0.55)), ("seguro de riesgos laborales", cla.MeanBounds(0.3, 0.6)) ]: assert test.test(hh[c]) if True: # IO log = "starting\n" # hh = oio.readStage(com.subsample, "households_1_agg_plus." + com.strategy_year_suffix) ppl = oio.readStage(com.subsample, "people_3_income_taxish." + com.strategy_year_suffix) hh["edu-max"] = util.interpretCategorical(hh["edu-max"], edu_key.values()) ppl["edu"] = util.interpretCategorical(ppl["edu"], edu_key.values()) test_const_within_group( # TODO ? move this test to the tests of person data gs=["household"], cs=defs.cols_const_within_hh, d=hh) test_indices(hh=hh, ppl=ppl) test_income_ranks(hh=hh, ppl=ppl) test_sums(hh=hh, ppl=ppl) test_bools(hh=hh, ppl=ppl) com_tests.test_quantiles(df=hh)
import sys import os from functools import reduce # import pandas as pd import numpy as np # import python.common.util as util import python.draw.util as draw import python.build.output_io as oio import python.build.common as c vat_pics_dir = "output/vat/pics/recip-" + str( c.subsample) + "/" + c.strategy_suffix + "/" if not os.path.exists(vat_pics_dir): os.makedirs(vat_pics_dir) people = oio.readStage(c.subsample, 'people_3_purchases.' + c.strategy_suffix) edu_key = { 1: "Ninguno", 2: "Preescolar", 3: "Basica\n Primaria", 4: "Basica\n Secundaria", 5: "Media", 6: "Superior o\n Universitaria", 9: "No sabe,\n no informa" } people["edu"] = pd.Categorical(people["edu"], categories=list(edu_key.values()), ordered=True) if True: # single series
# The only thing to check is the increase in the set of columns. # (Could check length, but a left merge cannot change that.) if True: import sys import pandas as pd # import python.build.classes as cl import python.build.output_io as oio import python.common.common as com import python.common.misc as misc import python.common.util as util in_cols = oio.readStage(com.subsample, "people_1", nrows=1) in_rows = oio.readStage(com.subsample, "people_1", usecols=["household"]) out = oio.readStage(com.subsample, 'people_2_buildings') cols1 = set(in_cols.columns) cols2 = set(out.columns) new_cols = { "estrato", 'recently bought this house', "region-1", "region-2", "age-decile", "income-decile", "IT", "IC", "ICM", "ICMD", "GT", "GC", "GCM", "female head" } assert util.unique(out.columns) assert util.unique(new_cols) assert set.intersection(cols1, new_cols) == set() assert set.union(cols1, new_cols) == cols2 assert set.difference(cols2, cols1) == new_cols
# Prepare household data for reporting: derive flags, collapse the
# social-security tax components into one column.
import pandas as pd
from itertools import chain
#
import python.build.output_io as oio
import python.common.common as cl
import python.common.misc as c
import python.common.describe as desc
import python.draw.util as draw

# Choose the tax-regime implementation matching the configured year.
if cl.regime_year == 2016:
    import python.regime.r2016 as regime
else:
    import python.regime.r2018 as regime

if True:  # Get, prepare the data
    hh = oio.readStage(cl.subsample,
                       "households_2_purchases." + cl.strategy_year_suffix)
    # Boolean flag: income percentile falls in [90, 97].
    hh["income-percentile-in[90,97]"] = ((hh["income-percentile"] >= 90)
                                         & (hh["income-percentile"] <= 97))
    hh["income < min wage"] = (hh["income"] < c.min_wage)

if True:  # Sum the ss tax components, keep sum, drop components.
    ss_tax_components = [
        "tax, ss, pension", "tax, ss, pension, employer",
        "tax, ss, salud", "tax, ss, salud, employer",
        "tax, ss, solidaridad", "tax, ss, parafiscales",
        "tax, ss, cajas de compensacion"
    ]
    hh["tax, ss"] = hh[ss_tax_components].sum(axis="columns")
    hh = hh.drop(columns=ss_tax_components)

if True:  # Narrow the set of columns
    basicVars = ["household", "weight"]
# TODO : automate these tests. import numpy as np import pandas as pd import python.build.output_io as oio import python.common.misc as c import python.common.common as cl people = oio.readStage( cl.subsample , 'people_3_income_taxish.' + cl.strategy_suffix ) ppl = people.rename( columns = { "relative, child" : "child" , "relative, non-child" : "rel" , "dependent" : "dep" , "disabled" : "disab" , "income, labor" : "labor" } ) # These should all have a mean of 1 ppl["dep"][ (ppl["student"] == 1) & (ppl["age"] < 24) ].mean() ppl["dep"][ (ppl["child"] == 1) & (ppl["age"] < 19) ].mean() ppl["dep"][ ((ppl["rel"]==1) & (ppl["labor"] < (260*c.uvt) ) ) ].mean() ppl["dep"][
# Beyond the shape of the data, there's nothing to test. if True: import pandas as pd # import python.build.classes as cla import python.build.output_io as oio import python.common.common as com from python.common.misc import num_households import python.common.util as util sums = oio.readStage( com.subsample, "purchase_sums." + com.strategy_suffix ) assert util.unique( sums.columns ) assert ( set( sums.columns ) == { "household", "value, tax, purchaselike non-VAT", "value, tax, predial", "value, tax, purchaselike non-predial non-VAT", "transactions", "value, non-purchase", "value, purchase", "value, spending", "value, consumption", "vat paid" } ) if com.subsample < 11: # The data is too sparse to test # the smaller samples this way
# Load the purchases stage and the VAT bridge, down-cast to save memory.
import python.build.classes as cla
import python.build.purchases.legends as legends
import python.build.output_io as oio
import python.common.common as c
import python.common.util as util

if True:  # input files
    purchases = oio.readStage(
        # Data is too big unless we down-cast the numbers
        # from 64-bit to 32-bit.
        c.subsample,
        "purchases_1",
        dtype={
            "25-broad-categs": "float32",
            "coicop": "float32",
            "per month": "float32",
            "household": "int32",
            "household-member": "int32",
            "is-purchase": "float32",
            "quantity": "float32",
            "value": "float32",
            "weight": "float32",
            "where-got": "float32"
        })
    # The 25-broad-categs -> VAT bridge.
    vat_cap_c = (oio.readStage(c.subsample,
                               "vat_cap_c_brief." + c.strategy_suffix,
                               dtype={
                                   "25-broad-categs": "int32",
                                   "vat": "float32",
                                   "vat frac": "float32"
# Incorporate sums of purchases into households. # Compute some more variables. if True: import pandas as pd import numpy as np # import python.common.util as util import python.build.output_io as oio import python.common.common as com if True: # merge purchase data into person data # PITFALL: The unit of observation in all these data sets is a household. hh = oio.readStage( com.subsample, "households_1_agg_plus." + com.strategy_year_suffix ) pur = oio.readStage( com.subsample, "purchase_sums." + com.strategy_suffix ) merge = pd.merge( hh, pur, how = "left", on=["household"] ) if True: # In San Andrés there is no VAT. merge.loc[ merge["region-1"] == "SAN ANDRÉS", "vat paid" ] = 0 if True: # create a few more variables merge["vat / purchase value" ] = ( merge["vat paid"] / merge["value, purchase" ] ) merge["vat / income"] = (