def test_output(df):
    """Sanity-check the purchase data frame and return a log of the checks.

    Each check appends a line to the log *before* asserting, so the log
    names the check that was in progress when an assertion failed.

    Args:
        df: the purchase data; must have exactly the columns named in
            `spec` below.

    Returns:
        A newline-separated string naming every check that was run.

    Raises:
        AssertionError: if any check fails.
    """
    log = "test_output\n"

    # presumably verifies that no column name is repeated -- confirm
    # against the definition of `unique`.
    assert unique(df.columns)

    # TODO | BUG? Why is the conditional tolerance below necessary?
    # That is, why, in the special case of subsample = 10, is the size of
    # the purchase data so different from what you'd expect?
    # This isn't necessarily wrong, since the data is subsampled by
    # households, and households can make different numbers of purchases.
    # That's why `tol_frac` needs to be substantial in both cases.
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.
    assert near(len(df),
                num_purchases_surviving / com.subsample,
                tol_frac=(1 / 20 if com.subsample != 10 else 0.6))

    spec = {
        "where-got": cla.InRange(1, 26),
        "weight": cla.InRange(0, 1e4),
        "value": cla.InRange(0, 1e9),
        "quantity": cla.InRange(0, 1e8),
        "is-purchase": cla.InRange(0, 1),
        "household": cla.InRange(1, 1e7),
        "per month": cla.InRange(1, 11),
        "coicop": cla.InRange(1, 1e8),
        "25-broad-categs": cla.InRange(1, 25),
    }
    for k in spec:
        log += (" " + k + "\n")
        assert spec[k].test(df[k])

    # Every log entry below now ends in a newline; previously these
    # messages were concatenated with no separator between them.
    log += "Specs cover all column names.\n"
    assert set(df.columns) == set(spec.keys())

    n_rows = len(df)

    log += "Very few missing quantity values.\n"
    assert len(df[pd.isnull(df["quantity"])]) / n_rows < 1e-5

    # NOTE: this counts non-positive quantities (<= 0), whereas the check
    # after it only looks at strictly negative ones.
    log += "Very few negative quantity values.\n"
    assert len(df[df["quantity"] <= 0]) / n_rows < 1e-5

    log += "Negative quantity purchases are for very little money.\n"
    assert (df[df["quantity"] < 0]["value"] < 1e4).all()

    log += "Very few purchases with a frequency of \"never\".\n"
    assert len(df[df["per month"] > 10]) / n_rows < 1e-5

    log += "Those few frequency=\"never\" purchases are for very little money.\n"
    assert (df[df["per month"] > 10]["value"] < 1e4).all()

    return log
def test_ranges(bs: pd.DataFrame) -> None:
    """Assert that two columns of `bs` satisfy their expected properties.

    Args:
        bs: a data frame with the columns "recently bought this house"
            and "estrato".

    Raises:
        AssertionError: if any property test fails on its column.
    """
    # Column name -> the properties that column must satisfy, in test order.
    expectations = {
        "recently bought this house": [
            cla.InSet({True, False}),
            cla.CoversRange(0, 1),
            cla.MeanBounds(0, 0.01),
            cla.MissingAtMost(0),
        ],
        "estrato": [
            cla.InRange(0, 6),
            cla.CoversRange(0, 3),
            cla.MeanBounds(1.5, 2.5),
            cla.MissingAtMost(0.02),
        ],
    }
    for column, properties in expectations.items():
        for prop in properties:
            assert prop.test(bs[column])
def test_Property_subclasses():
    """Exercise the `test` method of several `cla` property classes.

    Expectations encoded here:
      * `MissingAtMost(0.5)` accepts a series with one of three values
        missing and rejects one with two of three missing.
      * `InRange(0, 1)` and `InSet({0, 0.5, 1})` accept 0, 0.5, 1 and NaN,
        and reject -1 and 2.
      * `CoversRange(0, 10)` accepts [0, 10] and rejects [1, 9].

    Raises:
        AssertionError: if any property class misbehaves.
    """
    assert cla.MissingAtMost(0.5).test(pd.Series([1, np.nan, 3]))
    assert not cla.MissingAtMost(0.5).test(pd.Series([1, np.nan, np.nan]))

    cases = [
        (-1, False),
        (0, True),
        (0.5, True),
        (1, True),
        (2, False),
        (np.nan, True),  # a missing value is expected to pass both tests
    ]
    for val, result in cases:
        assert cla.InRange(0, 1).test(pd.Series([val])) == result
        assert cla.InSet({0, 0.5, 1}).test(pd.Series([val])) == result

    # Previously these were joined with bitwise `&` in a single assert, and
    # the positive case appeared twice verbatim. Separate asserts show which
    # expectation failed, and the redundant duplicate is dropped.
    assert cla.CoversRange(0, 10).test(pd.Series([0, 10]))
    assert not cla.CoversRange(0, 10).test(pd.Series([1, 9]))
"age-decile", "income-decile", "IT", "IC", "ICM", "ICMD", "GT", "GC", "GCM", "female head" } assert util.unique(out.columns) assert util.unique(new_cols) assert set.intersection(cols1, new_cols) == set() assert set.union(cols1, new_cols) == cols2 assert set.difference(cols2, cols1) == new_cols assert len(in_rows) == len(out) assert util.near(len(out), misc.num_people / com.subsample, tol_frac=1 / 5) per_cell_spec = { "age-decile": cl.InRange(0, 9), "income-decile": cl.InRange(0, 9), "female head": cl.InRange(0, 1) } per_column_spec = { "age-decile": cl.CoversRange(0, 9), "income-decile": cl.CoversRange(0, 9), "female head": cl.CoversRange(0, 1) } for k, v in per_cell_spec.items(): assert v.test(out[k]) for k, v in per_column_spec.items(): assert v.test(out[k])
def test_ranges(df):
    """Check that the columns of `df` lie in, and span, the expected ranges.

    First every column named in the `InRange` spec is tested, then every
    column named in the `CoversRange` spec.

    Args:
        df: a data frame containing every column named in the two specs.

    Returns:
        The log string "test_ranges()\\n" (nothing further is appended).

    Raises:
        AssertionError: if any column fails its property test.
    """
    log = "test_ranges()\n"

    def check_all(spec):
        # Apply each property to the column it is keyed by.
        for column in spec:
            assert spec[column].test(df[column])

    bounds = {
        "25-broad-categs": cl.InRange(1, 25),
        "big-hog": cl.InRange(0, 1),
        "coicop": cl.InRange(1e6, 2e7),
        # PITFALL: "freq-code"=11 <=> the purchase is never made.
        # This corresponds to a "per month" value of np.nan.
        "freq-code": cl.InRange(0, 10),
        "household": cl.InRange(0, 1e6),
        "is-purchase": cl.InRange(0, 1),
        "per month": cl.InRange(1 / 36 - 0.001, 31),
        "quantity": cl.InRange(0, 1e8),
        "value": cl.InRange(0, 3e9),
        # The special motorcycle tax, abusively lumped into the VAT table,
        # means the max "vat" is 0.27 rather than 0.19.
        "vat": cl.InRange(0, 0.3),
        "vat frac": cl.InRange(0, 0.3 / 1.3),
    }
    check_all(bounds)

    coverage = {
        "household": cl.CoversRange(2e5, 6e5),
        "per month": cl.CoversRange(0.05, 30),
        "quantity": cl.CoversRange(1, 100),
        "value": cl.CoversRange(3, 1e6),
        "weight": cl.CoversRange(10, 1000),
        "where-got": cl.CoversRange(1, 25),
        # The special motorcycle tax, abusively lumped into the VAT table,
        # means the max "vat" is 0.27 rather than 0.19.
        # *However*, in the smaller samples,
        # we can't be sure that whole range is covered:
        # there might be no motorcycle purchases.
        # That at least some purchase incurs a VAT of 0.19, though,
        # is a safe bet.
        "vat frac": cl.CoversRange(0, 0.19 / 1.19),
        "vat paid": cl.CoversRange(0, 1e5),
        "vat": cl.CoversRange(0, 0.19),
    }
    check_all(coverage)

    return log
"value, tax, predial", "value, tax, purchaselike non-predial non-VAT", "transactions", "value, non-purchase", "value, purchase", "value, spending", "value, consumption", "vat paid" } ) if com.subsample < 11: # The data is too sparse to test # the smaller samples this way for (c,ts) in [ ( "transactions", [ cla.MeanBounds ( 50 , 120 ), cla.CoversRange ( 2 , 200 ), cla.InRange ( 1 , 400 ), cla.MissingAtMost ( 0 ) ] ), ( "value, tax, purchaselike non-VAT", [ cla.MeanBounds (1e4 , 1e5), cla.CoversRange (0 , 2e6), cla.InRange (0 , 1.1e8), # someone pays a huge predial cla.MissingAtMost (0) ] ), ( "value, tax, predial", [ cla.MeanBounds (1e4 ,1e5), cla.CoversRange (0 ,1e3), cla.InRange (0 ,1.1e8), cla.MissingAtMost (0) ] ), ( "value, tax, purchaselike non-predial non-VAT",
"vat / income", "purchase value / income" ] assert ( len( merge.columns ) == len( hh_cols.columns ) + len( pur.columns ) - 1 + # omit the key that was merged on len( new_cols ) ) assert len( merge ) == len( hh_rows ) if True: assert (merge["region-1"] == "SAN ANDRÉS") . any() assert ( merge[ merge["region-1"] == "SAN ANDRÉS" ] ["vat paid"].max() == 0 ) if True: for k,v in { "vat / purchase value" : cl.InRange( 0, 0.3 ), # The special motorcycle tax, abusivelyed lump into the VAT table, # means the max "vat" is 0.27 rather than 0.19. "vat / income" : cl.InRange( 0, np.inf ), "purchase value / income" : cl.InRange( 0, np.inf ) }.items(): assert v.test( merge[k] ) for k,v in { # These bounds could be tighter, # but the 1/1000 subsample has a small range. "vat / purchase value" : cl.CoversRange( 0, 0.1 ), "vat / income" : cl.CoversRange( 0, np.inf ), "purchase value / income" : cl.CoversRange( 0.2, np.inf ) }.items(): assert v.test( merge[k] ) for k,v in {
def test_capitulo_c_data(df: pd.DataFrame):
    """Check the "CODE" and VAT columns of the capitulo-c data.

    "CODE" must satisfy both `InRange(1, 25)` and `CoversRange(1, 25)`;
    each of "vat", "vat, min" and "vat, max" must satisfy `InRange(0, 1)`.

    Raises:
        AssertionError: if any column fails its property test.
    """
    code_properties = [cla.InRange(1, 25), cla.CoversRange(1, 25)]
    assert all(prop.test(df["CODE"]) for prop in code_properties)

    vat_columns = ("vat", "vat, min", "vat, max")
    assert all(cla.InRange(0, 1).test(df[name]) for name in vat_columns)
def test_coicop_data(df: pd.DataFrame):
    """Check the "coicop" and VAT columns of the COICOP data.

    "coicop" must satisfy `InRange(1e6, 2e7)` and `CoversRange(2e6, 1e7)`;
    each of "vat", "vat, min" and "vat, max" must satisfy `InRange(0, 1)`.

    Raises:
        AssertionError: if any column fails its property test.
    """
    coicop_properties = (cla.InRange(1e6, 2e7), cla.CoversRange(2e6, 1e7))
    for prop in coicop_properties:
        assert prop.test(df["coicop"])

    vat_column = "vat"
    for suffix in ("", ", min", ", max"):
        # Builds "vat", "vat, min", "vat, max" in that order.
        assert cla.InRange(0, 1).test(df[vat_column + suffix])
def test_ranges(ppl: pd.DataFrame):
    """Check that every listed column of the person-level data is in range.

    Args:
        ppl: a data frame containing every column named in `specs`.

    Raises:
        AssertionError: if the column-name check fails or any column fails
            its property test.
    """
    # presumably verifies that no column name is repeated -- confirm
    # against the definition of `util.unique`.
    assert util.unique(ppl.columns)

    specs = {
        "household": cla.InRange(0, 1e7),
        "age": cla.InRange(0, 120),
        "edu": cla.InSet(set(files.edu_key.values())),
        "female": cla.InRange(0, 1),
        "household-member": cla.InRange(1, 50),
        "income, pension": cla.InRange(0, 3e8),
        "income, cesantia": cla.InRange(0, 1e8),
        "income, dividend": cla.InRange(0, 1e8),
        "independiente": cla.InRange(0, 1),
        "literate": cla.InRange(0, 1),
        "student": cla.InRange(0, 1),
        "weight": cla.InRange(0.001, 1e4),
        "pension, contributing (if not pensioned)": cla.InRange(0, 1),
        "pension, receiving": cla.InRange(0, 1),
        "pension, contributor(s) (if not pensioned) = split": cla.InRange(0, 1),
        "pension, contributor(s) (if not pensioned) = self": cla.InRange(0, 1),
        "pension, contributor(s) (if not pensioned) = employer": cla.InRange(0, 1),
        "seguro de riesgos laborales": cla.InRange(0, 1),
        "income, govt, cash": cla.InRange(0, 2e7),
        "income, govt, in-kind": cla.InRange(0, 1e7),
        "income, non-labor (tax def)": cla.InRange(0, 1e8),
        "income, rental + interest": cla.InRange(0, 1e9),
        "income, donacion": cla.InRange(0, 2e7),
        "income, infrequent": cla.InRange(0, 1e8),
        "income, ganancia ocasional, 10%-taxable": cla.InRange(0, 1e8),
        "income, ganancia ocasional, 20%-taxable": cla.InRange(0, 3e7),
        "income, labor, cash": cla.InRange(0, 3e9),
        "income, labor, in-kind": cla.InRange(0, 3e7),
        "income, cash": cla.InRange(0, 3e9),
        "income, in-kind": cla.InRange(0, 3e7),
        "income": cla.InRange(0, 3e9),
        "income, govt": cla.InRange(0, 3e7),
        "income, private": cla.InRange(0, 2e8),
        "income, labor": cla.InRange(0, 3e9),
        "income, borrowing": cla.InRange(0, 1e8),
        "rank, labor income": cla.InRange(1, 50),
        "empleado": cla.InRange(0, 1),
        "desempleado": cla.InRange(0, 1),
        "in labor force": cla.InRange(0, 1),
        "used savings": cla.InSet({True, False}),
        "disabled": cla.InSet({True, False}),
        "dependent": cla.InSet({True, False}),
        "race, indig": cla.InSet({True, False}),
        "race, git|rom": cla.InSet({True, False}),
        # This key was previously split across a line break inside the
        # string literal; rejoined to match the sibling "race, ..." keys.
        "race, raizal": cla.InSet({True, False}),
        "race, palenq": cla.InSet({True, False}),
        "race, neg|mul": cla.InSet({True, False}),
        "race, whi|mest": cla.InSet({True, False}),
    }
    for column, prop in specs.items():
        assert prop.test(ppl[column])