def test_add_quantile_trow_var():
    """
    Exercise add_quantile_table_row_variable with 100 quantiles on DATA.

    Checks that the table_row labels stay within 1..100 when
    weight_by_income_measure is False, that a table_row column is added
    when that argument is left at its default, and that asking for
    decile_details with 100 quantiles raises ValueError.
    """
    num_quantiles = 100
    income_frame = pd.DataFrame(data=DATA,
                                columns=['expanded_income', 's006', 'label'])
    # labels produced with weight_by_income_measure=False must be in 1..100
    binned = add_quantile_table_row_variable(income_frame, 'expanded_income',
                                             num_quantiles,
                                             decile_details=False,
                                             weight_by_income_measure=False)
    valid_labels = set(range(1, num_quantiles + 1))
    for label in binned['table_row'].unique():
        assert label in valid_labels
    # leaving weight_by_income_measure at its default still adds table_row
    binned = add_quantile_table_row_variable(income_frame, 'expanded_income',
                                             num_quantiles,
                                             decile_details=False)
    assert 'table_row' in binned
    # decile_details=True is rejected when num_quantiles is 100
    with pytest.raises(ValueError):
        binned = add_quantile_table_row_variable(income_frame,
                                                 'expanded_income',
                                                 num_quantiles,
                                                 decile_details=True)
def test_add_quantile_trow_var():
    """
    Test add_quantile_table_row_variable using 100 quantiles of DATA.

    Three calls are made: one with weight_by_income_measure=False (labels
    must all be in 1..100), one with that argument defaulted (a table_row
    column must be present), and one with decile_details=True (which must
    raise ValueError).
    """
    column_names = ['expanded_income', 's006', 'label']
    dframe = pd.DataFrame(data=DATA, columns=column_names)
    common_args = (dframe, 'expanded_income', 100)
    # call with weight_by_income_measure=False and inspect the labels
    out = add_quantile_table_row_variable(*common_args,
                                          decile_details=False,
                                          weight_by_income_measure=False)
    allowed = set(range(1, 101))
    observed = out['table_row'].unique()
    for row_label in observed:
        assert row_label in allowed
    # call with weight_by_income_measure left at its default
    out = add_quantile_table_row_variable(*common_args,
                                          decile_details=False)
    assert 'table_row' in out
    # call with decile_details=True, which must fail for 100 quantiles
    with pytest.raises(ValueError):
        out = add_quantile_table_row_variable(*common_args,
                                              decile_details=True)
def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
    """
    Add to df2 one new column, named col + suffix, for each col in
    cols_to_fuzz.

    When do_fuzzing is True, some df2 records in each bin defined by
    bin_type and imeasure are fuzzed, which means their post-reform tax
    results (in df2) are replaced by their pre-reform tax results (in df1).
    When do_fuzzing is False, the new columns equal the df2 columns.

    bin_type must be 'dec' (rows from add_quantile_table_row_variable with
    10 quantiles), 'bin' (rows from add_income_table_row_variable using
    STANDARD_INCOME_BINS), or 'agg' (no grouping).
    """
    # pylint: disable=too-many-arguments
    assert bin_type in ('dec', 'bin', 'agg')
    if bin_type == 'dec':
        df2 = add_quantile_table_row_variable(df2, imeasure, 10,
                                              decile_details=True)
    elif bin_type == 'bin':
        df2 = add_income_table_row_variable(df2, imeasure,
                                            bins=STANDARD_INCOME_BINS)
    if bin_type in ('dec', 'bin'):
        # group by table row, then drop the temporary grouping column
        grouped = df2.groupby('table_row')
        del df2['table_row']
    else:
        grouped = df2
    if do_fuzzing:
        # chooser is applied to the 'mask' column of each group
        # NOTE(review): presumably it yields 0 for records to fuzz and
        # 1 otherwise -- confirm against chooser's definition
        nofuzz_values = grouped['mask'].transform(chooser)
    else:
        # never do any results fuzzing
        nofuzz_values = np.ones(df2.shape[0], dtype=np.int8)
    df2['nofuzz'] = nofuzz_values
    nofuzz = df2['nofuzz']
    for col in cols_to_fuzz:
        baseline = df1[col]
        # nofuzz == 1 keeps the df2 value; nofuzz == 0 gives the df1 value
        df2[col + suffix] = (df2[col] * nofuzz -
                             baseline * nofuzz + baseline)
def write_decile_table(dfx, tfile, tkind='Totals'):
    """
    Write the tkind decile table derived from the dfx DataFrame to tfile.

    dfx is split into ten expanded_income table rows, and for each row a
    line is written containing returns (scaled by 1e-6) followed by
    expanded income, income tax, payroll tax, lump-sum tax and combined
    tax (each scaled by 1e-9).  A final line labeled A contains the sums
    over all ten rows.
    """
    dfx = add_quantile_table_row_variable(dfx, 'expanded_income', 10,
                                          decile_details=False,
                                          pop_quantiles=False,
                                          weight_by_income_measure=False)
    gdfx = dfx.groupby('table_row', as_index=False)
    # keep only the second column of each apply result
    # NOTE(review): presumably column 0 is the table_row label and
    # column 1 is the per-row sum -- confirm
    returns = gdfx.apply(unweighted_sum, 's006').values[:, 1]
    dollar_vars = ['expanded_income', 'iitax', 'payrolltax',
                   'lumpsum_tax', 'combined']
    dollars = list()
    for varname in dollar_vars:
        dollars.append(gdfx.apply(weighted_sum, varname).values[:, 1])
    # write decile table to text file
    title = 'Weighted Tax {} by Baseline Expanded-Income Decile\n'
    tfile.write(title.format(tkind))
    header_fmt = '{}{}{}{}{}{}\n'
    tfile.write(header_fmt.format(' Returns', ' ExpInc', ' IncTax',
                                  ' PayTax', ' LSTax', ' AllTax'))
    tfile.write(header_fmt.format(' (#m)', ' ($b)', ' ($b)',
                                  ' ($b)', ' ($b)', ' ($b)'))
    number_fmt = '{:9.2f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}\n'
    for decile in range(0, 10):
        line = '{:2d}'.format(decile)
        line += number_fmt.format(returns[decile] * 1e-6,
                                  *(amt[decile] * 1e-9 for amt in dollars))
        tfile.write(line)
    line = ' A'
    line += number_fmt.format(returns.sum() * 1e-6,
                              *(amt.sum() * 1e-9 for amt in dollars))
    tfile.write(line)
    # release the intermediate results before collecting garbage
    del gdfx
    del returns
    del dollars
    gc.collect()
def write_decile_table(dfx, tfile, tkind='Totals'):
    """
    Write the tkind decile table derived from the dfx DataFrame to tfile.

    dfx is split into ten expanded_income table rows, and for each row a
    line is written containing returns (scaled by 1e-6) followed by
    expanded income, income tax, payroll tax, lump-sum tax and combined
    tax (each scaled by 1e-9).  A final line labeled A contains the sums
    over all ten rows.
    """
    dfx = add_quantile_table_row_variable(dfx, 'expanded_income', 10,
                                          decile_details=False,
                                          weight_by_income_measure=False)
    gdfx = dfx.groupby('table_row', as_index=False)
    # one result per table row for each variable, indexed by row below
    returns = gdfx.apply(unweighted_sum, 's006')
    dollar_vars = ['expanded_income', 'iitax', 'payrolltax',
                   'lumpsum_tax', 'combined']
    dollars = list()
    for varname in dollar_vars:
        dollars.append(gdfx.apply(weighted_sum, varname))
    # write decile table to text file
    title = 'Weighted Tax {} by Baseline Expanded-Income Decile\n'
    tfile.write(title.format(tkind))
    header_fmt = '{}{}{}{}{}{}\n'
    tfile.write(header_fmt.format(' Returns', ' ExpInc', ' IncTax',
                                  ' PayTax', ' LSTax', ' AllTax'))
    tfile.write(header_fmt.format(' (#m)', ' ($b)', ' ($b)',
                                  ' ($b)', ' ($b)', ' ($b)'))
    number_fmt = '{:9.2f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}\n'
    for decile in range(0, 10):
        line = '{:2d}'.format(decile)
        line += number_fmt.format(returns[decile] * 1e-6,
                                  *(amt[decile] * 1e-9 for amt in dollars))
        tfile.write(line)
    line = ' A'
    line += number_fmt.format(returns.sum() * 1e-6,
                              *(amt.sum() * 1e-9 for amt in dollars))
    tfile.write(line)
    # release the intermediate results before collecting garbage
    del gdfx
    del returns
    del dollars
    gc.collect()
def test_create_tables(cps_subsample):
    """
    Compare selected columns of difference and distribution tables, built
    from a current-law Calculator and a reform Calculator, with expected
    values.

    Every column comparison is carried out before the test fails; the
    actual values of each mismatched column are printed (one per line with
    a trailing comma) so they can be pasted into the expected lists, and
    the final assertion names the mismatched columns.
    """
    # pylint: disable=too-many-statements,too-many-branches
    # create a current-law Policy object and Calculator object calc1
    rec = Records.cps_constructor(data=cps_subsample)
    pol = Policy()
    calc1 = Calculator(policy=pol, records=rec)
    calc1.calc_all()
    # create a policy-reform Policy object and Calculator object calc2
    reform = {2013: {'_II_rt1': [0.15]}}
    pol.implement_reform(reform)
    calc2 = Calculator(policy=pol, records=rec)
    calc2.calc_all()

    mismatches = list()

    def check(kind, table, tabcol, expected, atol, fmt, equal_nan=False):
        """
        Record and print a mismatch when table[tabcol] differs from
        expected by more than atol (absolute tolerance only).
        """
        # tolist() lets numpy build the comparison array from Python
        # scalars whatever the dtype of the column is
        if np.allclose(table[tabcol].tolist(), expected,
                       atol=atol, rtol=0.0, equal_nan=equal_nan):
            return
        mismatches.append('{}:{}'.format(kind, tabcol))
        print(kind, tabcol)
        for val in table[tabcol].values:
            print(fmt.format(val))

    # test creating various difference tables

    diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
                                   calc2.dataframe(DIFF_VARIABLES),
                                   groupby='large_income_bins',
                                   income_measure='expanded_income',
                                   tax_to_diff='combined')
    assert isinstance(diff, pd.DataFrame)
    expected = [np.nan, np.nan, -0.16, -0.57, -0.72, -0.69, -0.82,
                -0.80, -0.75, -0.65, -0.18, -0.59]
    check('diff', diff, 'pc_aftertaxinc', expected, 0.005, '{:.2f},',
          equal_nan=True)

    diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
                                   calc2.dataframe(DIFF_VARIABLES),
                                   groupby='standard_income_bins',
                                   income_measure='expanded_income',
                                   tax_to_diff='iitax')
    assert isinstance(diff, pd.DataFrame)
    expected = [np.nan, np.nan, -0.16, -0.57, -0.72, -0.69, -0.82,
                -0.80, -0.75, -0.65, -0.23, -0.09, -0.06, -0.59]
    check('diff', diff, 'pc_aftertaxinc', expected, 0.005, '{:.2f},',
          equal_nan=True)

    diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
                                   calc2.dataframe(DIFF_VARIABLES),
                                   groupby='small_income_bins',
                                   income_measure='expanded_income',
                                   tax_to_diff='iitax')
    assert isinstance(diff, pd.DataFrame)
    expected = [np.nan, np.nan, -0.30, -0.10, -0.24, -0.76, -0.67,
                -0.75, -0.69, -0.82, -0.80, -0.75, -0.65, -0.23,
                -0.09, -0.08, -0.07, -0.05, -0.02, np.nan, -0.59]
    check('diff', diff, 'pc_aftertaxinc', expected, 0.005, '{:.2f},',
          equal_nan=True)

    diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
                                   calc2.dataframe(DIFF_VARIABLES),
                                   groupby='weighted_deciles',
                                   income_measure='expanded_income',
                                   tax_to_diff='combined')
    assert isinstance(diff, pd.DataFrame)
    expected = [0, 0, 1219678, 15503037, 25922077, 35000592, 48336897,
                62637728, 79750078, 93136108, 116996252, 102458801,
                580961247, 63156380, 33664610, 5637811]
    check('diff', diff, 'tot_change', expected, 0.51, '{:.0f},')
    expected = [0.00, 0.00, 0.21, 2.67, 4.46, 6.02, 8.32, 10.78,
                13.73, 16.03, 20.14, 17.64, 100.00, 10.87, 5.79, 0.97]
    check('diff', diff, 'share_of_change', expected, 0.005, '{:.2f},')
    expected = [np.nan, np.nan, -0.15, -0.62, -0.70, -0.73, -0.78,
                -0.80, -0.80, -0.74, -0.71, -0.30, -0.59, -0.55,
                -0.25, -0.06]
    check('diff', diff, 'pc_aftertaxinc', expected, 0.005, '{:.2f},',
          equal_nan=True)

    # test creating various distribution tables

    dvdf = calc2.distribution_table_dataframe()
    dvdf = add_quantile_table_row_variable(dvdf, 'expanded_income',
                                           num_quantiles=10,
                                           decile_details=True)
    dist = create_distribution_table(dvdf,
                                     groupby='weighted_deciles',
                                     income_measure='expanded_income',
                                     result_type='weighted_sum')
    assert isinstance(dist, pd.DataFrame)
    expected = [0, 0, -53644343, -65258622, -57617119, 37391333,
                200879230, 329784586, 553827330, 1015854407, 1731283600,
                7090603505, 10783103907, 1638192777, 2213960052,
                3238450675]
    check('dist', dist, 'iitax', expected, 0.5, '{:.0f},')
    expected = [0, 0, 2561, 12610, 21936, 29172, 50890, 61563, 78247,
                91823, 118523, 128886, 596211, 63986, 51634, 13266]
    check('dist', dist, 'num_returns_ItemDed', expected, 0.5, '{:.0f},')
    expected = [0, 0, 835224673, 2639667638, 3940559051, 5286856071,
                6972849344, 8881099529, 11467767759, 14761195525,
                19832126806, 44213000235, 118830346631, 14399218059,
                16868648076, 12945134101]
    check('dist', dist, 'expanded_income', expected, 0.5, '{:.0f},')
    expected = [0, 0, 818813684, 2466000535, 3671150517, 4790979126,
                6173998985, 7754183496, 9907604744, 12510477225,
                16273592612, 33915377411, 98282178334, 11345456373,
                13400757263, 9169163776]
    check('dist', dist, 'aftertax_income', expected, 0.5, '{:.0f},')

    dist = create_distribution_table(calc2.distribution_table_dataframe(),
                                     groupby='standard_income_bins',
                                     income_measure='expanded_income',
                                     result_type='weighted_sum')
    assert isinstance(dist, pd.DataFrame)
    expected = [0, 0, -42244205, -76727831, -62581860, 53797887,
                217016689, 723516183, 1108097059, 3272479928,
                2818979541, 950296405, 1820474110, 10783103907]
    check('dist', dist, 'iitax', expected, 0.5, '{:.0f},')
    expected = [0, 0, 1202, 13614, 27272, 34407, 48265, 117225,
                103319, 181885, 61014, 5126, 2882, 596211]
    check('dist', dist, 'num_returns_ItemDed', expected, 0.5, '{:.0f},')

    # fail only after every table has been checked and printed
    assert not mismatches, (
        'table columns differ from expected values: ' +
        ', '.join(mismatches)
    )
def fuzzed(df1, df2, reform_affected, table_row_type):
    """
    Create fuzzed df2 dataframe and corresponding unfuzzed df1 dataframe.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains results variables for the baseline policy, which are not
        changed by this function

    df2: Pandas DataFrame
        contains results variables for the reform policy, which are not
        changed by this function

    reform_affected: boolean numpy array (not changed by this function)
        True for filing units with a reform-induced combined tax
        difference; otherwise False

    table_row_type: string
        valid values are 'aggr', 'xbin', and 'xdec'

    Returns
    -------
    df1, df2: Pandas DataFrames
        where copied df2 is fuzzed to maintain data privacy and
        where copied df1 has same filing unit order as has the fuzzed df2

    Notes
    -----
    The pandas 'mode.chained_assignment' option is switched off only while
    the fuzzing loop runs; whatever value it had on entry is restored on
    exit, including when an exception is raised.
    """
    assert table_row_type in ('aggr', 'xbin', 'xdec')
    assert len(df1.index) == len(df2.index)
    assert reform_affected.size == len(df1.index)
    # work on deep copies so that the caller's objects are left untouched
    df1 = copy.deepcopy(df1)
    df2 = copy.deepcopy(df2)
    # add copy of reform_affected to df2
    df2['reform_affected'] = copy.deepcopy(reform_affected)
    # construct table rows, for which filing units in each row must be fuzzed
    # (df2 rows are always defined using the baseline expanded_income)
    if table_row_type == 'xbin':
        df1 = add_income_table_row_variable(df1, 'expanded_income',
                                            STANDARD_INCOME_BINS)
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_income_table_row_variable(df2, 'expanded_income_baseline',
                                            STANDARD_INCOME_BINS)
        del df2['expanded_income_baseline']
    elif table_row_type == 'xdec':
        df1 = add_quantile_table_row_variable(df1, 'expanded_income',
                                              10, decile_details=True)
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_quantile_table_row_variable(df2, 'expanded_income_baseline',
                                              10, decile_details=True)
        del df2['expanded_income_baseline']
    elif table_row_type == 'aggr':
        # a single table row containing every filing unit
        df1['table_row'] = np.ones(reform_affected.shape, dtype=int)
        df2['table_row'] = df1['table_row']
    gdf1 = df1.groupby('table_row', sort=False)
    gdf2 = df2.groupby('table_row', sort=False)
    del df1['table_row']
    del df2['table_row']
    # fuzz up to NUM_TO_FUZZ filing units randomly chosen in each group
    # (or table row), where fuzz means to replace the reform (2) results
    # with the baseline (1) results for each chosen filing unit
    with pd.option_context('mode.chained_assignment', None):
        group_list = list()
        for name, group2 in gdf2:
            # positions, within this group, of the reform-affected units
            indices = np.where(group2['reform_affected'])
            num = min(len(indices[0]), NUM_TO_FUZZ)
            if num > 0:
                choices = np.random.choice(
                    indices[0],  # pylint: disable=no-member
                    size=num, replace=False)
                group1 = gdf1.get_group(name)
                # NOTE(review): positional copy assumes group1 and group2
                # list the same filing units in the same order -- confirm
                for idx in choices:
                    group2.iloc[idx] = group1.iloc[idx]
            group_list.append(group2)
        df2 = pd.concat(group_list)
        del df2['reform_affected']
    # reinstate index order of df1 and df2 and return
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)
    return (df1, df2)
def summary(df1, df2, mask):
    """
    Build the summary results tables for a baseline/reform comparison.

    df1 contains raw results for baseline plan
    df2 contains raw results for reform plan
    mask is the boolean array specifying records with reform-induced tax diffs

    returns dictionary of summary results DataFrames with these keys:
      aggr_d, aggr_1, aggr_2 (aggregate tax differences, baseline totals
      and reform totals, each indexed by AGGR_ROW_NAMES);
      diff_{itax,ptax,comb}_{xdec,xbin} (difference tables);
      dist{1,2}_{xdec,xbin} (baseline and reform distribution tables)
    """
    # pylint: disable=too-many-statements,too-many-locals
    df2_xdec, df2_xbin, df2_aggr = create_results_columns(df1, df2, mask)
    df1_xdec = add_quantile_table_row_variable(df1, 'expanded_income',
                                               10, decile_details=True)
    del df1_xdec['table_row']
    df1_xbin = add_income_table_row_variable(df1, 'expanded_income',
                                             bins=STANDARD_INCOME_BINS)
    del df1_xbin['table_row']
    summ = dict()

    # aggregate totals: reform-minus-baseline, baseline, and reform
    aggrd = list()
    aggr1 = list()
    aggr2 = list()
    for taxvar in ('iitax', 'payrolltax', 'combined'):
        reform_tax = df2_aggr[taxvar + '_agg']
        aggrd.append(((reform_tax - df1[taxvar]) * df2['s006']).sum())
        aggr1.append((df1[taxvar] * df1['s006']).sum())
        aggr2.append((reform_tax * df2['s006']).sum())
    # tax difference totals between reform and baseline
    summ['aggr_d'] = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)
    # totals for baseline
    summ['aggr_1'] = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)
    # totals for reform
    summ['aggr_2'] = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)
    del df1
    del df2

    # each table-row type pairs its own baseline frame with its own reform
    # frame, so an xbin table is never built from the xdec baseline frame
    row_specs = [
        ('xdec', 'weighted_deciles', df1_xdec, df2_xdec),
        ('xbin', 'standard_income_bins', df1_xbin, df2_xbin),
    ]
    tax_specs = [('iitax', 'itax'),
                 ('payrolltax', 'ptax'),
                 ('combined', 'comb')]

    # create difference tables grouped by xdec and then by xbin
    for rowtype, groupby, base_df, reform_df in row_specs:
        for taxvar, abbrev in tax_specs:
            # expose the rowtype-specific reform results under the plain
            # variable name that is passed as tax_to_diff
            reform_df[taxvar] = reform_df[taxvar + '_' + rowtype]
            summ['diff_' + abbrev + '_' + rowtype] = \
                create_difference_table(base_df, reform_df,
                                        groupby=groupby,
                                        income_measure='expanded_income',
                                        tax_to_diff=taxvar)

    # create distribution tables grouped by xdec and then by xbin
    for rowtype, groupby, base_df, reform_df in row_specs:
        summ['dist1_' + rowtype] = \
            create_distribution_table(base_df,
                                      groupby=groupby,
                                      income_measure='expanded_income',
                                      result_type='weighted_sum')
        # copy every suffixed reform column to its root column name
        suffix = '_' + rowtype
        cols_with_suffix = [c for c in list(reform_df) if c.endswith(suffix)]
        for col in cols_with_suffix:
            root_col_name = col.replace(suffix, '')
            reform_df[root_col_name] = reform_df[col]
        # reform table rows are defined by baseline expanded income
        reform_df['expanded_income_baseline'] = base_df['expanded_income']
        summ['dist2_' + rowtype] = \
            create_distribution_table(
                reform_df,
                groupby=groupby,
                income_measure='expanded_income_baseline',
                result_type='weighted_sum')

    # return dictionary of summary results
    return summ