Exemplo n.º 1
0
def test_add_weighted_income_bins():
    dfx = pd.DataFrame(data=DATA, columns=['expanded_income', 's006', 'label'])
    dfb = add_weighted_income_bins(dfx, num_bins=100)
    bin_labels = dfb['bins'].unique()
    default_labels = set(range(1, 101))
    for lab in bin_labels:
        assert lab in default_labels
    # custom labels
    dfb = add_weighted_income_bins(dfx, weight_by_income_measure=True)
    assert 'bins' in dfb
    custom_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    dfb = add_weighted_income_bins(dfx, labels=custom_labels)
    assert 'bins' in dfb
    bin_labels = dfb['bins'].unique()
    for lab in bin_labels:
        assert lab in custom_labels
Exemplo n.º 2
0
def dropq_dist_table(resdf, groupby, result_type, suffix):
    """
    Create and return dropq distribution table.
    """
    # pylint: disable=too-many-locals
    res = resdf
    c04470_s = 'c04470' + suffix
    c00100_s = 'c00100' + suffix
    c09600_s = 'c09600' + suffix
    standard_s = 'standard' + suffix
    s006_s = 's006' + suffix
    returns_ided_s = 'num_returns_ItemDed' + suffix
    returns_sded_s = 'num_returns_StandardDed' + suffix
    returns_amt_s = 'num_returns_AMT' + suffix
    res[c04470_s] = res[c04470_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > res[standard_s])), 0)
    res[returns_ided_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > 0)), 0)
    res[returns_sded_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[standard_s] > 0)), 0)
    res[returns_amt_s] = res[s006_s].where(res[c09600_s] > 0, 0)
    if groupby == "weighted_deciles":
        dframe = add_weighted_income_bins(res, num_bins=10)
    elif groupby == "small_income_bins":
        dframe = add_income_bins(res, compare_with="soi")
    elif groupby == "large_income_bins":
        dframe = add_income_bins(res, compare_with="tpc")
    elif groupby == "webapp_income_bins":
        dframe = add_income_bins(res, compare_with="webapp")
    else:
        err = ("groupby must be either 'weighted_deciles' or "
               "'small_income_bins' or 'large_income_bins' or "
               "'webapp_income_bins'")
        raise ValueError(err)
    pd.options.display.float_format = '{:8,.0f}'.format
    if result_type == "weighted_sum":
        dframe = weighted(dframe, [col + suffix for col in STATS_COLUMNS])
        gby_bins = dframe.groupby('bins', as_index=False)
        gp_mean = gby_bins[[col + suffix for col in TABLE_COLUMNS]].sum()
        gp_mean.drop('bins', axis=1, inplace=True)
        sum_row = get_sums(dframe)[[col + suffix for col in TABLE_COLUMNS]]
    elif result_type == "weighted_avg":
        gp_mean = weighted_avg_allcols(dframe,
                                       [col + suffix for col in TABLE_COLUMNS])
        all_sums = get_sums(dframe, not_available=True)
        sum_row = all_sums[[col + suffix for col in TABLE_COLUMNS]]
    else:
        err = ("result_type must be either 'weighted_sum' or "
               "'weighted_avg'")
        raise ValueError(err)
    return gp_mean.append(sum_row)
Exemplo n.º 3
0
 def write_decile_table(dfx, tfile, tkind='Totals'):
     """
     Write to tfile the tkind decile table using dfx DataFrame.
     """
     dfx = add_weighted_income_bins(dfx, num_bins=10,
                                    income_measure='expanded_income',
                                    weight_by_income_measure=False)
     gdfx = dfx.groupby('bins', as_index=False)
     rtns_series = gdfx.apply(unweighted_sum, 's006')
     xinc_series = gdfx.apply(weighted_sum, 'expanded_income')
     itax_series = gdfx.apply(weighted_sum, 'iitax')
     ptax_series = gdfx.apply(weighted_sum, 'payrolltax')
     htax_series = gdfx.apply(weighted_sum, 'lumpsum_tax')
     ctax_series = gdfx.apply(weighted_sum, 'combined')
     # write decile table to text file
     row = 'Weighted Tax {} by Expanded-Income Decile\n'
     tfile.write(row.format(tkind))
     rowfmt = '{}{}{}{}{}{}\n'
     row = rowfmt.format('    Returns',
                         '    ExpInc',
                         '    IncTax',
                         '    PayTax',
                         '     LSTax',
                         '    AllTax')
     tfile.write(row)
     row = rowfmt.format('       (#m)',
                         '      ($b)',
                         '      ($b)',
                         '      ($b)',
                         '      ($b)',
                         '      ($b)')
     tfile.write(row)
     rowfmt = '{:9.1f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}{:10.1f}\n'
     for decile in range(0, 10):
         row = '{:2d}'.format(decile)
         row += rowfmt.format(rtns_series[decile] * 1e-6,
                              xinc_series[decile] * 1e-9,
                              itax_series[decile] * 1e-9,
                              ptax_series[decile] * 1e-9,
                              htax_series[decile] * 1e-9,
                              ctax_series[decile] * 1e-9)
         tfile.write(row)
     row = ' A'
     row += rowfmt.format(rtns_series.sum() * 1e-6,
                          xinc_series.sum() * 1e-9,
                          itax_series.sum() * 1e-9,
                          ptax_series.sum() * 1e-9,
                          htax_series.sum() * 1e-9,
                          ctax_series.sum() * 1e-9)
     tfile.write(row)
Exemplo n.º 4
0
def dropq_diff_table(df1, df2, groupby, res_col, diff_col, suffix, wsum):
    """
    Create and return dropq difference table.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    if groupby == "weighted_deciles":
        gdf = add_weighted_income_bins(df2, num_bins=10)
    elif groupby == "small_income_bins":
        gdf = add_income_bins(df2, compare_with="soi")
    elif groupby == "large_income_bins":
        gdf = add_income_bins(df2, compare_with="tpc")
    elif groupby == "webapp_income_bins":
        gdf = add_income_bins(df2, compare_with="webapp")
    else:
        err = ("groupby must be either 'weighted_deciles' or "
               "'small_income_bins' or 'large_income_bins' or "
               "'webapp_income_bins'")
        raise ValueError(err)
    # Difference in plans
    # Positive values are the magnitude of the tax increase
    # Negative values are the magnitude of the tax decrease
    df2[res_col + suffix] = df2[diff_col + suffix] - df1[diff_col]
    diffs = means_and_comparisons(res_col + suffix,
                                  gdf.groupby('bins', as_index=False),
                                  wsum + EPSILON)
    sum_row = get_sums(diffs)[diffs.columns]
    diffs = diffs.append(sum_row)  # pylint: disable=redefined-variable-type
    pd.options.display.float_format = '{:8,.0f}'.format
    srs_inc = ["{0:.2f}%".format(val * 100) for val in diffs['perc_inc']]
    diffs['perc_inc'] = pd.Series(srs_inc, index=diffs.index)
    srs_cut = ["{0:.2f}%".format(val * 100) for val in diffs['perc_cut']]
    diffs['perc_cut'] = pd.Series(srs_cut, index=diffs.index)
    srs_change = [
        "{0:.2f}%".format(val * 100) for val in diffs['share_of_change']
    ]
    diffs['share_of_change'] = pd.Series(srs_change, index=diffs.index)
    # columns containing weighted values relative to the binning mechanism
    non_sum_cols = [x for x in diffs.columns if 'mean' in x or 'perc' in x]
    for col in non_sum_cols:
        diffs.loc['sums', col] = 'n/a'
    return diffs
Exemplo n.º 5
0
def drop_records(df1, df2, mask):
    """
    Modify df1 and df2 by adding statistical fuzz for data privacy.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains results for the standard plan X and X'.

    df2: Pandas DataFrame
        contains results for the user-specified plan (Plan Y).

    mask: boolean numpy array
        contains info about whether or not each element of X and X' are same

    Returns
    -------
    fuzzed_df1: Pandas DataFrame

    fuzzed_df2: Pandas DataFrame

    Notes
    -----
    This function groups both DataFrames based on the web application's
    income groupings (both weighted decile and income bins), and then
    pseudo-randomly picks three records to 'drop' within each bin.
    We keep track of the three dropped records in both group-by
    strategies and then use these 'flag' columns to modify all
    columns of interest, creating new '_dec' columns for
    statistics based on weighted deciles and '_bin' columns for
    statitistics based on income bins.  Lastly we calculate
    individual income tax differences, payroll tax differences, and
    combined tax differences between the baseline and reform
    for the two groupings.
    """
    # perform all statistics on (Y + X') - X

    # Group first
    df2['mask'] = mask
    df1['mask'] = mask

    df2 = add_weighted_income_bins(df2)
    df1 = add_weighted_income_bins(df1)
    gp2_dec = df2.groupby('bins')

    df2 = add_income_bins(df2, bins=WEBAPP_INCOME_BINS)
    df1 = add_income_bins(df1, bins=WEBAPP_INCOME_BINS)
    gp2_bin = df2.groupby('bins')

    # Transform to get the 'flag' column (3 choices to drop in each bin)
    df2['flag_dec'] = gp2_dec['mask'].transform(chooser)
    df2['flag_bin'] = gp2_bin['mask'].transform(chooser)

    # first calculate all of X'
    columns_to_make_noisy = set(TABLE_COLUMNS) | set(STATS_COLUMNS)
    # these don't exist yet
    columns_to_make_noisy.remove('num_returns_ItemDed')
    columns_to_make_noisy.remove('num_returns_StandardDed')
    columns_to_make_noisy.remove('num_returns_AMT')
    for col in columns_to_make_noisy:
        df2[col + '_dec'] = (df2[col] * df2['flag_dec'] -
                             df1[col] * df2['flag_dec'] + df1[col])
        df2[col + '_bin'] = (df2[col] * df2['flag_bin'] -
                             df1[col] * df2['flag_bin'] + df1[col])

    # Difference in plans
    # Positive values are the magnitude of the tax increase
    # Negative values are the magnitude of the tax decrease
    df2['tax_diff_dec'] = df2['iitax_dec'] - df1['iitax']
    df2['tax_diff_bin'] = df2['iitax_bin'] - df1['iitax']
    df2['payrolltax_diff_dec'] = df2['payrolltax_dec'] - df1['payrolltax']
    df2['payrolltax_diff_bin'] = df2['payrolltax_bin'] - df1['payrolltax']
    df2['combined_diff_dec'] = df2['combined_dec'] - df1['combined']
    df2['combined_diff_bin'] = df2['combined_bin'] - df1['combined']

    return df1, df2