def __init__(self, growfactors_filename=FILENAME):
    """
    Build the object from a CSV file of growth factors.

    The file named by growfactors_filename is looked for under
    GrowFactors.FILE_PATH; when no such file exists there, the base name
    of growfactors_filename is handed to read_egg_csv instead.  Either
    way the data are indexed by the YEAR column, and the set of column
    names must equal GrowFactors.VALID_NAMES.

    Sets self._first_year and self._last_year from the YEAR index, stores
    the factors as float64 values in self.gfdf, and sets self.used False.

    Raises ValueError when growfactors_filename is not a string or when
    the column names differ from GrowFactors.VALID_NAMES.
    """
    if not isinstance(growfactors_filename, str):
        raise ValueError('growfactors_filename is not a string')
    # read the factors, preferring a file found on disk
    local_path = os.path.join(GrowFactors.FILE_PATH, growfactors_filename)
    if os.path.isfile(local_path):
        factors = pd.read_csv(local_path, index_col='YEAR')
    else:  # find file in conda package
        factors = read_egg_csv(os.path.basename(growfactors_filename),
                               index_col='YEAR')  # pragma: no cover
    assert isinstance(factors, pd.DataFrame)
    # columns must be exactly the valid names: none missing and none extra
    found_names = set(factors.columns)
    expected_names = GrowFactors.VALID_NAMES
    if found_names != expected_names:
        template = 'missing names are: {} and invalid names are: {}'
        raise ValueError(template.format(expected_names - found_names,
                                         found_names - expected_names))
    # the YEAR index supplies the first and last year covered
    self._first_year = min(factors.index)
    self._last_year = max(factors.index)
    # store the factors on the instance as float64 values
    self.gfdf = pd.DataFrame()
    setattr(self, 'gfdf',
            factors.astype(np.float64))  # pylint: disable=no-member
    # factors start out unused (that is, not yet accessed)
    self.used = False
def _read_ratios(self, ratios):
    """
    Read Records adjustment ratios and store them in self.ADJ.

    ratios may be:
      * None: self.ADJ becomes an empty placeholder DataFrame;
      * a Pandas DataFrame that is already transposed (it must contain an
        'INT2013' column) and has an unnamed index;
      * a string naming a CSV file, which is joined to Records.CUR_PATH;
        when that path is not a file, its base name is read with
        read_egg_csv instead.  Data read from a file are transposed.

    The stored DataFrame has its index named 'agi_bin' and holds float32
    values.  A DataFrame passed in by the caller is never modified, so
    the same object can be passed to this method more than once.

    Raises ValueError when ratios is none of the kinds listed above.
    """
    if ratios is None:
        setattr(self, 'ADJ', pd.DataFrame({'nothing': []}))
        return
    if isinstance(ratios, pd.DataFrame):
        assert 'INT2013' in ratios.columns  # check for transposed
        assert ratios.index.name is None  # check for no-index
        ADJ = ratios
    elif isinstance(ratios, str):
        ratios_path = os.path.join(Records.CUR_PATH, ratios)
        if os.path.isfile(ratios_path):
            ADJ = pd.read_csv(ratios_path,
                              index_col=0)
        else:
            # cannot call read_egg_ function in unit tests
            ADJ = read_egg_csv(os.path.basename(ratios_path),
                               index_col=0)  # pragma: no cover
        ADJ = ADJ.transpose()
    else:
        msg = 'ratios is neither None nor a Pandas DataFrame nor a string'
        raise ValueError(msg)
    assert isinstance(ADJ, pd.DataFrame)
    # rename_axis returns a new object; assigning to ADJ.index.name would
    # instead rename the index of the caller's DataFrame, which makes the
    # no-index assertion above fail if that DataFrame is ever reused
    ADJ = ADJ.rename_axis('agi_bin')
    self.ADJ = pd.DataFrame()
    setattr(self, 'ADJ', ADJ.astype(np.float32))
    del ADJ
def _read_weights(self, weights):
    """
    Store Records weights in self.WT.

    weights may be None (an empty placeholder DataFrame is stored as is),
    a Pandas DataFrame (used directly), or a string naming a CSV file.
    A string is joined to Records.CUR_PATH; when that path is not a file,
    its base name is read with read_egg_csv instead.  Except in the None
    case, the stored DataFrame is converted to int32 values.
    Assumes weights are integers equal to 100 times the real weight.

    Raises ValueError for any other kind of weights argument.
    """
    if weights is None:
        setattr(self, 'WT', pd.DataFrame({'nothing': []}))
        return
    given_frame = isinstance(weights, pd.DataFrame)
    if not given_frame and not isinstance(weights, str):
        raise ValueError(
            'weights is not None or a string or a Pandas DataFrame')
    if given_frame:
        wt_frame = weights
    else:
        csv_path = os.path.join(Records.CUR_PATH, weights)
        if os.path.isfile(csv_path):
            wt_frame = pd.read_csv(csv_path)
        else:
            # cannot call read_egg_ function in unit tests
            wt_frame = read_egg_csv(
                os.path.basename(csv_path))  # pragma: no cover
    assert isinstance(wt_frame, pd.DataFrame)
    setattr(self, 'WT', wt_frame.astype(np.int32))
def read_cps_data():
    """
    Return data in cps.csv.gz as a Pandas DataFrame.

    The file is read from Records.CODE_PATH when it exists there;
    otherwise it is requested from read_egg_csv by its base name.
    """
    fname = os.path.join(Records.CODE_PATH, 'cps.csv.gz')
    if os.path.isfile(fname):
        cpsdf = pd.read_csv(fname)
    else:  # find file in conda package
        # fname is a filesystem path already known not to exist, so pass
        # only the base name, as the other callers of read_egg_csv do
        cpsdf = read_egg_csv(os.path.basename(fname))  # pragma: no cover
    return cpsdf
def read_cps_data():
    """
    Return data in cps.csv.gz as a Pandas DataFrame.

    The file is read from Records.CUR_PATH when it exists there;
    otherwise it is requested from read_egg_csv by its base name.
    """
    fname = os.path.join(Records.CUR_PATH, 'cps.csv.gz')
    if os.path.isfile(fname):
        cpsdf = pd.read_csv(fname)
    else:
        # cannot call read_egg_ function in unit tests
        # fname is a filesystem path already known not to exist, so pass
        # only the base name, as the other callers of read_egg_csv do
        cpsdf = read_egg_csv(os.path.basename(fname))  # pragma: no cover
    return cpsdf
def _read_weights(self, weights):
    """
    Store Records weights in self.WT.

    weights may be None (an empty placeholder DataFrame is stored), a
    Pandas DataFrame (stored as given), or a string.  A string that names
    an existing file is read with pd.read_csv; any other string causes
    Records.WEIGHTS_FILENAME to be read with read_egg_csv.

    Raises ValueError for any other kind of weights argument.
    """
    if weights is None:
        placeholder = pd.DataFrame({'nothing': []})
        setattr(self, 'WT', placeholder)
        return
    given_frame = isinstance(weights, pd.DataFrame)
    if not given_frame and not isinstance(weights, six.string_types):
        raise ValueError(
            'weights is not None or a string or a Pandas DataFrame')
    if given_frame:
        wt_frame = weights
    elif os.path.isfile(weights):
        # pylint: disable=redefined-variable-type
        # (above because pylint mistakenly thinks WT not a DataFrame)
        wt_frame = pd.read_csv(weights)
    else:
        wt_frame = read_egg_csv(Records.WEIGHTS_FILENAME)
    assert isinstance(wt_frame, pd.DataFrame)
    setattr(self, 'WT', wt_frame)
def _read_benefits(self, benefits):
    """
    Store Records extrapolated benefits in self.BEN.

    benefits may be None (an empty placeholder DataFrame is stored), a
    Pandas DataFrame, or a string naming a CSV file.  A string is joined
    to Records.CUR_PATH; when that path is not a file, its base name is
    read with read_egg_csv instead.  The benefits data are left-merged
    onto self.RECID by the RECID column, so that every record gets a row,
    with missing values set to zero, and the result is stored as float32.
    Should only be used with the cps.csv file.

    Raises ValueError for any other kind of benefits argument.
    """
    if benefits is None:
        setattr(self, 'BEN', pd.DataFrame({'Nothing': []}))
        return
    given_frame = isinstance(benefits, pd.DataFrame)
    if not given_frame and not isinstance(benefits, six.string_types):
        raise ValueError(
            'benefits is not None or a string or a Pandas DataFrame')
    if given_frame:
        partial = benefits
    else:
        benefits_path = os.path.join(Records.CUR_PATH, benefits)
        if os.path.isfile(benefits_path):
            partial = pd.read_csv(benefits_path)
        else:
            # cannot call read_egg_ function in unit tests
            partial = read_egg_csv(
                os.path.basename(benefits_path))  # pragma: no cover
    assert isinstance(partial, pd.DataFrame)
    # one row per RECID, including those who don't receive benefits,
    # whose missing values become zero
    all_recids = pd.DataFrame({'RECID': self.RECID})
    expanded = all_recids.merge(partial, on='RECID', how='left').fillna(0)
    # the merge must not have added or dropped any rows
    assert len(all_recids.index) == len(expanded.index)
    self.BEN = pd.DataFrame()
    setattr(self, 'BEN', expanded.astype(np.float32))
def test_read_egg_csv():
    """
    Check that read_egg_csv raises ValueError for the name 'bad_filename'.
    """
    bogus_name = 'bad_filename'
    with pytest.raises(ValueError):
        read_egg_csv(bogus_name)
def _read_data(self, data, exact_calcs):
    """
    Load Records data and create one array attribute per variable.

    data is either a Pandas DataFrame, which is used directly, or a
    string: a string naming an existing file is read with pd.read_csv and
    any other string is passed to read_egg_csv.  Each column whose name
    is in Records.USABLE_READ_VARS becomes an array attribute (int32 for
    names in Records.INTEGER_READ_VARS, float64 otherwise); the names of
    all other columns are collected in self.IGNORED_VARS.  Calculated
    variables and usable variables absent from the data become arrays of
    zeros.  The num and sep arrays are then derived from MARS, and the
    exact array is filled according to the boolean value of exact_calcs.

    Raises ValueError when data is neither a string nor a DataFrame, when
    a variable in Records.MUST_READ_VARS is missing, or when MARS or EIC
    contains a value outside its valid range.
    """
    # pylint: disable=too-many-statements,too-many-branches
    if Records.INTEGER_VARS == set():
        Records.read_var_info()
    # obtain the specified data as a DataFrame
    if isinstance(data, pd.DataFrame):
        taxdf = data
    elif not isinstance(data, str):
        raise ValueError('data is neither a string nor a Pandas DataFrame')
    elif os.path.isfile(data):
        taxdf = pd.read_csv(data)
    else:
        # cannot call read_egg_ function in unit tests
        taxdf = read_egg_csv(data)  # pragma: no cover
    self.__dim = len(taxdf.index)
    self.__index = taxdf.index
    # turn each usable column into an array attribute of the right dtype
    read_vars = set()
    self.IGNORED_VARS = set()
    for colname in list(taxdf.columns.values):
        if colname not in Records.USABLE_READ_VARS:
            self.IGNORED_VARS.add(colname)
            continue
        read_vars.add(colname)
        if colname in Records.INTEGER_READ_VARS:
            col_dtype = np.int32
        else:
            col_dtype = np.float64
        setattr(self, colname, taxdf[colname].astype(col_dtype).values)
    # every one of the MUST_READ_VARS has to be present in the data
    if not Records.MUST_READ_VARS.issubset(read_vars):
        raise ValueError('Records data missing one or more MUST_READ_VARS')
    # the DataFrame is no longer needed under this name
    del taxdf
    # all calculated variables and all unread usable variables are zeros
    unread_vars = Records.USABLE_READ_VARS - read_vars
    zeroed_vars = Records.CALCULATED_VARS | unread_vars
    for colname in zeroed_vars:
        if colname in Records.INTEGER_VARS:
            col_dtype = np.int32
        else:
            col_dtype = np.float64
        setattr(self, colname,
                np.zeros(self.array_length, dtype=col_dtype))
    # MARS values must all lie in the [1,5] range
    mars_valid = np.logical_and(np.greater_equal(self.MARS, 1),
                                np.less_equal(self.MARS, 5))
    if not np.all(mars_valid):
        raise ValueError('not all MARS values in [1,5] range')
    # num and sep are derived from MARS, which is in MUST_READ_VARS
    self.num[:] = np.where(self.MARS == 2, 2, 1)
    self.sep[:] = np.where(self.MARS == 3, 2, 1)
    # EIC values must all lie in the [0,3] range
    eic_valid = np.logical_and(np.greater_equal(self.EIC, 0),
                               np.less_equal(self.EIC, 3))
    if not np.all(eic_valid):
        raise ValueError('not all EIC values in [0,3] range')
    # exact is one everywhere when exact_calcs is True and zero otherwise
    self.exact[:] = np.where(exact_calcs is True, 1, 0)
    # drop the intermediate sets
    del read_vars, unread_vars, zeroed_vars
def calculate(year_n, start_year,
              use_puf_not_cps,
              use_full_sample,
              user_mods,
              behavior_allowed):
    """
    Build pre-reform and post-reform Calculator objects and calculate them.

    The calculate function assumes the specified user_mods is a dictionary
    returned by the Calculator.read_json_param_objects() function.  This
    function reads its 'consumption', 'growdiff_baseline',
    'growdiff_response', 'behavior' and 'policy' entries.

    Parameters
    ----------
    year_n: number of times both Calculator objects are incremented by
        one year after they have reached start_year
    start_year: year both Calculator objects are advanced to before the
        year_n increments are applied
    use_puf_not_cps: if true, input is read from a puf file; otherwise
        input is the cps.csv.gz data
    use_full_sample: if true, all input records are used; otherwise a
        random sample is drawn (fraction 0.05 for puf input and 0.03 for
        cps input, with random_state 180 in both cases)
    user_mods: dictionary described above
    behavior_allowed: set to False when generating static results or
        set to True when generating dynamic results

    Returns
    -------
    (calc1, calc2) where calc1 is pre-reform Calculator object calculated
    for year_n, and calc2 is post-reform Calculator object calculated for
    year_n.

    Raises
    ------
    ValueError: if both the behavior and the growdiff_response assumptions
        have a response, or if the behavior assumptions have a response
        while behavior_allowed is false.

    Side effect: prints the time taken to read and sample the input data.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    # pylint: disable=too-many-branches,too-many-statements

    check_user_mods(user_mods)

    # specify Consumption instance
    consump = Consumption()
    consump_assumptions = user_mods['consumption']
    consump.update_consumption(consump_assumptions)

    # specify growdiff_baseline and growdiff_response
    growdiff_baseline = GrowDiff()
    growdiff_response = GrowDiff()
    growdiff_base_assumps = user_mods['growdiff_baseline']
    growdiff_resp_assumps = user_mods['growdiff_response']
    growdiff_baseline.update_growdiff(growdiff_base_assumps)
    growdiff_response.update_growdiff(growdiff_resp_assumps)

    # create pre-reform and post-reform GrowFactors instances
    # (the pre-reform factors get only the baseline growdiff, while the
    # post-reform factors get the baseline and then the response growdiff)
    growfactors_pre = GrowFactors()
    growdiff_baseline.apply_to(growfactors_pre)
    growfactors_post = GrowFactors()
    growdiff_baseline.apply_to(growfactors_post)
    growdiff_response.apply_to(growfactors_post)

    # create sample pd.DataFrame from specified input file and sampling scheme
    stime = time.time()
    tbi_path = os.path.abspath(os.path.dirname(__file__))
    if use_puf_not_cps:
        # first try TaxBrain deployment path
        input_path = 'puf.csv.gz'
        if not os.path.isfile(input_path):
            # otherwise try local Tax-Calculator deployment path
            input_path = os.path.join(tbi_path, '..', '..', 'puf.csv')
        sampling_frac = 0.05
        sampling_seed = 180
    else:  # if using cps input not puf input
        # first try Tax-Calculator code path
        input_path = os.path.join(tbi_path, '..', 'cps.csv.gz')
        if not os.path.isfile(input_path):
            # otherwise read from taxcalc package "egg"
            # (input_path is set to None so the read_csv below is skipped)
            input_path = None  # pragma: no cover
            full_sample = read_egg_csv('cps.csv.gz')  # pragma: no cover
        sampling_frac = 0.03
        sampling_seed = 180
    if input_path:
        full_sample = pd.read_csv(input_path)
    if use_full_sample:
        sample = full_sample
    else:
        # fixed random_state makes the drawn sample reproducible
        sample = full_sample.sample(  # pylint: disable=no-member
            frac=sampling_frac, random_state=sampling_seed)
    if use_puf_not_cps:
        print('puf-read-time= {:.1f}'.format(time.time() - stime))
    else:
        print('cps-read-time= {:.1f}'.format(time.time() - stime))

    # create pre-reform Calculator instance
    if use_puf_not_cps:
        recs1 = Records(data=sample,
                        gfactors=growfactors_pre)
    else:
        recs1 = Records.cps_constructor(data=sample,
                                        gfactors=growfactors_pre)
    policy1 = Policy(gfactors=growfactors_pre)
    calc1 = Calculator(policy=policy1, records=recs1, consumption=consump)
    # advance calc1 to start_year and calculate it for that year
    while calc1.current_year < start_year:
        calc1.increment_year()
    calc1.calc_all()
    assert calc1.current_year == start_year

    # specify Behavior instance
    behv = Behavior()
    behavior_assumps = user_mods['behavior']
    behv.update_behavior(behavior_assumps)

    # always prevent both behavioral response and growdiff response
    if behv.has_any_response() and growdiff_response.has_any_response():
        msg = 'BOTH behavior AND growdiff_response HAVE RESPONSE'
        raise ValueError(msg)

    # optionally prevent behavioral response
    if behv.has_any_response() and not behavior_allowed:
        msg = 'A behavior RESPONSE IS NOT ALLOWED'
        raise ValueError(msg)

    # create post-reform Calculator instance
    if use_puf_not_cps:
        recs2 = Records(data=sample,
                        gfactors=growfactors_post)
    else:
        recs2 = Records.cps_constructor(data=sample,
                                        gfactors=growfactors_post)
    policy2 = Policy(gfactors=growfactors_post)
    policy_reform = user_mods['policy']
    policy2.implement_reform(policy_reform)
    calc2 = Calculator(policy=policy2, records=recs2,
                       consumption=consump, behavior=behv)
    # advance calc2 to start_year; unlike calc1 it is not calculated here
    while calc2.current_year < start_year:
        calc2.increment_year()
    assert calc2.current_year == start_year

    # delete objects now embedded in calc1 and calc2
    del sample
    del full_sample
    del consump
    del growdiff_baseline
    del growdiff_response
    del growfactors_pre
    del growfactors_post
    del behv
    del recs1
    del recs2
    del policy1
    del policy2

    # increment Calculator objects for year_n years and calculate
    for _ in range(0, year_n):
        calc1.increment_year()
        calc2.increment_year()
    calc1.calc_all()
    # with a behavioral response, calc2 is replaced by the Calculator that
    # Behavior.response returns; otherwise calc2 is calculated directly
    if calc2.behavior_has_response():
        calc2 = Behavior.response(calc1, calc2)
    else:
        calc2.calc_all()

    # return calculated Calculator objects
    return (calc1, calc2)
def calculate(year_n, start_year,
              use_puf_not_cps,
              use_full_sample,
              user_mods,
              behavior_allowed):
    """
    Build pre-reform and post-reform Calculator objects, calculate them,
    and compute a mask array.

    The calculate function assumes the specified user_mods is a dictionary
    returned by the Calculator.read_json_param_objects() function.  This
    function reads its 'consumption', 'growdiff_baseline',
    'growdiff_response', 'behavior' and 'policy' entries.

    Parameters
    ----------
    year_n: number of times both Calculator objects are incremented by
        one year after they have reached start_year
    start_year: year both Calculator objects are advanced to before the
        year_n increments are applied
    use_puf_not_cps: if true, input is read from a puf file; otherwise
        input is the cps.csv.gz data
    use_full_sample: if true, all input records are used; otherwise a
        random sample is drawn (fraction 0.05 for puf input and 0.03 for
        cps input, with random_state 180 in both cases)
    user_mods: dictionary described above
    behavior_allowed: set to False when generating static results or
        set to True when generating dynamic results

    Returns
    -------
    (calc1, calc2, mask) where calc1 is pre-reform Calculator object
    calculated for year_n, calc2 is post-reform Calculator object
    calculated for year_n, and mask is an array with one element per
    record.  For puf input, mask is true for each filing unit whose
    pre-reform iitax in start_year changes when one dollar is added to
    its e00200 and e00200p amounts; for cps input, mask is all zeros.

    Raises
    ------
    ValueError: if both the behavior and the growdiff_response assumptions
        have a response, or if the behavior assumptions have a response
        while behavior_allowed is false.

    Side effect: prints the time taken to read and sample the input data.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    # pylint: disable=too-many-branches,too-many-statements

    check_user_mods(user_mods)

    # specify Consumption instance
    consump = Consumption()
    consump_assumptions = user_mods['consumption']
    consump.update_consumption(consump_assumptions)

    # specify growdiff_baseline and growdiff_response
    growdiff_baseline = Growdiff()
    growdiff_response = Growdiff()
    growdiff_base_assumps = user_mods['growdiff_baseline']
    growdiff_resp_assumps = user_mods['growdiff_response']
    growdiff_baseline.update_growdiff(growdiff_base_assumps)
    growdiff_response.update_growdiff(growdiff_resp_assumps)

    # create pre-reform and post-reform Growfactors instances
    # (the pre-reform factors get only the baseline growdiff, while the
    # post-reform factors get the baseline and then the response growdiff)
    growfactors_pre = Growfactors()
    growdiff_baseline.apply_to(growfactors_pre)
    growfactors_post = Growfactors()
    growdiff_baseline.apply_to(growfactors_post)
    growdiff_response.apply_to(growfactors_post)

    # create sample pd.DataFrame from specified input file and sampling scheme
    stime = time.time()
    tbi_path = os.path.abspath(os.path.dirname(__file__))
    if use_puf_not_cps:
        # first try TaxBrain deployment path
        input_path = 'puf.csv.gz'
        if not os.path.isfile(input_path):
            # otherwise try local Tax-Calculator deployment path
            input_path = os.path.join(tbi_path, '..', '..', 'puf.csv')
        sampling_frac = 0.05
        sampling_seed = 180
    else:  # if using cps input not puf input
        # first try Tax-Calculator code path
        input_path = os.path.join(tbi_path, '..', 'cps.csv.gz')
        if not os.path.isfile(input_path):
            # otherwise read from taxcalc package "egg"
            # (input_path is set to None so the read_csv below is skipped)
            input_path = None  # pragma: no cover
            full_sample = read_egg_csv('cps.csv.gz')  # pragma: no cover
        sampling_frac = 0.03
        sampling_seed = 180
    if input_path:
        full_sample = pd.read_csv(input_path)
    if use_full_sample:
        sample = full_sample
    else:
        # fixed random_state makes the drawn sample reproducible
        sample = full_sample.sample(  # pylint: disable=no-member
            frac=sampling_frac, random_state=sampling_seed)
    if use_puf_not_cps:
        print('puf-read-time= {:.1f}'.format(time.time() - stime))
    else:
        print('cps-read-time= {:.1f}'.format(time.time() - stime))

    # create pre-reform Calculator instance
    # (each Records object below is given its own deep copy of sample)
    if use_puf_not_cps:
        recs1 = Records(data=copy.deepcopy(sample),
                        gfactors=growfactors_pre)
    else:
        recs1 = Records.cps_constructor(data=copy.deepcopy(sample),
                                        gfactors=growfactors_pre)
    policy1 = Policy(gfactors=growfactors_pre)
    calc1 = Calculator(policy=policy1, records=recs1, consumption=consump)
    # advance calc1 to start_year and calculate it for that year
    while calc1.current_year < start_year:
        calc1.increment_year()
    calc1.calc_all()
    assert calc1.current_year == start_year

    # compute mask array
    res1 = calc1.dataframe(DIST_VARIABLES)
    if use_puf_not_cps:
        # create pre-reform Calculator instance with extra income
        recs1p = Records(data=copy.deepcopy(sample),
                         gfactors=growfactors_pre)
        # add one dollar to the income of each filing unit to determine
        # which filing units undergo a resulting change in tax liability
        recs1p.e00200 += 1.0  # pylint: disable=no-member
        recs1p.e00200p += 1.0  # pylint: disable=no-member
        policy1p = Policy(gfactors=growfactors_pre)
        # create Calculator with recs1p and calculate for start_year
        calc1p = Calculator(policy=policy1p, records=recs1p,
                            consumption=consump)
        while calc1p.current_year < start_year:
            calc1p.increment_year()
        calc1p.calc_all()
        assert calc1p.current_year == start_year
        # compute mask showing which of the calc1 and calc1p results differ;
        # mask is true if a filing unit's income tax liability changed after
        # a dollar was added to the filing unit's wage and salary income
        # (amounts within an absolute tolerance of 0.001 count as unchanged)
        res1p = calc1p.dataframe(DIST_VARIABLES)
        mask = np.logical_not(  # pylint: disable=no-member
            np.isclose(res1.iitax, res1p.iitax, atol=0.001, rtol=0.0))
        assert np.any(mask)
    else:  # if use_puf_not_cps is False
        # indicate that no fuzzing of reform results is required
        # NOTE(review): this mask has dtype int8 while the puf branch above
        # produces a boolean array; confirm that callers never use the
        # mask to index an array, where the two dtypes behave differently
        mask = np.zeros(res1.shape[0], dtype=np.int8)

    # specify Behavior instance
    behv = Behavior()
    behavior_assumps = user_mods['behavior']
    behv.update_behavior(behavior_assumps)

    # always prevent both behavioral response and growdiff response
    if behv.has_any_response() and growdiff_response.has_any_response():
        msg = 'BOTH behavior AND growdiff_response HAVE RESPONSE'
        raise ValueError(msg)

    # optionally prevent behavioral response
    if behv.has_any_response() and not behavior_allowed:
        msg = 'A behavior RESPONSE IS NOT ALLOWED'
        raise ValueError(msg)

    # create post-reform Calculator instance
    if use_puf_not_cps:
        recs2 = Records(data=copy.deepcopy(sample),
                        gfactors=growfactors_post)
    else:
        recs2 = Records.cps_constructor(data=copy.deepcopy(sample),
                                        gfactors=growfactors_post)
    policy2 = Policy(gfactors=growfactors_post)
    policy_reform = user_mods['policy']
    policy2.implement_reform(policy_reform)
    calc2 = Calculator(policy=policy2, records=recs2,
                       consumption=consump, behavior=behv)
    # advance calc2 to start_year and calculate it for that year
    while calc2.current_year < start_year:
        calc2.increment_year()
    calc2.calc_all()
    assert calc2.current_year == start_year

    # increment Calculator objects for year_n years and calculate
    for _ in range(0, year_n):
        calc1.increment_year()
        calc2.increment_year()
    calc1.calc_all()
    # with a behavioral response, calc2 is replaced by the Calculator that
    # Behavior.response returns; otherwise calc2 is calculated directly
    if calc2.behavior_has_response():
        calc2 = Behavior.response(calc1, calc2)
    else:
        calc2.calc_all()

    # return calculated Calculator objects and mask
    return (calc1, calc2, mask)