def main():
    """Load wage data onto a NAICS tree, run both population passes, print it.

    Reads the module-level names ``naics_codes_file``, ``data_folder`` and
    ``output_folder``. Nothing is returned.
    """
    # Build the industry tree from the NAICS codes file.
    wage_tree = naics.load_naics(naics_codes_file)
    # Hand the tree to the wage loader; its return value is not used.
    read_wages.load_nipa_wages_ind(data_folder, wage_tree)
    # The one data set that both population passes operate on.
    wage_params = [read_wages.WAGES]
    # Backward pass first, then the forward pass with its extra arguments.
    naics.pop_back(wage_tree, wage_params)
    naics.pop_forward(wage_tree, wage_params, None, None, None, True)
    # Pass the finished tree to print_tree_dfs together with the output folder.
    naics.print_tree_dfs(wage_tree, output_folder)
def calc_depr_rates(asset_tree, inv_tree, land_tree, data_folder):
    """Build a NAICS tree of asset-weighted economic depreciation rates.

    For every industry and every ownership type ("All", "Corp", "Non-Corp")
    the rate is the asset-value-weighted average of the per-asset rates read
    from "Economic Depreciation Rates.csv". The result is then multiplied by
    the share of fixed assets in (fixed assets + inventories + land), because
    inventories and land are treated as having a zero depreciation rate.

    Args:
        asset_tree: tree whose industries carry one single-row DataFrame per
            ownership type, with one column per asset.
        inv_tree: tree whose industries carry an "Inventories" DataFrame.
        land_tree: tree whose industries carry a "Land" DataFrame.
        data_folder: folder holding the "Depreciation Rates" directory and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree in which every industry has an "Economic" 1x3 DataFrame
        with the columns "All", "Corp" and "Non-Corp".

    The four trees are indexed in parallel through ``enum_inds[j]``; this
    assumes they enumerate industries in the same order -- TODO confirm.
    """
    # The directory with depreciation rates data:
    depr_folder = os.path.abspath(
        os.path.join(data_folder, "Depreciation Rates"))
    # Opening file containing depreciation rates by asset type:
    depr_econ = pd.read_csv(os.path.abspath(
        os.path.join(depr_folder, "Economic Depreciation Rates.csv")))
    depr_econ = depr_econ.fillna(1)
    econ_assets = depr_econ["Asset"]
    econ_rates = depr_econ["Economic Depreciation Rate"]
    # Map each stripped asset name to its row; a later duplicate overwrites an
    # earlier one, which is what the original nested search did as well.
    econ_lookup = {}
    for k in range(0, len(econ_assets)):
        econ_lookup[str(econ_assets[k]).strip()] = k
    #
    types = ["All", "Corp", "Non-Corp"]
    # Initialize tree for depreciation rates:
    depr_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for ind in depr_tree.enum_inds:
        ind.data.append(("Economic",
                         pd.DataFrame(np.zeros((1, 3)), columns=types)))
    num_inds = len(depr_tree.enum_inds)
    #
    for i in types:
        asset_list = asset_tree.enum_inds[0].data.dfs[i].columns
        asset_list = asset_list.values.tolist()
        # Row of the rates table for each asset column, or -1 if unmatched.
        match = [econ_lookup.get(str(name).strip(), -1)
                 for name in asset_list]
        for j in range(0, num_inds):
            asset_df = asset_tree.enum_inds[j].data.dfs[i]
            depr_df = depr_tree.enum_inds[j].data.dfs["Economic"]
            cur_sum = 0
            for k in range(0, len(asset_list)):
                if match[k] == -1:
                    # Report the column index of an asset without a rate.
                    print(k)
                    continue
                cur_sum += asset_df.iloc[0, k] * econ_rates[match[k]]
            asset_total = sum(asset_df.iloc[0, :])
            # Single-step .loc write instead of chained df[col][0] = value,
            # which pandas does not guarantee to reach the parent frame.
            if asset_total != 0:
                depr_df.loc[0, i] = cur_sum / asset_total
            else:
                depr_df.loc[0, i] = 0
        # Inventories and land have an approximately zero depreciation rate:
        for j in range(0, num_inds):
            # NOTE(review): the "All" totals are used for every ownership
            # type here, whereas calc_tax_depr_rates uses the totals of the
            # current type -- confirm which one is intended.
            tot_assets = sum(asset_tree.enum_inds[j].data.dfs["All"].iloc[0, :])
            tot_inv = inv_tree.enum_inds[j].data.dfs["Inventories"]["All"][0]
            tot_land = land_tree.enum_inds[j].data.dfs["Land"]["All"][0]
            if tot_assets + tot_inv + tot_land == 0:
                continue
            ratio = tot_assets / (tot_assets + tot_inv + tot_land)
            #
            cur_df = depr_tree.enum_inds[j].data.dfs["Economic"]
            cur_df.loc[0, i] = ratio * cur_df.loc[0, i]
    return depr_tree
def get_incs():
    """Load income data onto a NAICS tree, run both passes, and return it.

    Reads the module-level names ``naics_codes_file``, ``data_folder`` and
    ``output_folder``.

    Returns:
        The tree created by ``naics.load_naics`` after the loaders and both
        population passes have been applied to it.
    """
    # Build the industry tree from the NAICS codes file.
    inc_tree = naics.load_naics(naics_codes_file)
    # Run the two NIPA loaders, then the business-income calculation; none
    # of their return values is used.
    read_inc.load_nipa_inc_ind(data_folder, inc_tree)
    read_inc.load_nipa_int_ind(data_folder, inc_tree)
    read_inc.calc_bus_inc(inc_tree)
    # The data sets that the two population passes operate on.
    inc_params = [read_inc.BUS_INC, read_inc.INT_INC, read_inc.FIN_INC]
    # Backward pass first, then the forward pass.
    naics.pop_back(inc_tree, inc_params)
    naics.pop_forward(inc_tree, inc_params)
    # Pass the finished tree to print_tree_dfs together with the output folder.
    naics.print_tree_dfs(inc_tree, output_folder)
    return inc_tree
def test_load_naics(path=None, messages=True):
    """Check that ``naics.load_naics`` reproduces every code in its input file.

    The codes stored in the "Codes:" DataFrames of the loaded tree are
    compared with the codes in the first column of the CSV file. Entries of
    the form "a-b" in the file contribute both ``a`` and ``b``.

    Args:
        path: path of the NAICS codes CSV. Defaults to
            "data/2012_NAICS_Codes.csv" under the current working directory.
        messages: when true, print a pass/fail line.

    Returns:
        None when every code of the file is found in the tree, otherwise the
        number of file codes that are missing from the tree (an int).
    """
    # Default path if none is specified:
    if path is None:
        path = os.path.abspath(
            os.path.join(os.getcwd(), "data", "2012_NAICS_Codes.csv"))
    # Using the function being tested to create a tree:
    cur_tree = naics.load_naics(path)
    # Replicating the codes in the input file:
    rep_codes = np.zeros(0)
    for ind in cur_tree.enum_inds:
        rep_codes = np.append(rep_codes, ind.data.dfs["Codes:"].iloc[:, 0])
    # np.unique already returns its result sorted.
    rep_codes = np.unique(rep_codes.astype(int))
    # The codes as they appear in the file itself:
    orig_data = pd.read_csv(path).iloc[:, 0]
    orig_codes = np.zeros(0)
    for entry in orig_data:
        orig_codes = np.append(orig_codes, str(entry).split("-"))
    orig_codes = np.unique(orig_codes.astype(int))
    # Both arrays hold unique values, so the size of their intersection is
    # the number of file codes that the tree reproduces.
    matches = len(np.intersect1d(rep_codes, orig_codes))
    mismatches = int(len(orig_codes) - matches)
    if mismatches == 0:
        if messages:
            print("\"load_naics\" passed test 1.")
        return None
    if messages:
        print("\"load_naics\" failed test 1. Mismatches:"
              + str(mismatches) + ".")
    return mismatches
def test_load_naics(path=None, messages=True):
    """Check that ``naics.load_naics`` reproduces every code in its input file.

    The codes stored in the "Codes:" DataFrames of the loaded tree are
    compared with the codes in the first column of the CSV file. Entries of
    the form "a-b" in the file contribute both ``a`` and ``b``.

    Args:
        path: path of the NAICS codes CSV. Defaults to
            "data/2012_NAICS_Codes.csv" under the current working directory.
        messages: when true, print a pass/fail line.

    Returns:
        None when every code of the file is found in the tree, otherwise the
        number of file codes that are missing from the tree (an int).
    """
    # Default path if none is specified:
    if path is None:
        path = os.path.abspath(
            os.path.join(os.getcwd(), "data", "2012_NAICS_Codes.csv"))
    # Using the function being tested to create a tree:
    cur_tree = naics.load_naics(path)
    # Replicating the codes in the input file:
    rep_codes = np.zeros(0)
    for ind in cur_tree.enum_inds:
        rep_codes = np.append(rep_codes, ind.data.dfs["Codes:"].iloc[:, 0])
    # np.unique already returns its result sorted.
    rep_codes = np.unique(rep_codes.astype(int))
    # The codes as they appear in the file itself:
    orig_data = pd.read_csv(path).iloc[:, 0]
    orig_codes = np.zeros(0)
    for entry in orig_data:
        orig_codes = np.append(orig_codes, str(entry).split("-"))
    orig_codes = np.unique(orig_codes.astype(int))
    # Both arrays hold unique values, so the size of their intersection is
    # the number of file codes that the tree reproduces.
    matches = len(np.intersect1d(rep_codes, orig_codes))
    mismatches = int(len(orig_codes) - matches)
    if mismatches == 0:
        if messages:
            print("\"load_naics\" passed test 1.")
        return None
    if messages:
        print("\"load_naics\" failed test 1. Mismatches:"
              + str(mismatches) + ".")
    return mismatches
def read_land(output_tree, data_folder):
    """Build a NAICS tree holding land values by ownership type.

    Every industry of the new tree receives a 1x3 "Land" DataFrame with the
    columns "All", "Corp" and "Non-Corp", initialised to zero. Only the first
    enumerated industry is then filled, using the first row of the
    "Corporate" and "Non-Corporate" columns of "Land/Fin_Accounts-Land.csv"
    multiplied by 10**9.

    Args:
        output_tree: tree whose first enumerated industry carries a "LAND"
            DataFrame with one column per sector. It is only used to detect
            the case in which all of those land entries sum to zero.
        data_folder: folder holding the "Land" directory and
            "2012_NAICS_Codes.csv".

    Returns:
        The new land tree. When the corporate and non-corporate "LAND"
        entries of ``output_tree`` sum to zero, the tree is returned with
        all values still zero.
    """
    land_folder = os.path.abspath(os.path.join(data_folder, "Land"))
    land_file = os.path.abspath(
        os.path.join(land_folder, "Fin_Accounts-Land.csv"))
    land_data = pd.read_csv(land_file)
    # Data is in billions:
    land_data = (10 ** 9) * land_data
    corp_types = [
        "C Corporations",
        "Corporate general partners",
        "Corporate limited partners",
    ]
    non_corp_types = [
        "S Corporations",
        "Individual general partners",
        "Individual limited partners",
        "Partnership general partners",
        "Partnership limited partners",
        "Tax-exempt organization general partners",
        "Tax-exempt organization limited partners",
        "Nominee and other general partners",
        "Nominee and other limited partners",
        "Sole Proprietors",
    ]
    land_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    df_cols = ["All", "Corp", "Non-Corp"]
    for ind in land_tree.enum_inds:
        ind.data.append(("Land",
                         pd.DataFrame(np.zeros((1, len(df_cols))),
                                      columns=df_cols)))
    # Land reported for the first enumerated industry, split by tax status:
    first_land = output_tree.enum_inds[0].data.dfs["LAND"]
    corp_sum = 0.0
    non_corp_sum = 0.0
    for sector in corp_types:
        corp_sum += first_land[sector][0]
    for sector in non_corp_types:
        non_corp_sum += first_land[sector][0]
    # Nothing to allocate when no land is reported at all.
    if corp_sum + non_corp_sum == 0:
        return land_tree
    corp_land = land_data["Corporate"][0]
    non_corp_land = land_data["Non-Corporate"][0]
    land_df = land_tree.enum_inds[0].data.dfs["Land"]
    # Single-step .loc writes instead of chained df[col][0] = value, which
    # pandas does not guarantee to reach the parent frame.
    land_df.loc[0, "Corp"] = corp_land
    land_df.loc[0, "Non-Corp"] = non_corp_land
    land_df.loc[0, "All"] = corp_land + non_corp_land
    return land_tree
def summary_tree(data_tree, data_folder):
    """Summarise fixed assets, inventories and land by sector per industry.

    A new NAICS tree is created in which every industry carries three
    single-row DataFrames, "FA", "INV" and "LAND", each with one column per
    sector in ``all_sectors``. The values are taken from the DataFrames of
    the matching industry in ``data_tree``:

    * C and S corporations: copied from "c_corps" / "s_corps".
    * Partner types (the columns of "PA_types"): the "PA_assets" values
      scaled by the partner type's share of the "PA_types" row.
    * Sole proprietors: the "PA_assets" values scaled by the ratio of
      "soi_prop" depreciation deductions to "PA_inc_loss" depreciation,
      plus the "farm_prop" values.

    Args:
        data_tree: tree whose industries carry the DataFrames named above.
        data_folder: folder holding "2012_NAICS_Codes.csv".

    Returns:
        The new summary tree.

    ``data_tree`` and the new tree are indexed in parallel through
    ``enum_inds[i]``; this assumes both enumerate industries in the same
    order -- TODO confirm.
    """
    # Function-scope import: the original block did not reference ``os``.
    import os

    all_sectors = ["C Corporations",
                   "S Corporations",
                   "Corporate general partners",
                   "Corporate limited partners",
                   "Individual general partners",
                   "Individual limited partners",
                   "Partnership general partners",
                   "Partnership limited partners",
                   "Tax-exempt organization general partners",
                   "Tax-exempt organization limited partners",
                   "Nominee and other general partners",
                   "Nominee and other limited partners",
                   "Sole Proprietors"]
    # Source DataFrame for the two sectors whose values are copied directly.
    corp_df_names = {"C Corporations": "c_corps",
                     "S Corporations": "s_corps"}
    pa_types = data_tree.enum_inds[0].data.dfs["PA_types"].columns
    pa_types = pa_types.values.tolist()
    #
    output_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    #
    for ind in output_tree.enum_inds:
        for category in ("FA", "INV", "LAND"):
            ind.append_dfs((category,
                            pd.DataFrame(np.zeros((1, len(all_sectors))),
                                         columns=all_sectors)))
    #
    for i in range(0, len(output_tree.enum_inds)):
        cur_dfs = data_tree.enum_inds[i].data.dfs
        out_dfs = output_tree.enum_inds[i].data.dfs
        partner_sum = sum(cur_dfs["PA_types"].iloc[0, :])
        #
        for sector in all_sectors:
            if sector in corp_df_names:
                cur_df = cur_dfs[corp_df_names[sector]]
                fa_value = cur_df["Depreciable Assets"][0]
                inv_value = cur_df["Inventories"][0]
                land_value = cur_df["Land"][0]
            elif sector in pa_types:
                if partner_sum != 0:
                    ratio = abs(cur_dfs["PA_types"][sector][0]) / partner_sum
                else:
                    # NOTE(review): shape[0] is the number of rows of
                    # "PA_types", not the number of partner types; an equal
                    # split would use shape[1]. Left unchanged -- confirm.
                    ratio = abs(1.0 / cur_dfs["PA_types"].shape[0])
                cur_df = cur_dfs["PA_assets"]
                fa_value = abs(ratio * cur_df["Depreciable assets (Net)"][0])
                inv_value = abs(ratio * cur_df["Inventories (Net)"][0])
                land_value = abs(ratio * cur_df["Land (Net)"][0])
            elif sector == "Sole Proprietors":
                if cur_dfs["PA_inc_loss"]["Depreciation"][0] != 0:
                    ratio = abs(cur_dfs["soi_prop"]["Depr Deductions"][0] /
                                cur_dfs["PA_inc_loss"]["Depreciation"][0])
                else:
                    ratio = 0.0
                cur_df = cur_dfs["PA_assets"]
                fa_value = abs(
                    (ratio * cur_df["Depreciable assets (Net)"][0]) +
                    cur_dfs["farm_prop"]["FA"][0])
                # NOTE(review): the farm "Land" value is added to INV, and
                # LAND receives no farm component. Left unchanged -- confirm
                # whether it was meant to be added to LAND.
                inv_value = abs(
                    (ratio * cur_df["Inventories (Net)"][0]) +
                    cur_dfs["farm_prop"]["Land"][0])
                land_value = abs(ratio * cur_df["Land (Net)"][0])
            else:
                # Sector has no data source; its zeros stay in place.
                continue
            # Single-step .loc writes instead of chained df[col][0] = value,
            # which pandas does not guarantee to reach the parent frame.
            out_dfs["FA"].loc[0, sector] = fa_value
            out_dfs["INV"].loc[0, sector] = inv_value
            out_dfs["LAND"].loc[0, sector] = land_value
    return output_tree
------------------------------------------------------------------------------- The main script of the program: --Loading the SOI Tax Stats-Corporation Data. --Loading the SOI Tax Stats-Partnership Data. --Loading tax data for Proprietorships. --Creating "output_tree" stating FA, INV, and LAND for various sectors. ------------------------------------------------------------------------------- ''' # Working directory: path = os.getcwd() # Relevant path and file names: data_folder = os.path.abspath(path + "\\data") output_folder = os.path.abspath(path + "\\OUTPUT") # Create a tree based off NAICS Codes: data_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv") # Reading in the SOI Tax Stats-Corporation Data: naics.load_soi_corporate_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Partnership Data: naics.load_soi_partner_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Proprietorship Data: naics.load_soi_proprietor_data(data_tree, data_folder) ''' Many industries are not listed in the SOI datasets. The data for these missing industries are interpolated. ''' # Get a list of the names of all the pd dfs besides the list of codes: cur_names = data_tree.enum_inds[0].data.dfs.keys() cur_names.remove("Codes:") # Populate missing industry data backwards throught the tree:
def calc_tax_depr_rates(asset_tree, inv_tree, land_tree, data_folder):
    """Build a NAICS tree of asset-weighted tax depreciation rates.

    Per-asset rates are derived from the recovery periods in
    "Depreciation Rates/BEA_IRS_Crosswalk.csv" (columns "GDS" and "ADS") for
    the methods "GDS 200%", "GDS 150%", "GDS SL" and "ADS SL". For each
    method a value ``tax_z`` is computed from the recovery period, the
    method's factor and the constant ``r``, and the rate is
    ``r / (1 / tax_z - 1)``. For every industry and ownership type
    ("All", "Corp", "Non-Corp") the asset-value-weighted average rate is then
    scaled by the share of fixed assets in
    (fixed assets + inventories + land).

    A "Recommended" rate is computed the same way, but for each asset it uses
    the method named in the crosswalk's "Method" column.

    Args:
        asset_tree: tree whose industries carry one single-row DataFrame per
            ownership type, with one column per asset.
        inv_tree: tree whose industries carry an "Inventories" DataFrame.
        land_tree: tree whose industries carry a "Land" DataFrame.
        data_folder: folder holding the "Depreciation Rates" directory and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree in which every industry has one 1x3 DataFrame per method
        plus "Recommended". Industries without any fixed assets keep a rate
        of zero.

    The trees are indexed in parallel through ``enum_inds[j]``; this assumes
    they enumerate industries in the same order -- TODO confirm.
    """
    # The directory with depreciation rates data:
    depr_folder = os.path.abspath(
        os.path.join(data_folder, "Depreciation Rates"))
    #
    tax_file = os.path.abspath(
        os.path.join(depr_folder, "BEA_IRS_Crosswalk.csv"))
    tax_data = pd.read_csv(tax_file).fillna(0)
    # Asset names with non-breaking spaces replaced and whitespace stripped:
    tax_assets = tax_data["Asset Type"].map(
        lambda name: str(name).replace("\xa0", " ").strip())
    # Constant used in the tax_z formula below (presumably a discount rate
    # -- TODO confirm).
    r = 0.05
    #
    tax_gds_mthds = {"GDS 200%": 2.0, "GDS 150%": 1.5, "GDS SL": 1.0}
    tax_ads_mthds = {"ADS SL": 1.0}
    # list() keeps the concatenation valid where keys() returns a view.
    tax_cols = list(tax_gds_mthds.keys()) + list(tax_ads_mthds.keys())
    tax_systems = {"GDS": tax_gds_mthds, "ADS": tax_ads_mthds}
    tax_rates = pd.DataFrame(np.zeros((len(tax_assets), len(tax_cols))),
                             columns=tax_cols)
    tax_rates["Asset"] = tax_assets
    # Compute the tax rates:
    for system in tax_systems:
        tax_yrs = tax_data[system]
        for method in tax_systems[system]:
            tax_b = tax_systems[system][method]
            tax_beta = tax_b / tax_yrs
            tax_star = tax_yrs * (1 - (1 / tax_b))
            tax_z = (
                ((tax_beta / (tax_beta + r)) *
                 (1 - np.exp(-1 * (tax_beta + r) * tax_star))) +
                ((np.exp(-1 * tax_beta * tax_star) /
                  ((tax_yrs - tax_star) * r)) *
                 (np.exp(-1 * r * tax_star) - np.exp(-1 * r * tax_yrs)))
            )
            tax_rates[method] = r / ((1 / tax_z) - 1)
    tax_rates = tax_rates.fillna(0)
    # Map each asset name to its row; a later duplicate overwrites an earlier
    # one, which is what the original nested search did as well.
    tax_lookup = {}
    for k in range(0, len(tax_assets)):
        tax_lookup[str(tax_assets[k]).strip()] = k
    #
    types = ["All", "Corp", "Non-Corp"]
    # Initialize tree for depreciation rates:
    depr_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for ind in depr_tree.enum_inds:
        for system in tax_systems:
            for method in tax_systems[system]:
                ind.data.append((method,
                                 pd.DataFrame(np.zeros((1, 3)),
                                              columns=types)))
    for ind in depr_tree.enum_inds:
        ind.data.append(("Recommended",
                         pd.DataFrame(np.zeros((1, 3)), columns=types)))
    #
    for i in types:
        asset_list = asset_tree.enum_inds[0].data.dfs[i].columns
        asset_list = asset_list.values.tolist()
        # Row of the crosswalk for each asset column, or -1 if unmatched.
        match = [tax_lookup.get(str(name).strip(), -1)
                 for name in asset_list]
        for j in range(0, len(depr_tree.enum_inds)):
            cur_ind = depr_tree.enum_inds[j]
            asset_df = asset_tree.enum_inds[j].data.dfs[i]
            #
            tot_assets = sum(asset_df.iloc[0, :])
            tot_inv = inv_tree.enum_inds[j].data.dfs["Inventories"][i][0]
            tot_land = land_tree.enum_inds[j].data.dfs["Land"][i][0]
            if tot_assets + tot_inv + tot_land == 0:
                continue
            # Without fixed assets the weighted average is 0/0; leave the
            # rates at zero instead of storing NaN.
            if tot_assets == 0:
                continue
            ratio = tot_assets / (tot_assets + tot_inv + tot_land)
            #
            for method in tax_cols:
                cur_sum = 0.0
                for col in range(0, len(asset_list)):
                    if match[col] == -1:
                        continue
                    cur_sum += (asset_df.iloc[0, col] *
                                tax_rates[method][match[col]])
                # Single-step .loc write instead of assigning through an
                # intermediate column Series.
                cur_ind.data.dfs[method].loc[0, i] = (
                    ratio * (cur_sum / tot_assets))
            #
            cur_sum = 0
            for col in range(0, len(asset_list)):
                if match[col] == -1:
                    continue
                # Rate of the method that the crosswalk names for this asset.
                cur_rate = tax_rates[tax_data["Method"][match[col]]][match[col]]
                cur_sum += asset_df.iloc[0, col] * cur_rate
            cur_ind.data.dfs["Recommended"].loc[0, i] = (
                ratio * (cur_sum / tot_assets))
    return depr_tree
def read_bea(output_tree, data_folder):
    """Read BEA fixed-asset data and spread it over a NAICS asset tree.

    Steps, in order:

    1. Open "BEA/detailnonres_stk1.xlsx" and locate the industry table on
       its "readme" sheet (or the first sheet when no sheet has that name).
    2. Build ``bea_chart``, which pairs each worksheet name (BEA code) with
       the industry title from the readme sheet and the NAICS code from
       "BEA/detailnonres_naics.csv".
    3. Build ``bea_table`` with one row per asset of
       "BEA/detailnonres_list.csv" and one column per BEA code, filled from
       the last column of each industry worksheet and multiplied by
       1,000,000.
    4. Create a new NAICS tree with "All", "Corp" and "Non-Corp" DataFrames
       (one column per asset) and fill it from ``bea_table`` using the "FA"
       data of ``output_tree`` and ``naics.get_proportions``.

    Args:
        output_tree: tree whose industries carry an "FA" DataFrame with one
            column per sector.
        data_folder: folder holding the "BEA" directory and
            "2012_NAICS_Codes.csv".

    Returns:
        The new asset tree, or None when the industry table cannot be
        located on the readme sheet.
    """
    # The directory with BEA data:
    bea_folder = os.path.abspath(os.path.join(data_folder, "BEA"))
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(os.path.abspath(
        os.path.join(bea_folder, "detailnonres_stk1.xlsx")))
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # Test for failure BEFORE shifting the column: once the column has
        # been decremented a failed search no longer equals [-1, -1].
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # Step one column to the left of the "bea code" match.
        sht_pos[1] = sht_pos[1] - 1
    first_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = [str(sht_names[index]) for index in range(1, num_shts)]
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.abspath(
        os.path.join(bea_folder, "detailnonres_list.csv"))
    asset_list = pd.read_csv(list_file)
    for i in range(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = (
            asset_list.iloc[i, 0].replace("\xa0", " ").strip())
    # Reading in the corresponding naics codes:
    naics_file = os.path.abspath(
        os.path.join(bea_folder, "detailnonres_naics.csv"))
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"].map(
        lambda name: name.replace("\xa0", " ").strip())
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk:
    naics_counter = 0
    for i in range(0, num_shts - 2):
        sheet_code = str(bea_codes1[i])
        for cur_row in range(first_row, bea_readme.nrows):
            bea_code = str(bea_readme.cell_value(cur_row, cur_col + 1))
            if sheet_code == bea_code:
                matched_code = bea_code
            # If they match except one has ".0" at the end:
            elif sheet_code == bea_code[:-2]:
                matched_code = bea_code[:-2]
            else:
                continue
            bea_ind = str(bea_readme.cell_value(cur_row, cur_col))
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            # Write through .loc on the chart itself; assigning through a
            # column Series is not guaranteed to update the parent frame.
            bea_chart.loc[i, "Industry"] = bea_ind
            bea_chart.loc[i, "BEA Code"] = matched_code
            # Resume the crosswalk search where the previous one stopped and
            # wrap around, giving up after one full cycle.
            for k in range(0, num_naics):
                naics_counter = (naics_counter + 1) % num_naics
                if naics_inds[naics_counter] == bea_ind:
                    bea_chart.loc[i, "NAICS Code"] = (
                        naics_cross["NAICS"][naics_counter])
                    break
            # Only the first readme row that matches this sheet is used.
            break
    # Initializing the table of assets:
    bea_table = pd.DataFrame(
        np.zeros((asset_list.shape[0], bea_chart.shape[0])),
        columns=bea_chart["BEA Code"])
    # For each industry, reading the asset values from its worksheet:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        asset_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in range(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k in range(asset_pos[0] + 2, cur_sht.nrows):
                cur_cell = str(cur_sht.cell_value(k, asset_pos[1] + 1))
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if cur_asset == cur_cell:
                    # Value taken from the last column of the worksheet.
                    bea_table.loc[j, i] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
    # convert_objects no longer exists in newer pandas releases. The table
    # is created from float zeros and only receives float values, so the
    # conversion is skipped where the method is unavailable.
    if hasattr(bea_table, "convert_objects"):
        bea_table = bea_table.convert_objects(convert_numeric=True)
    bea_table = bea_table.fillna(0)
    # The dollar amounts are in millions:
    bea_table = bea_table * 1000000
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Initialize tree for assets data:
    asset_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    num_inds = len(asset_tree.enum_inds)
    for i in range(0, num_inds):
        for df_name in ("All", "Corp", "Non-Corp"):
            asset_tree.enum_inds[i].data.append(
                (df_name,
                 pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                              columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table:
    enum_index = len(output_tree.enum_inds) - 1
    for i in range(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA", corp_types).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA", non_corp_types).iloc[1, :]
        for code_index in range(0, len(cur_codes)):
            # Cycle through the industries starting after the last one
            # visited, for at most one full pass.
            for j in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                out_dfs = output_tree.enum_inds[enum_index].data.dfs
                fa_total = sum(out_dfs["FA"].iloc[0, :])
                if fa_total == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in corp_types:
                    corp_ratio += out_dfs["FA"][category][0] / fa_total
                for category in non_corp_types:
                    non_corp_ratio += out_dfs["FA"][category][0] / fa_total
                cur_data = asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = asset_tree.enum_inds[0].data.dfs["All"].shape[1]
                for k in range(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] *
                        all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] *
                        corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] *
                        non_corp_ratio *
                        non_corp_proportions[code_index])
                # The industry for this code has been filled; next code.
                break
            if tot_share == 1:
                break
    return asset_tree
------------------------------------------------------------------------------- The main script of the program: --Loading the SOI Tax Stats-Corporation Data. --Loading the SOI Tax Stats-Partnership Data. --Loading tax data for Proprietorships. --Creating "output_tree" stating FA, INV, and LAND for various sectors. ------------------------------------------------------------------------------- ''' # Working directory: path = os.getcwd() # Relevant path and file names: data_folder = os.path.abspath(path + "\\data") output_folder = os.path.abspath(path + "\\OUTPUT") # Create a tree based off NAICS Codes: data_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv") # Reading in the SOI Tax Stats-Corporation Data: naics.load_soi_corporate_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Partnership Data: naics.load_soi_partner_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Proprietorship Data: naics.load_soi_proprietor_data(data_tree, data_folder) ''' Many industries are not listed in the SOI datasets. The data for these missing industries are interpolated. ''' # Get a list of the names of all the pd dfs besides the list of codes: cur_names = data_tree.enum_inds[0].data.dfs.keys() cur_names.remove("Codes:") # Populate missing industry data backwards throught the tree: naics.pop_back(data_tree, cur_names)
def read_inventories(output_tree, data_folder):
    """Build a NAICS tree holding inventories by ownership type.

    Inventory values are read from the last column of the first sheet of
    "Inventories/Inventories.xls", matched row by row against
    "Inventories/Inventories_Crosswalk.csv" (columns "List", "Industry" and
    "NAICS") and multiplied by 10**9. Each value is then spread over the
    industries returned by ``naics.get_proportions`` and split by sector in
    proportion to the "INV" data of ``output_tree``.

    Args:
        output_tree: tree whose industries carry an "INV" DataFrame with one
            column per sector.
        data_folder: folder holding the "Inventories" directory and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree in which every industry has a 1x3 "Inventories" DataFrame
        with the columns "All", "Corp" and "Non-Corp".
    """
    # The directory with inventory data:
    inv_folder = os.path.abspath(os.path.join(data_folder, "Inventories"))
    # Opening the excel file with the inventory data:
    inv_book = xlrd.open_workbook(os.path.abspath(
        os.path.join(inv_folder, "Inventories.xls")))
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    num_cols = sht0.ncols
    # Find the starting index in worksheet.
    cur_index = naics.search_ws(sht0, 1, 25, True, [0, 0], True)
    check_index = naics.search_ws(sht0, "line", 20)
    # The two searches are expected to land in the same column; a mismatch
    # is only reported, processing continues.
    if cur_index[1] != check_index[1]:
        print("ERROR")
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(os.path.abspath(
        os.path.join(inv_folder, "Inventories_Crosswalk.csv")))
    num_cross = inv_cross.shape[0]
    # Creating a tree for the inventory data:
    inv_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    #
    data_cols = ["All", "Corp", "Non-Corp"]
    for ind in inv_tree.enum_inds:
        ind.data.append(("Inventories",
                         pd.DataFrame(np.zeros((1, len(data_cols))),
                                      columns=data_cols)))
    # One inventory value per crosswalk row:
    inv_data = np.zeros(num_cross)
    # Walk the sheet and the crosswalk in step; the crosswalk only advances
    # when the current sheet row matches its next entry.
    cross_index = 0
    for i in range(cur_index[0], num_rows):
        if cross_index >= num_cross:
            break
        cur_list = str(sht0.cell_value(i, cur_index[1])).strip()
        cur_name = str(sht0.cell_value(i, cur_index[1] + 1)).strip()
        checks = ((cur_list == str(inv_cross["List"][cross_index])) and
                  (cur_name == str(inv_cross["Industry"][cross_index])))
        if not checks:
            continue
        cross_index += 1
        try:
            cur_value = float(sht0.cell_value(i, num_cols - 1))
        except ValueError:
            # Non-numeric cell: the entry keeps its initial value of zero.
            continue
        # Data is in billions:
        inv_data[cross_index - 1] = (10 ** 9) * cur_value
    #
    for i in range(0, num_cross):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, output_tree, "INV")
        for j in range(0, proportions.shape[1]):
            # Row 0 is used as an index into enum_inds, row 1 as the weight.
            ind_index = int(proportions.iloc[0, j])
            cur_ind = inv_tree.enum_inds[ind_index]
            prev_df = output_tree.enum_inds[ind_index].data.dfs["INV"]
            prev_total = sum(prev_df.iloc[0, :])
            if prev_total == 0:
                continue
            # Split this industry's share of the value across the sectors:
            cur_dfs = ((prev_df / prev_total) *
                       (inv_data[i] * proportions.iloc[1, j]))
            inv_df = cur_ind.data.dfs["Inventories"]
            inv_df["All"] += sum(cur_dfs.iloc[0, :])
            for k in corp_types:
                inv_df["Corp"] += cur_dfs[k][0]
            for k in non_corp_types:
                inv_df["Non-Corp"] += cur_dfs[k][0]
    #
    return inv_tree