def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """Load the SOI farm proprietorship data into a NAICS tree.

    Unless ``from_out`` is set, the farm input file at ``_FARM_IN_PATH`` is
    read, a ``_FARM_DF_NM`` dataframe with "Land" and "FA" columns is
    appended to the tree, its values are filled in for NAICS codes 111 and
    112, and the dataframe is then passed to ``naics.pop_back`` and
    ``naics.pop_forward``.

    :param data_tree: The NAICS tree to read the data into. Its industries
        111 and 112 must hold a ``_AST_PRT_DF_NM`` dataframe. If None, a
        new tree is created with ``naics.generate_tree()``.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. Passed
        unchanged to ``naics.pop_forward``.
    :param blueprint: The key corresponding to a dataframe in a tree to be
        used as a "blueprint" for populating the farm dataframe forward.
        If None and the first industry of ``data_tree`` holds a
        ``_TOT_CORP_DF_NM`` dataframe, that key is used.
    :param from_out: Whether to read in the data from output instead of
        recomputing it.
    :param out_path: The path of the output file read when ``from_out`` is
        set.
    :returns: The populated NAICS tree.
    """
    # The previous default, ``naics.generate_tree()`` in the signature, was
    # evaluated once at definition time, so every call without an explicit
    # tree shared (and mutated) the same object. Build a fresh one per call.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    data_tree.append_all(df_nm=_FARM_DF_NM, df_cols=new_farm_cols)
    # Multiplier applied to each industry's share of partnership land.
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    total = farm_data["R_p"][0] + farm_data["Q_p"][0]
    cur_codes = [111, 112]
    proportions = naics.get_proportions(cur_codes, data_tree,
                                        _AST_PRT_DF_NM,
                                        [_LAND_COL_NM, _DEPR_COL_NM])
    # Sum of land and depreciable assets over both farm industries:
    total_pa = 0
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs[_AST_PRT_DF_NM]
        total_pa += (cur_df[_LAND_COL_NM][0] + cur_df[_DEPR_COL_NM][0])
    # Fill in land and fixed assets for each farm industry:
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_ind.data.dfs[_FARM_DF_NM]["Land"][0] = (
            land_mult *
            cur_ind.data.dfs[_AST_PRT_DF_NM][_LAND_COL_NM][0] / total_pa)
        # "FA" is the industry's share of the total, net of its land.
        cur_ind.data.dfs[_FARM_DF_NM]["FA"][0] = (
            (proportions.iloc[1, i] * total) -
            cur_ind.data.dfs[_FARM_DF_NM]["Land"][0])
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def read_inventories(asset_tree):
    """Build a NAICS tree of inventories split by corporate tax status.

    The last column of the first worksheet of the workbook at
    ``_INV_IN_PATH`` is read for every row that matches, in order, an entry
    of the crosswalk at ``_INV_IN_CROSS_PATH``. Each value is scaled by
    ``_INV_IN_FILE_FCTR`` and distributed over the industries returned by
    ``naics.get_proportions`` in proportion to the "INV" dataframe of
    ``asset_tree``.

    :param asset_tree: A NAICS tree whose industries hold an "INV"
        dataframe with the columns named in ``_CORP_NMS`` and
        ``_NCORP_NMS``.
    :returns: A new NAICS tree whose industries hold an "Inventories"
        dataframe with "All", "Corp" and "Non-Corp" columns.
    """
    # Open the inventories workbook and take its first sheet:
    workbook = xlrd.open_workbook(_INV_IN_PATH)
    sheet = workbook.sheet_by_index(0)
    row_count = sheet.nrows
    value_col = sheet.ncols - 1
    # Locate the first data row and sanity-check its column against the
    # column of the "line" header:
    start_pos = naics.search_ws(sheet, 1, 25, True, [0, 0], True)
    header_pos = naics.search_ws(sheet, "line", 20)
    if not (start_pos[1] == header_pos[1]):
        print("ERROR")
    # Crosswalk between worksheet rows and NAICS codes:
    crosswalk = pd.read_csv(_INV_IN_CROSS_PATH)
    entry_count = crosswalk.shape[0]
    # Output tree with one zeroed "Inventories" dataframe per industry:
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories",
                        df_cols=["All", "Corp", "Non-Corp"])
    amounts = np.zeros(entry_count)
    # Walk the sheet once; the crosswalk is consumed in order.
    list_col = start_pos[1]
    matched = 0
    for row in xrange(start_pos[0], row_count):
        if matched >= entry_count:
            break
        list_cell = str(sheet.cell_value(row, list_col)).strip()
        name_cell = str(sheet.cell_value(row, list_col + 1)).strip()
        if str(list_cell) != str(crosswalk["List"][matched]):
            continue
        if str(name_cell) != str(crosswalk["Industry"][matched]):
            continue
        matched += 1
        try:
            raw_value = float(sheet.cell_value(row, value_col))
        except ValueError:
            # Non-numeric cell: the entry keeps its initial zero.
            continue
        # Scale from the file's units:
        amounts[matched - 1] = raw_value
        amounts[matched - 1] = _INV_IN_FILE_FCTR * amounts[matched - 1]
    # Spread each crosswalk amount over its industries:
    for entry in xrange(0, entry_count):
        codes = crosswalk["NAICS"][entry].strip().split(".")
        shares = naics.get_proportions(codes, asset_tree, "INV")
        for col in xrange(0, shares.shape[1]):
            target = inv_tree.enum_inds[int(shares.iloc[0, col])]
            source = asset_tree.enum_inds[int(shares.iloc[0, col])]
            source_df = source.data.dfs["INV"]
            if sum(source_df.iloc[0, :]) == 0:
                # Nothing to apportion by for this industry.
                continue
            scaled = ((source_df / sum(source_df.iloc[0, :])) *
                      (amounts[entry] * shares.iloc[1, col]))
            out_df = target.data.dfs["Inventories"]
            out_df["All"] += sum(scaled.iloc[0, :])
            for corp_nm in _CORP_NMS:
                out_df["Corp"] += scaled[corp_nm][0]
            for ncorp_nm in _NCORP_NMS:
                out_df["Non-Corp"] += scaled[ncorp_nm][0]
    # Propagate the filled values through the rest of the tree:
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
def read_inventories(asset_tree):
    """Read inventory amounts into a new NAICS tree.

    Rows of the first sheet of the ``_INV_IN_PATH`` workbook are matched,
    in order, against the "List" and "Industry" columns of the crosswalk at
    ``_INV_IN_CROSS_PATH``. The value in the sheet's last column is scaled
    by ``_INV_IN_FILE_FCTR`` and apportioned to industries using the "INV"
    dataframes of ``asset_tree``.

    :param asset_tree: NAICS tree providing the "INV" dataframes used as
        weights; their columns must include ``_CORP_NMS`` and
        ``_NCORP_NMS``.
    :returns: NAICS tree with an "Inventories" dataframe ("All", "Corp",
        "Non-Corp") on every industry.
    """
    inv_sheet = xlrd.open_workbook(_INV_IN_PATH).sheet_by_index(0)
    total_rows = inv_sheet.nrows
    total_cols = inv_sheet.ncols
    # First data cell, and the "line" header used to cross-check its column.
    first_cell = naics.search_ws(inv_sheet, 1, 25, True, [0, 0], True)
    line_cell = naics.search_ws(inv_sheet, "line", 20)
    if first_cell[1] != line_cell[1]:
        print("ERROR")
    cross_df = pd.read_csv(_INV_IN_CROSS_PATH)
    column_names = ["All", "Corp", "Non-Corp"]
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories", df_cols=column_names)
    scaled_values = np.zeros(cross_df.shape[0])

    # Pass 1: pull one value per crosswalk entry out of the worksheet.
    next_entry = 0
    for sheet_row in xrange(first_cell[0], total_rows):
        if next_entry >= cross_df.shape[0]:
            break
        row_list = str(inv_sheet.cell_value(sheet_row,
                                            first_cell[1])).strip()
        row_name = str(inv_sheet.cell_value(sheet_row,
                                            first_cell[1] + 1)).strip()
        is_match = (
            str(row_list) == str(cross_df["List"][next_entry]) and
            str(row_name) == str(cross_df["Industry"][next_entry]))
        if not is_match:
            continue
        this_entry = next_entry
        next_entry += 1
        try:
            cell_number = float(inv_sheet.cell_value(sheet_row,
                                                     total_cols - 1))
        except ValueError:
            # Leave the entry at zero when the cell is not numeric.
            continue
        scaled_values[this_entry] = cell_number
        scaled_values[this_entry] = (_INV_IN_FILE_FCTR *
                                     scaled_values[this_entry])

    # Pass 2: apportion each value over the industries it maps to.
    for entry_idx in xrange(0, cross_df.shape[0]):
        naics_codes = cross_df["NAICS"][entry_idx].strip().split(".")
        prop_df = naics.get_proportions(naics_codes, asset_tree, "INV")
        for prop_col in xrange(0, prop_df.shape[1]):
            dest_ind = inv_tree.enum_inds[int(prop_df.iloc[0, prop_col])]
            weight_ind = asset_tree.enum_inds[int(prop_df.iloc[0,
                                                               prop_col])]
            weight_df = weight_ind.data.dfs["INV"]
            if sum(weight_df.iloc[0, :]) != 0:
                amount = scaled_values[entry_idx] * prop_df.iloc[1, prop_col]
                split_df = (weight_df / sum(weight_df.iloc[0, :])) * amount
                dest_df = dest_ind.data.dfs["Inventories"]
                dest_df["All"] += sum(split_df.iloc[0, :])
                for col_nm in _CORP_NMS:
                    dest_df["Corp"] += split_df[col_nm][0]
                for col_nm in _NCORP_NMS:
                    dest_df["Non-Corp"] += split_df[col_nm][0]

    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """Load the SOI farm proprietorship data into a NAICS tree.

    Unless ``from_out`` is set, the farm input file at ``_FARM_IN_PATH`` is
    read, a zeroed ``_FARM_DF_NM`` dataframe with "Land" and "FA" columns
    is appended to every industry, its values are filled in for NAICS codes
    111 and 112, and the dataframe is then passed to ``naics.pop_back`` and
    ``naics.pop_forward``.

    :param data_tree: The NAICS tree to read the data into. Its industries
        111 and 112 must hold a "PA_assets" dataframe. If None, a new tree
        is created with ``naics.generate_tree()``.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. Passed
        unchanged to ``naics.pop_forward``.
    :param blueprint: The key corresponding to a dataframe in a tree to be
        used as a "blueprint" for populating the farm dataframe forward.
        If None and the first industry of ``data_tree`` holds a
        ``_TOT_CORP_DF_NM`` dataframe, that key is used.
    :param from_out: Whether to read in the data from output instead of
        recomputing it.
    :param out_path: The path of the output file read when ``from_out`` is
        set.
    :returns: The populated NAICS tree.
    """
    # The previous default, ``naics.generate_tree()`` in the signature, was
    # evaluated once at definition time, so every call without an explicit
    # tree shared (and mutated) the same object. Build a fresh one per call.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    # Give every industry its own zeroed one-row farm dataframe:
    for ind in data_tree.enum_inds:
        ind.append_dfs((_FARM_DF_NM,
                        pd.DataFrame(np.zeros((1, len(new_farm_cols))),
                                     columns=new_farm_cols)))
    # Multiplier applied to each industry's share of partnership land.
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    # NOTE(review): positional columns 0 and 2 of the first row; confirm
    # which named columns of the input file these are.
    total = farm_data.iloc[0, 0] + farm_data.iloc[0, 2]
    cur_codes = [111, 112]
    proportions = naics.get_proportions(
        cur_codes, data_tree, "PA_assets",
        ["Land (Net)", "Depreciable assets (Net)"])
    # Sum of land and depreciable assets over both farm industries:
    total_pa = 0
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] +
                     cur_df["Depreciable assets (Net)"][0])
    # Fill in land and fixed assets for each farm industry. ``.loc`` is
    # used instead of chained ``df[col][0] = ...`` so the write always
    # lands on the stored dataframe rather than on a temporary copy.
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        farm_df = cur_ind.data.dfs[_FARM_DF_NM]
        farm_df.loc[0, "Land"] = (
            land_mult * cur_ind.data.dfs["PA_assets"]["Land (Net)"][0] /
            total_pa)
        # "FA" is the industry's share of the total, net of its land.
        farm_df.loc[0, "FA"] = ((proportions.iloc[1, i] * total) -
                                farm_df.loc[0, "Land"])
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None):
    """Load the SOI farm proprietorship data into a NAICS tree.

    Reads "Farm_Data.csv" from ``prop_dir``, appends a zeroed "farm_prop"
    dataframe with "Land" and "FA" columns to every industry, fills it in
    for NAICS codes 111 and 112, and then passes the dataframe to
    ``naics.pop_back`` and ``naics.pop_forward``.

    :param data_tree: The NAICS tree to read the data into. Its industries
        111 and 112 must hold a "PA_assets" dataframe. If None, a new tree
        is created with ``naics.generate_tree()``.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. Passed
        unchanged to ``naics.pop_forward``.
    :param blueprint: The key of the dataframe used as a "blueprint" for
        populating "farm_prop" forward. If None and the first industry of
        ``data_tree`` holds a "tot_corps" dataframe, that key is used.
    :returns: The populated NAICS tree.
    """
    # Identity test: ``== None`` would go through any ``__eq__`` the tree
    # type defines.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Load Farm Proprietorship data. os.path.join keeps the path valid on
    # platforms whose separator is not a backslash.
    farm_data = pd.read_csv(
        os.path.abspath(os.path.join(prop_dir, "Farm_Data.csv")))
    new_farm_cols = ["Land", "FA"]
    # Give every industry its own zeroed one-row farm dataframe:
    for ind in data_tree.enum_inds:
        ind.append_dfs(("farm_prop",
                        pd.DataFrame(np.zeros((1, len(new_farm_cols))),
                                     columns=new_farm_cols)))
    # Multiplier applied to each industry's share of partnership land.
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    # NOTE(review): positional columns 0 and 2 of the first row; confirm
    # which named columns of the input file these are.
    total = farm_data.iloc[0, 0] + farm_data.iloc[0, 2]
    cur_codes = [111, 112]
    proportions = naics.get_proportions(
        cur_codes, data_tree, "PA_assets",
        ["Land (Net)", "Depreciable assets (Net)"])
    # Sum of land and depreciable assets over both farm industries:
    total_pa = 0
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] +
                     cur_df["Depreciable assets (Net)"][0])
    # Fill in land and fixed assets for each farm industry. ``.loc`` is
    # used instead of chained ``df[col][0] = ...`` so the write always
    # lands on the stored dataframe rather than on a temporary copy.
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        farm_df = cur_ind.data.dfs["farm_prop"]
        farm_df.loc[0, "Land"] = (
            land_mult * cur_ind.data.dfs["PA_assets"]["Land (Net)"][0] /
            total_pa)
        # "FA" is the industry's share of the total, net of its land.
        farm_df.loc[0, "FA"] = ((proportions.iloc[1, i] * total) -
                                farm_df.loc[0, "Land"])
    # Default blueprint:
    if (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["farm_prop"])
    naics.pop_forward(tree=data_tree, df_list=["farm_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def read_bea(output_tree, data_folder):
    """Read BEA fixed-asset data into a new NAICS tree.

    Opens "detailnonres_stk1.xlsx" in the "BEA" folder under
    ``data_folder``, matches each industry worksheet to NAICS codes through
    the workbook's readme sheet and "detailnonres_naics.csv", reads the
    last column of each worksheet for every asset listed in
    "detailnonres_list.csv", and writes the amounts into a tree loaded
    from "2012_NAICS_Codes.csv".

    :param output_tree: A NAICS tree whose industries hold an "FA"
        dataframe; its columns are used to split amounts by tax status.
    :param data_folder: Directory containing the "BEA" folder and
        "2012_NAICS_Codes.csv".
    :returns: A NAICS tree whose industries hold "All", "Corp" and
        "Non-Corp" dataframes with one column per asset, or None when the
        readme sheet's header cannot be located.
    """
    # The directory with BEA data (joined portably rather than with "\\"):
    bea_folder = os.path.abspath(os.path.join(data_folder, "BEA"))
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(os.path.abspath(
        os.path.join(bea_folder, "detailnonres_stk1.xlsx")))
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # The not-found test must run before the column is shifted:
        # previously [-1, -1] became [-1, -2] first, so the error branch
        # could never trigger.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The title column is taken to be the one left of "bea code".
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if str(bea_readme.cell_value(cur_row, cur_col)) != "":
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts-1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index-1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet
    # (built here but not read again in this function):
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if str(bea_readme.cell_value(cur_row, cur_col)) != "":
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.abspath(
        os.path.join(bea_folder, "detailnonres_list.csv"))
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.abspath(
        os.path.join(bea_folder, "detailnonres_naics.csv"))
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2, 3), dtype=object),
                             columns=chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the
    # crosswalk. naics_counter is kept across industries, so each crosswalk
    # search resumes just after the previous match and wraps around.
    naics_counter = 0
    for i in range(0, num_shts-2):
        for cur_row in range(sht_pos[0]+1, bea_readme.nrows):
            bea_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            if str(bea_codes1[i]) == bea_code:
                bea_ind = str(bea_readme.cell_value(cur_row, cur_col))
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_inds[i] = bea_ind
                bea_chart["BEA Code"][i] = bea_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter+1) % num_naics
                    if naics_inds[naics_counter] == bea_chart["Industry"][i]:
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                # Readme row found for this worksheet; stop scanning rows.
                break
            # If they match except one has ".0" at the end:
            elif (str(bea_codes1[i]) ==
                    str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]):
                bea_ind = str(bea_readme.cell_value(cur_row, cur_col))
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_chart["Industry"][i] = bea_ind
                cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]
                bea_chart["BEA Code"][i] = cur_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter+1) % num_naics
                    if naics_inds[naics_counter] == bea_inds[i]:
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, copy the last-column value of every listed asset:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k in xrange(sht_pos[0]+2, cur_sht.nrows):
                cur_cell = str(cur_sht.cell_value(k, sht_pos[1]+1))
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if cur_asset == cur_cell:
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols-1))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * 1000000
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Initialize tree for assets data:
    asset_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for i in xrange(0, len(asset_tree.enum_inds)):
        asset_tree.enum_inds[i].data.append(
            ("All", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                 columns=asset_list.iloc[:, 0])))
        asset_tree.enum_inds[i].data.append(
            ("Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                  columns=asset_list.iloc[:, 0])))
        asset_tree.enum_inds[i].data.append(
            ("Non-Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                      columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table. enum_index is kept across
    # iterations, so each search resumes after the last industry visited.
    enum_index = len(output_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA", corp_types).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, output_tree, "FA", non_corp_types).iloc[1, :]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(asset_tree.enum_inds)
                out_dfs = output_tree.enum_inds[enum_index].data.dfs
                # Industries without any "FA" amount are skipped.
                if sum(out_dfs["FA"].iloc[0, :]) == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in corp_types:
                    corp_ratio += (out_dfs["FA"][category][0] /
                                   sum(out_dfs["FA"].iloc[0, :]))
                for category in non_corp_types:
                    non_corp_ratio += (out_dfs["FA"][category][0] /
                                       sum(out_dfs["FA"].iloc[0, :]))
                cur_data = asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = asset_tree.enum_inds[0].data.dfs["All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # A matching industry was filled; move to the next code.
                break
            if tot_share == 1:
                break
    return asset_tree
def read_bea(asset_tree):
    """Read BEA fixed-asset data into a new NAICS tree.

    Opens the workbook at ``_BEA_ASSET_PATH``, matches industry worksheets
    to NAICS codes through the workbook's readme sheet and
    "detailnonres_naics.csv" in ``_BEA_DIR``, reads the last column of each
    worksheet for every asset listed in "detailnonres_list.csv", scales the
    amounts by ``_BEA_IN_FILE_FCTR`` and writes them into a freshly
    generated tree, which is then populated backward and forward.

    :param asset_tree: A NAICS tree whose industries hold an "FA"
        dataframe with the columns named in ``_CORP_NMS`` and
        ``_NCORP_NMS``; used to split amounts by tax status and as the
        blueprint tree for ``naics.pop_forward``.
    :returns: A NAICS tree whose industries hold "All", "Corp" and
        "Non-Corp" dataframes with one column per asset, or None when the
        readme sheet's header cannot be located.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # The not-found test must run before the column is shifted:
        # previously [-1, -1] became [-1, -2] first, so the error branch
        # could never trigger.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The title column is taken to be the one left of "bea code".
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                != ""):
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts-1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index-1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet
    # (built here but not read again in this function):
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                != ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2, 3), dtype=object),
                             columns=chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the
    # crosswalk. A single pass over the readme rows; ``i`` is the chart row
    # being filled and only advances once a NAICS match has been stored.
    # NOTE(review): the nesting of ``i += 1`` / ``break`` was reconstructed
    # from flattened source (both placed inside the innermost ``if``);
    # confirm against the intended behavior.
    naics_counter = 0
    i = 0
    for cur_row in range(sht_pos[0]+1, bea_readme.nrows):
        bea_code = unicode(
            bea_readme.cell_value(cur_row, cur_col+1)).encode('utf8')
        if str(bea_codes1[i]) == bea_code:
            bea_ind = unicode(
                bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_inds[i] = bea_ind
            bea_chart["BEA Code"][i] = bea_code
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if naics_inds[naics_counter] == bea_chart["Industry"][i]:
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    i += 1
                    break
        # If they match except one has ".0" at the end:
        elif (str(bea_codes1[i]) ==
                str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]):
            bea_ind = unicode(
                bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_chart["Industry"][i] = bea_ind
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]
            bea_chart["BEA Code"][i] = cur_code
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if naics_inds[naics_counter] == bea_inds[i]:
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    i += 1
                    break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, copy the last-column value of every listed asset:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k in xrange(sht_pos[0]+2, cur_sht.nrows):
                cur_cell = unicode(
                    cur_sht.cell_value(k, sht_pos[1]+1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if cur_asset == cur_cell:
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols-1))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(
            ("All", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                 columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                  columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Non-Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                      columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table. enum_index is kept across
    # iterations, so each search resumes after the last industry visited.
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _NCORP_NMS).iloc[1, :]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(fixed_asset_tree.enum_inds)
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                # Industries without any "FA" amount are skipped.
                if sum(out_dfs["FA"].iloc[0, :]) == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0] /
                                   sum(out_dfs["FA"].iloc[0, :]))
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0] /
                                       sum(out_dfs["FA"].iloc[0, :]))
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = (
                    fixed_asset_tree.enum_inds[0].data.dfs["All"].shape[1])
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # A matching industry was filled; move to the next code.
                break
            if tot_share == 1:
                break
    # Propagate the filled values through the rest of the tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree
def read_inventories(output_tree, data_folder):
    """Read inventory amounts into a new NAICS tree.

    Opens "Inventories.xls" in the "Inventories" folder under
    ``data_folder`` and matches rows of its first sheet, in order, against
    the "List" and "Industry" columns of "Inventories_Crosswalk.csv". The
    value in the sheet's last column is multiplied by 10**9 and
    apportioned to industries using the "INV" dataframes of
    ``output_tree``.

    :param output_tree: A NAICS tree whose industries hold an "INV"
        dataframe with one column per tax-status category listed below.
    :param data_folder: Directory containing the "Inventories" folder and
        "2012_NAICS_Codes.csv".
    :returns: A NAICS tree whose industries hold an "Inventories"
        dataframe with "All", "Corp" and "Non-Corp" columns.
    """
    # The directory with inventory data (joined portably, not with "\\"):
    inv_folder = os.path.abspath(os.path.join(data_folder, "Inventories"))
    # Opening the inventories excel file:
    inv_book = xlrd.open_workbook(os.path.abspath(
        os.path.join(inv_folder, "Inventories.xls")))
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    num_cols = sht0.ncols
    # Find the starting index in worksheet and cross-check its column
    # against the column of the "line" header:
    cur_index = naics.search_ws(sht0, 1, 25, True, [0, 0], True)
    check_index = naics.search_ws(sht0, "line", 20)
    if cur_index[1] != check_index[1]:
        print("ERROR")
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(os.path.abspath(
        os.path.join(inv_folder, "Inventories_Crosswalk.csv")))
    # Creating a tree for the inventory data:
    inv_tree = naics.load_naics(
        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    data_cols = ["All", "Corp", "Non-Corp"]
    for ind in inv_tree.enum_inds:
        ind.data.append(("Inventories",
                         pd.DataFrame(np.zeros((1, len(data_cols))),
                                      columns=data_cols)))
    inv_data = np.zeros(inv_cross.shape[0])
    # Walk the sheet once; the crosswalk is consumed in order.
    cross_index = 0
    for i in xrange(cur_index[0], num_rows):
        if cross_index >= inv_cross.shape[0]:
            break
        cur_list = str(sht0.cell_value(i, cur_index[1])).strip()
        cur_name = str(sht0.cell_value(i, cur_index[1]+1)).strip()
        checks = ((str(cur_list) == str(inv_cross["List"][cross_index])) and
                  (str(cur_name) == str(inv_cross["Industry"][cross_index])))
        if checks:
            cross_index += 1
            try:
                cur_value = float(sht0.cell_value(i, num_cols-1))
            except ValueError:
                # Non-numeric cell: the entry keeps its initial zero.
                continue
            inv_data[cross_index-1] = cur_value
            # Data is in billions:
            inv_data[cross_index-1] = (10**9) * inv_data[cross_index-1]
    # Spread each crosswalk amount over its industries:
    for i in xrange(0, inv_cross.shape[0]):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, output_tree, "INV")
        for j in xrange(0, proportions.shape[1]):
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_ind = output_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_df = prev_ind.data.dfs["INV"]
            # Industries with no "INV" amount give nothing to weight by.
            if sum(prev_df.iloc[0, :]) != 0:
                cur_dfs = ((prev_df/sum(prev_df.iloc[0, :])) *
                           (inv_data[i] * proportions.iloc[1, j]))
                inv_df = cur_ind.data.dfs["Inventories"]
                inv_df["All"] += sum(cur_dfs.iloc[0, :])
                for k in corp_types:
                    inv_df["Corp"] += cur_dfs[k][0]
                for k in non_corp_types:
                    inv_df["Non-Corp"] += cur_dfs[k][0]
    return inv_tree
def read_bea(asset_tree):
    """Read BEA fixed-asset data into a new NAICS tree.

    Opens the workbook at ``_BEA_ASSET_PATH``, matches each industry
    worksheet to NAICS codes through the workbook's readme sheet and
    "detailnonres_naics.csv" in ``_BEA_DIR``, reads the last column of each
    worksheet for every asset listed in "detailnonres_list.csv", scales the
    amounts by ``_BEA_IN_FILE_FCTR`` and writes them into a freshly
    generated tree, which is then populated backward and forward.

    :param asset_tree: A NAICS tree whose industries hold an "FA"
        dataframe with the columns named in ``_CORP_NMS`` and
        ``_NCORP_NMS``; used to split amounts by tax status and as the
        blueprint tree for ``naics.pop_forward``.
    :returns: A NAICS tree whose industries hold "All", "Corp" and
        "Non-Corp" dataframes with one column per asset, or None when the
        readme sheet's header cannot be located.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # The not-found test must run before the column is shifted:
        # previously [-1, -1] became [-1, -2] first, so the error branch
        # could never trigger.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The title column is taken to be the one left of "bea code".
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                != ""):
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts - 1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index - 1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet
    # (built here but not read again in this function):
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                != ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col + 1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the
    # crosswalk. naics_counter is kept across industries, so each crosswalk
    # search resumes just after the previous match and wraps around.
    naics_counter = 0
    for i in range(0, num_shts - 2):
        for cur_row in range(sht_pos[0] + 1, bea_readme.nrows):
            bea_code = unicode(
                bea_readme.cell_value(cur_row, cur_col + 1)).encode('utf8')
            if str(bea_codes1[i]) == bea_code:
                bea_ind = unicode(
                    bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_inds[i] = bea_ind
                bea_chart["BEA Code"][i] = bea_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if naics_inds[naics_counter] == bea_chart["Industry"][i]:
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                # Readme row found for this worksheet; stop scanning rows.
                break
            # If they match except one has ".0" at the end:
            elif (str(bea_codes1[i]) ==
                    str(bea_readme.cell_value(cur_row, cur_col + 1))[:-2]):
                bea_ind = unicode(
                    bea_readme.cell_value(cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_chart["Industry"][i] = bea_ind
                cur_code = str(
                    bea_readme.cell_value(cur_row, cur_col + 1))[:-2]
                bea_chart["BEA Code"][i] = cur_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if naics_inds[naics_counter] == bea_inds[i]:
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, copy the last-column value of every listed asset:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k in xrange(sht_pos[0] + 2, cur_sht.nrows):
                cur_cell = unicode(
                    cur_sht.cell_value(k, sht_pos[1] + 1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if cur_asset == cur_cell:
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(
            ("All", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                 columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                  columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Non-Corp", pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                      columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table. enum_index is kept across
    # iterations, so each search resumes after the last industry visited.
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _NCORP_NMS).iloc[1, :]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = ((enum_index + 1) %
                              len(fixed_asset_tree.enum_inds))
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                # Industries without any "FA" amount are skipped.
                if sum(out_dfs["FA"].iloc[0, :]) == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0] /
                                   sum(out_dfs["FA"].iloc[0, :]))
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0] /
                                       sum(out_dfs["FA"].iloc[0, :]))
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = (
                    fixed_asset_tree.enum_inds[0].data.dfs["All"].shape[1])
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # A matching industry was filled; move to the next code.
                break
            if tot_share == 1:
                break
    # Propagate the filled values through the rest of the tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree