def calibrate_debt(debt_tree=None, soi_tree=None, from_out=False,
                   soi_from_out=False):
    """
    Load the liabilities and equity data into a NAICS tree and populate the
    liabilities dataframes forward, using the "tot_corps" dataframe of the
    SOI tree as the blueprint.

    :param debt_tree: The NAICS tree to load the debt dataframes into. When
           None, a new tree is created with naics.generate_tree().
    :param soi_tree: A tree holding the SOI data used as the blueprint. When
           None, it is built with pull_soi_data(get_corp=True, ...).
    :param from_out: Not used by this function.
    :param soi_from_out: Forwarded to pull_soi_data as its from_out argument
           when soi_tree has to be built.
    :returns: The tree with "liabilities" and "equity" dataframes loaded.
    """
    if soi_tree is None:
        soi_tree = pull_soi_data(get_corp=True, from_out=soi_from_out)
    if debt_tree is None:
        debt_tree = naics.generate_tree()
    # Build the paths with os.path.join so they resolve on every platform
    # (a hard-coded "\\" is only a path separator on Windows):
    debt_dir = os.path.abspath(os.path.join(_PARAM_DIR, "debt"))
    debt_data_dir = os.path.abspath(os.path.join(debt_dir, "data"))
    # Make the calibration module importable without stacking up duplicate
    # sys.path entries on repeated calls:
    if debt_dir not in sys.path:
        sys.path.append(debt_dir)
    # NOTE(review): the imported module is not referenced below; the import
    # is kept in case callers rely on it having been loaded.
    import debt_calibration as debt
    # Input files:
    lblty_file = os.path.abspath(
        os.path.join(debt_data_dir, "liabilities.csv"))
    print(lblty_file)
    # NOTE(review): both frames are read but not used further down; the
    # reads are kept so that a missing/unreadable file still fails here.
    lblty_df = pd.read_csv(lblty_file)
    eqty_file = os.path.abspath(os.path.join(debt_data_dir, "equity.csv"))
    eqty_df = pd.read_csv(eqty_file)
    # Load both files into the tree:
    debt_tree = naics.load_tree_dfs(input_file=lblty_file,
                                    dfs_name="liabilities", tree=debt_tree)
    debt_tree = naics.load_tree_dfs(input_file=eqty_file,
                                    dfs_name="equity", tree=debt_tree)
    # Populate the liabilities forward with the SOI blueprint:
    naics.pop_forward(tree=debt_tree, df_list=["liabilities"],
                      blue_tree=soi_tree, blueprint="tot_corps",
                      sub_print=["Interest Paid"])
    return debt_tree
def calibrate_debt(debt_tree=None, soi_tree=None, from_out=False,
                   soi_from_out=False):
    """
    This function is incomplete. This is supposed to do the debt
    calibrations. At present it loads the liabilities and equity data into
    a NAICS tree and populates the liabilities forward, using the
    "tot_corps" dataframe of the SOI tree as the blueprint.

    :param debt_tree: The NAICS tree to append the calibrated debt
           parameters to. When None, a newly generated tree is used. (The
           tree is created per call; a default evaluated in the signature
           would be shared, and mutated, across calls.)
    :param soi_tree: A tree with all of the relevant soi data. When None,
           it is built with pull_soi_data(get_corp=True, ...).
    :param from_out: Not used by this function.
    :param soi_from_out: Forwarded to pull_soi_data as its from_out argument
           when soi_tree has to be built.
    :returns: The tree with "liabilities" and "equity" dataframes loaded.
    """
    if debt_tree is None:
        debt_tree = naics.generate_tree()
    if soi_tree is None:
        soi_tree = pull_soi_data(get_corp=True, from_out=soi_from_out)
    # Directory holding the debt calibration module and its data:
    debt_dir = os.path.abspath(os.path.join(_PARAM_DIR, "debt"))
    debt_data_dir = os.path.abspath(os.path.join(debt_dir, "data"))
    # Avoid appending the same directory again on every call:
    if debt_dir not in sys.path:
        sys.path.append(debt_dir)
    # NOTE(review): the imported module is not referenced below; the import
    # is kept in case callers rely on it having been loaded.
    import debt_calibration as debt
    # Input files:
    lblty_file = os.path.abspath(
        os.path.join(debt_data_dir, "liabilities.csv"))
    print(lblty_file)
    # NOTE(review): both frames are read but not used further down; the
    # reads are kept so that a missing/unreadable file still fails here.
    lblty_df = pd.read_csv(lblty_file)
    eqty_file = os.path.abspath(os.path.join(debt_data_dir, "equity.csv"))
    eqty_df = pd.read_csv(eqty_file)
    # Load both files into the tree:
    debt_tree = naics.load_tree_dfs(input_file=lblty_file,
                                    dfs_name="liabilities", tree=debt_tree)
    debt_tree = naics.load_tree_dfs(input_file=eqty_file,
                                    dfs_name="equity", tree=debt_tree)
    # Populate the liabilities forward with the SOI blueprint:
    naics.pop_forward(
        tree=debt_tree, df_list=["liabilities"],
        blue_tree=soi_tree, blueprint="tot_corps",
        sub_print=["Interest Paid"]
    )
    return debt_tree
def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """
    This function loads the soi farm proprietorship data into a NAICS tree.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree with the _FARM_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    # Give every industry a farm dataframe with the new columns:
    data_tree.append_all(df_nm=_FARM_DF_NM, df_cols=new_farm_cols)
    # Both aggregates are read from the first row of the input file.
    # float() keeps the ratio a true division under Python 2.
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    total = farm_data["R_p"][0] + farm_data["Q_p"][0]
    # The two NAICS codes the farm data is split between:
    cur_codes = [111, 112]
    proportions = naics.get_proportions(cur_codes, data_tree,
                                        _AST_PRT_DF_NM,
                                        [_LAND_COL_NM, _DEPR_COL_NM])
    # Sum of land and depreciable assets over both industries:
    total_pa = 0
    for ind_code in cur_codes:
        ast_df = naics.find_naics(data_tree, ind_code).data.dfs[
            _AST_PRT_DF_NM]
        total_pa += (ast_df[_LAND_COL_NM][0] + ast_df[_DEPR_COL_NM][0])
    # Split the aggregates between the industries.
    # NOTE(review): the chained `df[col][0] = ...` writes rely on pandas
    # handing back a view of the column -- confirm for the pandas version
    # in use.
    for i, ind_code in enumerate(cur_codes):
        ind_dfs = naics.find_naics(data_tree, ind_code).data.dfs
        farm_df = ind_dfs[_FARM_DF_NM]
        farm_df["Land"][0] = (land_mult * ind_dfs[_AST_PRT_DF_NM][
            _LAND_COL_NM][0] / total_pa)
        # "FA" is this industry's share of the total less its land:
        farm_df["FA"][0] = ((proportions.iloc[1, i] * total) -
                            farm_df["Land"][0])
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_type(data_tree=None, blue_tree=None, blueprint=None,
              from_out=False, out_path=None):
    """
    This function loads the soi partnership data on income by partner type.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None, _INC_DF_NM is used if the first industry has that
           dataframe, otherwise _TOT_CORP_DF_NM if it has that one.
    :param from_out: Whether to read in the data from output.
    :param out_path: The output file read when from_out is set. Defaults
           to _TYP_OUT_PATH.
    :returns: The tree with the _TYP_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _TYP_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Opening data on income by partner type:
    wb = xlrd.open_workbook(_TYP_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Initializing dataframe to hold pertinent type income data (one row
    # per worksheet column after the first):
    typ_df = pd.DataFrame(
        np.zeros((ws.ncols - 1, len(_TYP_IN_ROW_NMS))),
        columns=list(_TYP_DF_DICT.values())
    )
    # Extracting the data. For each input row name, copy the first
    # worksheet row whose label contains it (case-insensitive):
    for in_row_nm in _TYP_IN_ROW_NMS:
        df_col_nm = _TYP_DF_DICT[_TYP_IN_ROWS_DF_DICT[in_row_nm]]
        row_key = in_row_nm.lower()
        for ws_row_index in range(num_rows):
            ws_row_nm = str(ws.cell_value(ws_row_index, 0)).lower()
            if row_key in ws_row_nm:
                typ_df[df_col_nm] = ws.row_values(ws_row_index, 1)
                break
    # Scaling the data to the correct units:
    typ_df = typ_df * _TYP_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    typ_cross = pd.read_csv(_TYP_IN_CROSS_PATH)
    # Processing the type data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=typ_df,
        cross_df=typ_cross, df_nm=_TYP_DF_NM
    )
    # Default blueprint is partner income, and, if not, then tot_corps:
    first_dfs = data_tree.enum_inds[0].data.dfs
    if blueprint is None:
        if _INC_DF_NM in first_dfs:
            blueprint = _INC_DF_NM
        elif _TOT_CORP_DF_NM in first_dfs:
            blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TYP_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TYP_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """
    This function loads the soi farm proprietorship data into a NAICS tree.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree with the _FARM_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    # Give every industry a one-row, zero-filled farm dataframe:
    for ind in data_tree.enum_inds:
        ind.append_dfs((_FARM_DF_NM,
                        pd.DataFrame(np.zeros((1, len(new_farm_cols))),
                                     columns=new_farm_cols)))
    # float() keeps the ratio a true division under Python 2:
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    # Positional read: first row, first and third columns of the input.
    total = farm_data.iloc[0, 0] + farm_data.iloc[0, 2]
    # The two NAICS codes the farm data is split between:
    cur_codes = [111, 112]
    proportions = naics.get_proportions(
        cur_codes, data_tree, "PA_assets",
        ["Land (Net)", "Depreciable assets (Net)"])
    # Sum of land and depreciable assets over both industries:
    total_pa = 0
    for ind_code in cur_codes:
        ast_df = naics.find_naics(data_tree, ind_code).data.dfs["PA_assets"]
        total_pa += (ast_df["Land (Net)"][0] +
                     ast_df["Depreciable assets (Net)"][0])
    # Split the aggregates between the industries.
    # NOTE(review): the chained `df[col][0] = ...` writes rely on pandas
    # handing back a view of the column -- confirm for the pandas version
    # in use.
    for i, ind_code in enumerate(cur_codes):
        ind_dfs = naics.find_naics(data_tree, ind_code).data.dfs
        farm_df = ind_dfs[_FARM_DF_NM]
        farm_df["Land"][0] = (land_mult *
                              ind_dfs["PA_assets"]["Land (Net)"][0] /
                              total_pa)
        # "FA" is this industry's share of the total less its land:
        farm_df["FA"][0] = ((proportions.iloc[1, i] * total) -
                            farm_df["Land"][0])
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_income(data_tree=None, blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """
    This function loads the soi partnership income data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The output file read when from_out is set. Defaults
           to _INC_OUT_PATH.
    :returns: The tree with the _INC_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    # Second element of the search result is used as the first data
    # column (presumably a (row, col) position -- confirm in search_ws):
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]
    # Initializing dataframe to hold pertinent income/loss data (one row
    # per worksheet column from start_col on):
    data_df = pd.DataFrame(np.zeros((ws.ncols - start_col, 3)),
                           columns=_INC_PRT_DF_COL_NMS)
    # Extracting the data from the worksheet. The scan stops at the net
    # income row, so a depreciation row is only picked up when it appears
    # before that row:
    for row in range(ws.nrows):
        row_label = str(ws.cell_value(row, 0)).lower()
        if _INC_NET_INC_ROW_NM in row_label:
            # Net income and net loss are read from the next two rows:
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row + 1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row + 2,
                                                          start_col)
            break
        if _INC_DEPR_ROW_NM in row_label:
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)
    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_df,
        cross_df=pa01cross, df_nm=_INC_DF_NM
    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def calibrate_debt(debt_tree=None, soi_tree=None, from_out=False,
                   soi_from_out=False):
    """
    This function is incomplete. This is supposed to do the debt
    calibrations. At present it loads the liabilities and equity data into
    a NAICS tree and populates the liabilities forward, using the
    "tot_corps" dataframe of the SOI tree as the blueprint.

    :param debt_tree: The NAICS tree to append the calibrated debt
           parameters to. When None, a newly generated tree is used. (The
           tree is created per call; a default evaluated in the signature
           would be shared, and mutated, across calls.)
    :param soi_tree: A tree with all of the relevant soi data. When None,
           it is built with pull_soi_data(get_corp=True, ...).
    :param from_out: Not used by this function.
    :param soi_from_out: Forwarded to pull_soi_data as its from_out argument
           when soi_tree has to be built.
    :returns: The tree with "liabilities" and "equity" dataframes loaded.
    """
    if debt_tree is None:
        debt_tree = naics.generate_tree()
    if soi_tree is None:
        soi_tree = pull_soi_data(get_corp=True, from_out=soi_from_out)
    # Directory holding the debt calibration module and its data:
    debt_dir = os.path.abspath(os.path.join(_PARAM_DIR, "debt"))
    debt_data_dir = os.path.abspath(os.path.join(debt_dir, "data"))
    # Avoid appending the same directory again on every call:
    if debt_dir not in sys.path:
        sys.path.append(debt_dir)
    # NOTE(review): the imported module is not referenced below; the import
    # is kept in case callers rely on it having been loaded.
    import debt_calibration as debt
    # Input files:
    lblty_file = os.path.abspath(
        os.path.join(debt_data_dir, "liabilities.csv"))
    print(lblty_file)
    # NOTE(review): both frames are read but not used further down; the
    # reads are kept so that a missing/unreadable file still fails here.
    lblty_df = pd.read_csv(lblty_file)
    eqty_file = os.path.abspath(os.path.join(debt_data_dir, "equity.csv"))
    eqty_df = pd.read_csv(eqty_file)
    # Load both files into the tree:
    debt_tree = naics.load_tree_dfs(input_file=lblty_file,
                                    dfs_name="liabilities", tree=debt_tree)
    debt_tree = naics.load_tree_dfs(input_file=eqty_file,
                                    dfs_name="equity", tree=debt_tree)
    # Populate the liabilities forward with the SOI blueprint:
    naics.pop_forward(tree=debt_tree, df_list=["liabilities"],
                      blue_tree=soi_tree, blueprint="tot_corps",
                      sub_print=["Interest Paid"])
    return debt_tree
def calc_c_corp(data_tree=None, from_out=False,
                out_path=_C_CORP_OUT_PATH):
    """
    This function calculates the soi c-corporation data based on the s and
    the aggregate corporation data.

    :param data_tree: The tree to read the data into. When None, a new
           tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree, with a _C_DF_NM dataframe on every industry that
              has both total-corporation and s-corporation data.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # For each industry, subtract the s-corporation data from the total to
    # get the c-corporation data.
    for ind in data_tree.enum_inds:
        try:
            # Industry's total-corporation data:
            cur_tot = ind.data.dfs[_TOT_DF_NM]
        except KeyError:
            print("Total-Corp data not initialized when interpolating "
                  "C-Corp.")
            cur_tot = None
        try:
            # Industry's S-corporation data:
            cur_s = ind.data.dfs[_S_DF_NM]
        except KeyError:
            print("S-Corp data not initialized when interpolating C-Corp.")
            cur_s = None
        # Skip industries with a missing input. Falling through here would
        # either hit an unbound name or silently reuse the previous
        # industry's frames.
        if cur_tot is None or cur_s is None:
            continue
        data_cols = cur_tot.columns.values.tolist()
        # Append C-corporation dataframe:
        ind.append_dfs((_C_DF_NM,
                        pd.DataFrame(np.zeros((1, len(data_cols))),
                                     columns=data_cols)))
        # C-corporation data:
        ind.data.dfs[_C_DF_NM] = cur_tot - cur_s
    return data_tree
def calc_c_corp(data_tree=None, from_out=False,
                out_path=_C_CORP_OUT_PATH):
    """
    This function calculates the soi c-corporation data based on the s and
    the aggregate corporation data.

    :param data_tree: The tree to read the data into. When None, a new
           tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree, with a _C_DF_NM dataframe on every industry that
              has both total-corporation and s-corporation data.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # For each industry, subtract the s-corporation data from the total to
    # get the c-corporation data.
    for ind in data_tree.enum_inds:
        ind_dfs = ind.data.dfs
        inputs_ok = True
        try:
            # Industry's total-corporation data:
            cur_tot = ind_dfs[_TOT_DF_NM]
        except KeyError:
            print("Total-Corp data not initialized when interpolating "
                  "C-Corp.")
            inputs_ok = False
        try:
            # Industry's S-corporation data:
            cur_s = ind_dfs[_S_DF_NM]
        except KeyError:
            print("S-Corp data not initialized when interpolating C-Corp.")
            inputs_ok = False
        # Without both inputs there is nothing to subtract. Continuing
        # would raise on an unbound name or reuse stale frames from the
        # previous industry, so skip this one.
        if not inputs_ok:
            continue
        data_cols = cur_tot.columns.values.tolist()
        # Append C-corporation dataframe:
        ind.append_dfs((_C_DF_NM,
                        pd.DataFrame(np.zeros((1, len(data_cols))),
                                     columns=data_cols)))
        # C-corporation data:
        ind_dfs[_C_DF_NM] = cur_tot - cur_s
    return data_tree
def load_asset(data_tree=None, blue_tree=None, blueprint=None,
               from_out=False):
    """
    This function loads the soi partnership asset data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output
           (_AST_OUT_PATH).
    :returns: The tree with the _AST_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_file=_AST_OUT_PATH,
                                   tree=data_tree)
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data (one row per
    # worksheet column after the first):
    ast_df = pd.DataFrame(np.zeros((ws.ncols - 1, len(df_cols))),
                          columns=df_cols)
    # Extracting the data. Each input row name is expected to label two
    # worksheet rows; per the original note, the row with the total data
    # appears first.
    for in_row_nm in _AST_IN_ROW_NMS:
        # Dataframe column filled from the first matching row:
        df_net_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_NET_DICT[in_row_nm]]
        # Dataframe column filled from the second matching row:
        df_inc_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_INC_DICT[in_row_nm]]
        row_key = in_row_nm.lower()
        # Finding the first input row containing row_key:
        for in_row1 in range(num_rows):
            if row_key not in str(ws.cell_value(in_row1, 0)).lower():
                continue
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row containing row_key:
            for in_row2 in range(in_row1 + 1, num_rows):
                if row_key in str(ws.cell_value(in_row2, 0)).lower():
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            # Only the first pair of matching rows is used:
            break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=ast_df,
        cross_df=ast_cross, df_nm=_AST_DF_NM
    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_nonfarm_prop(data_tree=None, blue_tree=None, blueprint=None,
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """
    This function loads the soi nonfarm proprietorship data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree with the _NFARM_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet (indexed below as
    # [row, col]); the search for the second deduction column starts one
    # column to the right of the first:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0,0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20, True,
                           np.array(pos2) + np.array([0,1]))
    # Give every industry a nonfarm dataframe:
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both indices are advanced cyclically and kept between searches, so
    # each search resumes right after the position where the last ended:
    cross_index = cross.shape[0]-1
    enum_index = len(data_tree.enum_inds)-1
    for i in range(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i,pos1[1])).lower().strip()
        # Share of this worksheet row allocated to industries so far:
        tot_proportions = 0
        for j in range(0, cross.shape[0]):
            cross_index = (cross_index+1) % cross.shape[0]
            cur_ind_name = str(cross.iloc[cross_index,0]).lower().strip()
            if(cur_cell == cur_ind_name):
                # Crosswalk rows without codes cannot be allocated:
                if pd.isnull(cross.iloc[cross_index,1]):
                    continue
                ind_codes = str(cross.iloc[cross_index,1]).split(".")
                for k in range(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index+1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(
                        ind_codes, cur_codes.iloc[:,0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    # NOTE(review): writing through this column reference
                    # relies on pandas handing back a view -- confirm for
                    # the pandas version in use.
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions
                                   * (ws.cell_value(i,pos2[1])
                                      + ws.cell_value(i,pos3[1])))
            # NOTE(review): the nesting of this check was reconstructed
            # from flattened source; it is placed at the crosswalk-loop
            # level (stop scanning once the row is fully allocated).
            # Confirm against the original layout.
            if(tot_proportions == 1):
                break
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_tot_corp(data_tree=None, cols_dict=_DFLT_TOT_CORP_COLS_DICT,
                      blueprint=None, blue_tree=None, from_out=False,
                      output_path=_TOT_CORP_OUT_PATH):
    """
    This function pulls the soi total corporation data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file. An empty string marks a
           column that is not reported; it is filled with zero.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file read when from_out is
           set.
    :returns: The populated tree, or None when the input file cannot be
              opened.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=output_path, tree=data_tree)
    # Pertinent information:
    num_inds = len(data_tree.enum_inds)  # Number of industries in tree.
    data_cols = list(cols_dict)  # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print("IOError: Tot-Corp soi data file not found.")
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree. The index is
    # advanced cyclically and kept between codes, so each search resumes
    # right after the industry matched last:
    enum_index = 0
    for code_num in np.unique(tot_corp_data[_NAICS_COL_NM]):
        # Find the industry with a code that matches "code_num":
        tot_df = None
        for i in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = data_tree.enum_inds[enum_index]
            code_df = cur_ind.data.dfs[cst.CODE_DF_NM]
            if any(code_df.iloc[j, 0] == code_num
                   for j in range(0, code_df.shape[0])):
                tot_df = cur_ind.data.dfs[_TOT_DF_NM]
                break
        # If no match was found, then ignore data.
        if tot_df is None:
            continue
        # Indicators for if rows in tot_corp_data match current code:
        indicators = (tot_corp_data[_NAICS_COL_NM] == code_num)
        # Calculating the data:
        for df_col in cols_dict:
            in_col = cols_dict[df_col]
            # Some of the data may not be reported:
            if in_col == "":
                tot_df[df_col] = 0
                continue
            # Halved because the original dataset double counts the data
            # (per the original author's note), then scaled to the correct
            # units. The scalar is computed before it is stored: the
            # previous form assigned a whole Series into a single cell.
            # NOTE(review): the chained `df[col][0] = ...` write relies on
            # pandas handing back a view of the column -- confirm for the
            # pandas version in use.
            tot_df[df_col][0] = ((sum(indicators * tot_corp_data[in_col])
                                  / 2.0) * _TOT_CORP_IN_FILE_FCTR)
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TOT_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_asset(data_tree=None, blue_tree=None, blueprint=None,
               from_out=False, out_path=None):
    """
    This function loads the soi partnership asset data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The output file read when from_out is set. Defaults
           to _AST_OUT_PATH.
    :returns: The tree with the _AST_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _AST_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data (one row per
    # worksheet column after the first):
    ast_df = pd.DataFrame(np.zeros((ws.ncols - 1, len(df_cols))),
                          columns=df_cols)
    # Extracting the data. Each input row name is expected to label two
    # worksheet rows; per the original note, the row with the total data
    # appears first.
    for in_row_nm in _AST_IN_ROW_NMS:
        # Dataframe column filled from the first matching row:
        df_net_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_NET_DICT[in_row_nm]]
        # Dataframe column filled from the second matching row:
        df_inc_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_INC_DICT[in_row_nm]]
        row_key = in_row_nm.lower()
        # Finding the first input row containing row_key:
        for in_row1 in range(num_rows):
            if row_key not in str(ws.cell_value(in_row1, 0)).lower():
                continue
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row containing row_key:
            for in_row2 in range(in_row1 + 1, num_rows):
                if row_key in str(ws.cell_value(in_row2, 0)).lower():
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            # Only the first pair of matching rows is used:
            break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=ast_df,
        cross_df=ast_cross, df_nm=_AST_DF_NM
    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_nonfarm_prop(data_tree=None, blue_tree=None, blueprint=None,
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """
    This function loads the soi nonfarm proprietorship data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
           When None and the first industry of data_tree has a
           _TOT_CORP_DF_NM dataframe, that key is used.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file read when from_out is set.
    :returns: The tree with the _NFARM_DF_NM dataframes populated.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet (indexed below as
    # [row, col]); the search for the second deduction column starts one
    # column to the right of the first:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0, 0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20, True,
                           np.array(pos2) + np.array([0, 1]))
    # Give every industry a nonfarm dataframe:
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both indices are advanced cyclically and kept between searches, so
    # each search resumes right after the position where the last ended:
    num_cross = cross.shape[0]
    cross_index = num_cross - 1
    enum_index = len(data_tree.enum_inds) - 1
    for i in range(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        # Share of this worksheet row allocated to industries so far:
        tot_proportions = 0
        for j in range(0, num_cross):
            cross_index = (cross_index + 1) % num_cross
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell == cur_ind_name:
                # Crosswalk rows without codes cannot be allocated:
                if pd.isnull(cross.iloc[cross_index, 1]):
                    continue
                ind_codes = str(cross.iloc[cross_index, 1]).split(".")
                for k in range(0, len(data_tree.enum_inds)):
                    enum_index = ((enum_index + 1) %
                                  len(data_tree.enum_inds))
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(
                        ind_codes, cur_codes.iloc[:, 0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    # NOTE(review): writing through this column reference
                    # relies on pandas handing back a view -- confirm for
                    # the pandas version in use.
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions *
                                   (ws.cell_value(i, pos2[1]) +
                                    ws.cell_value(i, pos3[1])))
            # NOTE(review): the nesting of this check was reconstructed
            # from flattened source; it is placed at the crosswalk-loop
            # level (stop scanning once the row is fully allocated).
            # Confirm against the original layout.
            if tot_proportions == 1:
                break
    # Default blueprint:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_tot_corp(data_tree=None, cols_dict=_DFLT_TOT_CORP_COLS_DICT,
                      blueprint=None, blue_tree=None, from_out=False,
                      output_path=_TOT_CORP_OUT_PATH):
    """
    This function pulls the soi total corporation data.

    :param data_tree: The NAICS tree to read the data into. When None, a
           new tree is created with naics.generate_tree(). (The tree is
           created per call so that calls do not share one default tree.)
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file. An empty string marks a
           column that is not reported; it is filled with zero.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe; passed
           through to naics.pop_forward.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file read when from_out is
           set.
    :returns: The populated tree, or None when the input file cannot be
              opened.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=output_path, tree=data_tree)
    # Pertinent information:
    industries = data_tree.enum_inds
    num_inds = len(industries)  # Number of industries in NAICS tree.
    data_cols = list(cols_dict)  # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print("IOError: Tot-Corp soi data file not found.")
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree. The index is
    # advanced cyclically and kept between codes, so each search resumes
    # right after the industry matched last:
    enum_index = 0
    naics_codes = tot_corp_data[_NAICS_COL_NM]
    for code_num in np.unique(naics_codes):
        # Find the industry with a code that matches "code_num":
        ind_found = False
        for i in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = industries[enum_index]
            code_df = cur_ind.data.dfs[cst.CODE_DF_NM]
            for j in range(0, code_df.shape[0]):
                if code_df.iloc[j, 0] == code_num:
                    ind_found = True
                    break
            # If the matching industry has been found stop searching:
            if ind_found:
                break
        # If no match was found, then ignore data.
        if not ind_found:
            continue
        tot_df = cur_ind.data.dfs[_TOT_DF_NM]
        # Indicators for if rows in tot_corp_data match current code:
        indicators = (naics_codes == code_num)
        # Calculating the data:
        for df_col in cols_dict:
            in_col = cols_dict[df_col]
            # Some of the data may not be reported:
            if in_col == "":
                tot_df[df_col] = 0
            else:
                # Halved because the original dataset double counts the
                # data (per the original author's note), then scaled to
                # the correct units. The scalar is computed before it is
                # stored: the previous form assigned a whole Series into a
                # single cell.
                # NOTE(review): the chained `df[col][0] = ...` write relies
                # on pandas handing back a view of the column -- confirm
                # for the pandas version in use.
                halved = sum(indicators * tot_corp_data[in_col]) / 2.0
                tot_df[df_col][0] = halved * _TOT_CORP_IN_FILE_FCTR
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TOT_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree