Exemplo n.º 1
0
def load_soi_farm_prop(data_tree=naics.generate_tree(),
                       blue_tree=None,
                       blueprint=None,
                       from_out=False,
                       out_path=_FARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    #
    data_tree.append_all(df_nm=_FARM_DF_NM, df_cols=new_farm_cols)
    #
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    total = farm_data["R_p"][0] + farm_data["Q_p"][0]
    total_pa = 0
    cur_codes = [111, 112]
    proportions = np.zeros(len(cur_codes))
    proportions = naics.get_proportions(cur_codes, data_tree, _AST_PRT_DF_NM,
                                        [_LAND_COL_NM, _DEPR_COL_NM])
    #
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs[_AST_PRT_DF_NM]
        total_pa += (cur_df[_LAND_COL_NM][0] + cur_df[_DEPR_COL_NM][0])
    #
    for i in xrange(0, len(cur_codes)):
        cur_ind = naics.find_naics(data_tree, cur_codes[i])
        cur_ind.data.dfs[_FARM_DF_NM]["Land"][0] = (
            land_mult * cur_ind.data.dfs[_AST_PRT_DF_NM][_LAND_COL_NM][0] /
            total_pa)
        cur_ind.data.dfs[_FARM_DF_NM]["FA"][0] = (
            (proportions.iloc[1, i] * total) -
            cur_ind.data.dfs[_FARM_DF_NM]["Land"][0])
    # Default:
    if blueprint == None and _TOT_CORP_DF_NM in data_tree.enum_inds[
            0].data.dfs.keys():
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree,
                      df_list=[_FARM_DF_NM],
                      blueprint=blueprint,
                      blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 2
0
def read_land(asset_tree):
    land_data = pd.read_csv(_LAND_IN_PATH)
    land_data = _LAND_IN_FILE_FCTR * land_data
    # Initializing NAICS tree for the land data:
    df_cols = ["All", "Corp", "Non-Corp"]
    land_tree = naics.generate_tree()
    land_tree.append_all(df_nm="Land", df_cols=df_cols)
    ''' Calculate the proportion that belong in corporate and non-corporate
    tax categories:'''
    corp_sum = 0.0
    non_corp_sum = 0.0
    for i in _CORP_NMS:
        corp_sum += asset_tree.enum_inds[0].data.dfs["LAND"][i][0]
    for i in _NCORP_NMS:
        non_corp_sum += asset_tree.enum_inds[0].data.dfs["LAND"][i][0]
    if corp_sum + non_corp_sum == 0:
        return land_tree
    ''' Initialize the total industry category--corresponding to NAICS code 
    of "1": '''
    land_df = land_tree.enum_inds[0].data.dfs["Land"]
    land_df["Corp"][0] = land_data["Corporate"][0]
    land_df["Non-Corp"][0] = land_data["Non-Corporate"][0]
    land_df["All"][0] = (land_data["Corporate"][0]+
                            land_data["Non-Corporate"][0])
    # Use the asset_tree to populate the rest:
    naics.pop_back(land_tree, ["Land"])
    naics.pop_forward(land_tree, ["Land"], "LAND", asset_tree)
    return land_tree
Exemplo n.º 3
0
def load_type(data_tree=naics.generate_tree(),
               blue_tree = None, blueprint = None,
               from_out=False, out_path=None):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _TYP_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path,
                                        tree=data_tree)
        return data_tree
    # Opening data on income by partner type:
    wb = xlrd.open_workbook(_TYP_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Initializing dataframe to hold pertinent type income data:
    typ_df = pd.DataFrame(np.zeros((ws.ncols-1, len(_TYP_IN_ROW_NMS))),
                          columns=_TYP_DF_DICT.values())
    # Extracting the data. For each input row:
    for in_row_nm in _TYP_IN_ROW_NMS:
        
        df_col_key = _TYP_IN_ROWS_DF_DICT[in_row_nm]
        df_col_nm = _TYP_DF_DICT[df_col_key]
        in_row_nm = in_row_nm.lower()
        for ws_row_index in xrange(0, num_rows):
            ws_row_nm = str(ws.cell_value(ws_row_index,0)).lower()
            if(in_row_nm in ws_row_nm):
                typ_df[df_col_nm] = ws.row_values(ws_row_index,1)
                break
    # Scaling the data to the correct units:
    typ_df = typ_df * _TYP_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    typ_cross = pd.read_csv(_TYP_IN_CROSS_PATH)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=typ_df,
                    cross_df=typ_cross, df_nm=_TYP_DF_NM
                    )
    # Default blueprint is partner income, and, if not, then tot_corps:
    has_inc_df = _INC_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_inc_df:
        blueprint = _INC_DF_NM
    elif blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TYP_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TYP_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
Exemplo n.º 4
0
def read_inventories(asset_tree):
    # Opening BEA's excel file on depreciable assets by industry:
    inv_book = xlrd.open_workbook(_INV_IN_PATH)
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    num_cols = sht0.ncols
    #Find the starting index in worksheet.
    cur_index = naics.search_ws(sht0, 1, 25, True, [0, 0], True)
    check_index = naics.search_ws(sht0, "line", 20)
    if (cur_index[1] != check_index[1]):
        print "ERROR"
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(_INV_IN_CROSS_PATH)
    # Creating a tree for the inventory data:
    data_cols = ["All", "Corp", "Non-Corp"]
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories", df_cols=data_cols)
    #
    inv_data = np.zeros(inv_cross.shape[0])
    #
    cross_index = 0
    for i in xrange(cur_index[0], num_rows):
        if (cross_index >= inv_cross.shape[0]):
            break
        cur_list = str(sht0.cell_value(i, cur_index[1])).strip()
        cur_name = str(sht0.cell_value(i, cur_index[1] + 1)).strip()
        checks = ((str(cur_list) == str(inv_cross["List"][cross_index])) and
                  (str(cur_name) == str(inv_cross["Industry"][cross_index])))
        if (checks):
            cross_index += 1
            try:
                cur_value = float(sht0.cell_value(i, num_cols - 1))
            except ValueError:
                continue
            inv_data[cross_index - 1] = cur_value
            # Data is in billions:
            inv_data[cross_index -
                     1] = _INV_IN_FILE_FCTR * inv_data[cross_index - 1]
    #
    for i in xrange(0, inv_cross.shape[0]):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, asset_tree, "INV")
        for j in xrange(0, proportions.shape[1]):
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_ind = asset_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_df = prev_ind.data.dfs["INV"]
            if (sum(prev_df.iloc[0, :]) != 0):
                cur_dfs = ((prev_df / sum(prev_df.iloc[0, :])) *
                           (inv_data[i] * proportions.iloc[1, j]))
                inv_df = cur_ind.data.dfs["Inventories"]
                inv_df["All"] += sum(cur_dfs.iloc[0, :])
                for k in _CORP_NMS:
                    inv_df["Corp"] += cur_dfs[k][0]
                for k in _NCORP_NMS:
                    inv_df["Non-Corp"] += cur_dfs[k][0]
    #
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
def load_soi_farm_prop(data_tree = naics.generate_tree(),
                       blue_tree = None, blueprint = None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    #
    for i in data_tree.enum_inds:
        i.append_dfs((_FARM_DF_NM,
                      pd.DataFrame(np.zeros((1,len(new_farm_cols))), 
                                   columns=new_farm_cols)))
    #
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) * 
                        (float(farm_data["A_sp"][0])/farm_data["A_p"][0]))
    total = farm_data.iloc[0,0] + farm_data.iloc[0,2]
    total_pa = 0
    cur_codes = [111,112]
    proportions = np.zeros(len(cur_codes))
    proportions = naics.get_proportions(cur_codes, data_tree, "PA_assets", 
                                 ["Land (Net)","Depreciable assets (Net)"])
    #
    for i in xrange(0, len(cur_codes)):
        cur_ind = naics.find_naics(data_tree, cur_codes[i])
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] + 
                                cur_df["Depreciable assets (Net)"][0])
    #
    for i in xrange(0,len(cur_codes)):
        cur_ind = naics.find_naics(data_tree, cur_codes[i])
        cur_ind.data.dfs[_FARM_DF_NM]["Land"][0] = (land_mult * 
                            cur_ind.data.dfs["PA_assets"]["Land (Net)"][0]/
                            total_pa)
        cur_ind.data.dfs[_FARM_DF_NM]["FA"][0] = ((proportions.iloc[1,i]*total)
                                    - cur_ind.data.dfs[_FARM_DF_NM]["Land"][0])
    # Default:            
    if blueprint == None and _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 6
0
def read_inventories(asset_tree):
    # Opening BEA's excel file on depreciable assets by industry:
    inv_book = xlrd.open_workbook(_INV_IN_PATH)
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    num_cols = sht0.ncols
    #Find the starting index in worksheet.
    cur_index = naics.search_ws(sht0, 1, 25, True, [0,0], True)
    check_index = naics.search_ws(sht0, "line", 20)
    if(cur_index[1] != check_index[1]):
        print "ERROR"
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(_INV_IN_CROSS_PATH)
    # Creating a tree for the inventory data:
    data_cols = ["All", "Corp", "Non-Corp"]
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories", df_cols=data_cols)
    #
    inv_data = np.zeros(inv_cross.shape[0])
    #
    cross_index = 0
    for i in xrange(cur_index[0], num_rows):
        if(cross_index >= inv_cross.shape[0]):
            break
        cur_list = str(sht0.cell_value(i, cur_index[1])).strip()
        cur_name = str(sht0.cell_value(i, cur_index[1]+1)).strip()
        checks = ((str(cur_list) == str(inv_cross["List"][cross_index])) and 
                    (str(cur_name) == str(inv_cross["Industry"][cross_index])))
        if(checks):
            cross_index += 1
            try:
                cur_value = float(sht0.cell_value(i, num_cols-1))
            except ValueError:
                continue
            inv_data[cross_index-1] = cur_value
            # Data is in billions:
            inv_data[cross_index-1] = _INV_IN_FILE_FCTR * inv_data[cross_index-1]
    #
    for i in xrange(0, inv_cross.shape[0]):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, asset_tree, "INV")
        for j in xrange(0, proportions.shape[1]):
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0,j])]
            prev_ind = asset_tree.enum_inds[int(proportions.iloc[0,j])]
            prev_df = prev_ind.data.dfs["INV"]
            if(sum(prev_df.iloc[0, :]) != 0):
                cur_dfs = ((prev_df/sum(prev_df.iloc[0,:])) *
                                (inv_data[i] * proportions.iloc[1,j]))
                inv_df = cur_ind.data.dfs["Inventories"]
                inv_df["All"] += sum(cur_dfs.iloc[0,:])
                for k in _CORP_NMS:
                    inv_df["Corp"] += cur_dfs[k][0]
                for k in _NCORP_NMS:
                    inv_df["Non-Corp"] += cur_dfs[k][0]
    #
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
Exemplo n.º 7
0
def load_income(data_tree=naics.generate_tree(),
                blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """ This function loads the soi partnership income data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, 
                                        tree=data_tree)
        return data_tree
    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]
    # Initializing dataframe to hold pertinent income/loss data:
    data_df = pd.DataFrame(np.zeros((ws.ncols-start_col,3)), 
                           columns = _INC_PRT_DF_COL_NMS)
    # Extracting the data from the worksheet:
    for row in xrange(0, ws.nrows):
        # Going through each row of excel file, looking for input rows:
        if(_INC_NET_INC_ROW_NM in str(ws.cell_value(row,0)).lower()):
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row+1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row+2, start_col)
            break
        if(_INC_DEPR_ROW_NM in str(ws.cell_value(row,0)).lower()):
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)
    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_df,
                    cross_df=pa01cross, df_nm=_INC_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    
    return data_tree
Exemplo n.º 8
0
def load_pa_05_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constant factor (e.g. data is in thousands):
    pa_05_fctr = 10 ** 3
    # Defining constant list of types of partners:
    cols_05 = ["Corporate general partners", 
               "Corporate limited partners",
               "Individual general partners",
               "Individual limited partners",
               "Partnership general partners",
               "Partnership limited partners",
               "Tax-exempt organization general partners",
               "Tax-exempt organization limited partners",
               "Nominee and other general partners", 
               "Nominee and other limited partners"]
    if data_tree == None:
        data_tree = naics.generate_tree()
    #
    for i in os.listdir(prt_dir):
        if("pa05.xls" in i):
            pa_05_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa05_Crosswalk.csv" in i):
            pa_05_cross_file = os.path.abspath(prt_dir + "\\" + i)
    #
    book_05 = xlrd.open_workbook(pa_05_file)
    sheet_05 = book_05.sheet_by_index(0)
    cur_rows = sheet_05.nrows
    # Extracting the relevant data:
    data_05 = [None]*len(cols_05)
    for i in xrange(0, len(cols_05)):
        for row in xrange(0, cur_rows):
            if(cols_05[i].lower() in str(sheet_05.cell_value(row,0)).lower()):
                data_05[i] = sheet_05.row_values(row,1)
                break
    # Reformatting the data:
    data_05 = pd.DataFrame(data_05).T
    # Data is in thousands of dollars:
    data_05 = data_05 * pa_05_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa05cross = pd.read_csv(pa_05_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_05,
                    cross_df = pa05cross, data_cols = cols_05,
                    df_name = "PA_types"
                    )
    # Defaults:
    if blueprint == None and "PA_inc_loss" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "PA_inc_loss"
    elif blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_types"])
    naics.pop_forward(tree=data_tree, df_list=["PA_types"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 9
0
def main():
    #
    naics_tree = naics.load_naics(naics_codes_file)
    #
    read_wages.load_nipa_wages_ind(data_folder, naics_tree)
    #
    parameters = [read_wages.WAGES]
    #
    naics.pop_back(naics_tree, parameters)
    naics.pop_forward(naics_tree, parameters, None, None, None, True)
    #
    naics.print_tree_dfs(naics_tree, output_folder)
Exemplo n.º 10
0
def get_incs():
    #
    naics_tree = naics.generate_tree()
    #
    read_inc.load_nipa_inc_ind(data_folder, naics_tree)
    read_inc.load_nipa_int_ind(data_folder, naics_tree)
    read_inc.calc_bus_inc(naics_tree)
    #
    parameters = [read_inc.BUS_INC, read_inc.INT_INC, read_inc.FIN_INC]
    #
    naics.pop_back(naics_tree, parameters)
    naics.pop_forward(naics_tree, parameters)
    #
    naics.print_tree_dfs(naics_tree, output_folder)
    return naics_tree
Exemplo n.º 11
0
def get_incs():
    #
    naics_tree = naics.generate_tree()
    #
    read_inc.load_nipa_inc_ind(data_folder, naics_tree)
    read_inc.load_nipa_int_ind(data_folder, naics_tree)
    read_inc.calc_bus_inc(naics_tree)
    #
    parameters = [read_inc.BUS_INC, read_inc.INT_INC, read_inc.FIN_INC]
    #
    naics.pop_back(naics_tree, parameters)
    naics.pop_forward(naics_tree, parameters)
    #
    naics.print_tree_dfs(naics_tree, output_folder)
    return naics_tree
Exemplo n.º 12
0
def load_pa_01_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constants:
    pa_01_fctr = 10 ** 3
    #
    #if data_tree == None:
    #    data_tree = naics.generate_tree()
    # Names of the files with the partnership data:
    for i in os.listdir(prt_dir):
        if("pa01.xls" in i):
            pa_01_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa01_Crosswalk.csv" in i):
            pa_01_cross_file = os.path.abspath(prt_dir + "\\" + i)
    # Inputting data on net income/loss by industry from "**pa01.xls":
    book_01 = xlrd.open_workbook(pa_01_file)
    sheet_01 = book_01.sheet_by_index(0)
    num_rows = sheet_01.nrows
    # The data to be extracted:
    cols_01 = ["Total net income", "Total net loss", "Depreciation"]
    num_cols = sheet_01.ncols
    start_col = naics.search_ws(sheet_01, "All\nindustries", 20)[1]
    data_01 = pd.DataFrame(np.zeros((num_cols-start_col,3)), columns = cols_01)
    # Extracting the data:
    for i in xrange(0, num_rows):
        if("total net income" in str(sheet_01.cell_value(i,0)).lower()):
            data_01["Total net income"] = sheet_01.row_values(i+1,start_col)
            data_01["Total net loss"] = sheet_01.row_values(i+2,start_col)
            break
        if("depreciation" in str(sheet_01.cell_value(i,0)).lower()):
            data_01["Depreciation"] = sheet_01.row_values(i,start_col)
    #
    data_01 = data_01 * pa_01_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(pa_01_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_01,
                    cross_df = pa01cross, df_name = "PA_inc_loss"
                    )
    #
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_inc_loss"])
    naics.pop_forward(tree=data_tree, df_list=["PA_inc_loss"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 13
0
def load_soi_farm_prop(data_tree = None, blue_tree = None, blueprint = None):
    #
    if data_tree == None:
        data_tree = naics.generate_tree()
    #Load Farm Proprietorship data:
    farm_data = pd.read_csv(os.path.abspath(prop_dir + "\\Farm_Data.csv"))
    new_farm_cols = ["Land", "FA"]
    #
    for i in data_tree.enum_inds:
        i.append_dfs(("farm_prop",
                      pd.DataFrame(np.zeros((1,len(new_farm_cols))), 
                                   columns=new_farm_cols)))
    #
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) * 
                        (float(farm_data["A_sp"][0])/farm_data["A_p"][0]))
    total = farm_data.iloc[0,0] + farm_data.iloc[0,2]
    total_pa = 0
    cur_codes = [111,112]
    proportions = np.zeros(len(cur_codes))
    proportions = naics.get_proportions(cur_codes, data_tree, "PA_assets", 
                                 ["Land (Net)","Depreciable assets (Net)"])
    #
    for i in xrange(0, len(cur_codes)):
        cur_ind = naics.find_naics(data_tree, cur_codes[i])
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] + 
                                cur_df["Depreciable assets (Net)"][0])
    #
    for i in xrange(0,len(cur_codes)):
        cur_ind = naics.find_naics(data_tree, cur_codes[i])
        cur_ind.data.dfs["farm_prop"]["Land"][0] = (land_mult * 
                            cur_ind.data.dfs["PA_assets"]["Land (Net)"][0]/
                            total_pa)
        cur_ind.data.dfs["farm_prop"]["FA"][0] = ((proportions.iloc[1,i]*total)
                                    - cur_ind.data.dfs["farm_prop"]["Land"][0])
    # Default:            
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["farm_prop"])
    naics.pop_forward(tree=data_tree, df_list=["farm_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 14
0
def load_soi_tot_corp(data_tree=naics.generate_tree(),
                      cols_dict=_DFLT_TOT_CORP_COLS_DICT,
                      blueprint=None,
                      blue_tree=None,
                      from_out=False,
                      output_path=_TOT_CORP_OUT_PATH):
    """ This function pulls the soi total corporation data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=output_path, tree=data_tree)
        return data_tree
    # Pertinent information:
    num_inds = len(data_tree.enum_inds)  # Number of industries in NAICS tree.
    data_cols = cols_dict.keys()  # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print "IOError: Tot-Corp soi data file not found."
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree:
    enum_index = 0
    for code_num in np.unique(tot_corp_data[_NAICS_COL_NM]):
        # Find the industry with a code that matches "code_num":
        ind_found = False
        for i in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = data_tree.enum_inds[enum_index]
            cur_dfs = cur_ind.data.dfs[cst.CODE_DF_NM]
            for j in range(0, cur_dfs.shape[0]):
                if (cur_dfs.iloc[j, 0] == code_num):
                    # Industry with the matching code has been found:
                    ind_found = True
                    cur_dfs = cur_ind.data.dfs[_TOT_DF_NM]
                    break
            # If the matching industry has been found stop searching for it:
            if ind_found:
                break
        # If no match was found, then ignore data.
        if not ind_found:
            continue
        # Indicators for if rows in tot_corp_data match current industry code:
        indicators = (tot_corp_data[_NAICS_COL_NM] == code_num)
        # Calculating the data:
        for j in cols_dict:
            # Some of the data may not be reported:
            if cols_dict[j] == "":
                cur_dfs[j] = 0
            else:
                # Note: double counting the data in the original dataset.
                cur_dfs[j][0] = sum(
                    indicators * tot_corp_data[cols_dict[j]]) / 2.0
                cur_dfs[j][0] = cur_dfs[j] * _TOT_CORP_IN_FILE_FCTR
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree,
                      df_list=[_TOT_DF_NM],
                      blueprint=blueprint,
                      blue_tree=blue_tree)
    return data_tree
Exemplo n.º 15
0
def load_asset(data_tree=naics.generate_tree(),
             blue_tree=None, blueprint=None,
             from_out=False):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_file=_AST_OUT_PATH,
                                        tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols-1,len(df_cols))), columns=df_cols)
    ''' Extracting the data (note that the rows with total data appear first).
    For each input row:'''
    for in_row_nm in _AST_IN_ROW_NMS:
        # Key corresponding to total asset column:
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        # Asset dataframes net income column name:
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Key corresponding to assets of net income partnerships column:
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        # Asset dataframes total income column name:
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        in_row_nm = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in xrange(0, num_rows):
            in_net_row_nm = str(ws.cell_value(in_row1,0)).lower()
            if(in_row_nm in in_net_row_nm):
                # Total asset data:
                ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
                # Finding the second input row with in_row_nm:
                for in_row2 in xrange(in_row1+1, num_rows):
                    in_inc_row_nm = str(ws.cell_value(in_row2,0)).lower()
                    if(in_row_nm in in_inc_row_nm):
                        # Asset data for companies with net income:
                        ast_df[df_inc_col_nm] = ws.row_values(in_row2,1)
                        break
                break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=ast_df,
                    cross_df=ast_cross, df_nm=_AST_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
Exemplo n.º 16
0
def load_soi_nonfarm_prop(data_tree=naics.generate_tree(), 
                          blue_tree=None, blueprint=None, 
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0,0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws,_DDCT_COL2, 20,
                           True, np.array(pos2) + np.array([0,1]))
    #
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    #
    cross_index = cross.shape[0]-1
    enum_index = len(data_tree.enum_inds)-1
    for i in xrange(pos1[0],ws.nrows):
        cur_cell = str(ws.cell_value(i,pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cross.shape[0]):
            cross_index = (cross_index+1) % cross.shape[0]
            cur_ind_name = str(cross.iloc[cross_index,0]).lower().strip()
            if(cur_cell == cur_ind_name):
                if pd.isnull(cross.iloc[cross_index,1]):
                    continue
                ind_codes = str(cross.iloc[cross_index,1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index+1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(ind_codes, cur_codes.iloc[:,0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions 
                                        * (ws.cell_value(i,pos2[1]) 
                                        + ws.cell_value(i,pos3[1])))
            if(tot_proportions == 1):
                break
    # Default:
    if blueprint == None and _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 17
0
def load_soi_tot_corp(data_tree=naics.generate_tree(),
                      cols_dict=_DFLT_TOT_CORP_COLS_DICT, 
                      blueprint=None, blue_tree=None,
                      from_out=False, output_path=_TOT_CORP_OUT_PATH):
    """ This function pulls the soi total corporation data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=output_path, tree=data_tree)
        return data_tree
    # Pertinent information:
    num_inds = len(data_tree.enum_inds) # Number of industries in NAICS tree.
    data_cols = cols_dict.keys() # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print "IOError: Tot-Corp soi data file not found."
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree:
    enum_index = 0
    for code_num in np.unique(tot_corp_data[_NAICS_COL_NM]):
        # Find the industry with a code that matches "code_num":
        ind_found = False
        for i in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = data_tree.enum_inds[enum_index]
            cur_dfs = cur_ind.data.dfs[cst.CODE_DF_NM]
            for j in range(0, cur_dfs.shape[0]):
                if(cur_dfs.iloc[j,0] == code_num):
                    # Industry with the matching code has been found:
                    ind_found = True
                    cur_dfs = cur_ind.data.dfs[_TOT_DF_NM]
                    break
            # If the matching industry has been found stop searching for it:
            if ind_found:
                break
        # If no match was found, then ignore data.
        if not ind_found:
            continue
        # Indicators for if rows in tot_corp_data match current industry code:
        indicators = (tot_corp_data[_NAICS_COL_NM] == code_num)
        # Calculating the data:
        for j in cols_dict:
            # Some of the data may not be reported:
            if cols_dict[j] == "":
                cur_dfs[j] = 0
            else:
                # Note: double counting the data in the original dataset.
                cur_dfs[j][0] = sum(indicators * tot_corp_data[cols_dict[j]])/2.0
                cur_dfs[j][0] = cur_dfs[j] * _TOT_CORP_IN_FILE_FCTR
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TOT_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
Exemplo n.º 18
0
# Reading in the SOI Tax Stats-Corporation Data:
naics.load_soi_corporate_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Partnership Data:
naics.load_soi_partner_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Proprietorship Data:
naics.load_soi_proprietor_data(data_tree, data_folder)

'''
Many industries are not listed in the SOI datasets. The data for these missing
    industries are interpolated.
'''
# Get a list of the names of all the pd dfs besides the list of codes:
cur_names = data_tree.enum_inds[0].data.dfs.keys()
cur_names.remove("Codes:")
# Populate missing industry data backwards throught the tree:
naics.pop_back(data_tree, cur_names)
# Populate the missing total corporate data forwards through the tree:
naics.pop_forward(data_tree, ["tot_corps"])
# Populate other missing data using tot_corps as a "blueprint":
cur_names = ["c_corps", "s_corps", "PA_inc/loss", "PA_assets", "soi_prop"]
naics.pop_forward(data_tree, cur_names, "tot_corps")
# Populate pa05 using pa01:
naics.pop_forward(data_tree, ["PA_types"], "PA_inc/loss")
#
naics.pop_back(data_tree, ["farm_prop"])
naics.pop_forward(data_tree, ["farm_prop"], "tot_corps")

#Create an output tree containing only the final data on FA, INV, and LAND.
output_tree = naics.summary_tree(data_tree, data_folder)

# Create a tree with all the FA's broken down by type of asset:
Exemplo n.º 19
0
def read_bea(asset_tree):
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if (sht_pos == [-1, -1]):
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0, 0],
                                  True)
        sht_pos[1] = sht_pos[1] - 1
    if (sht_pos == [-1, -1]):
        print "Error in reading BEA fixed asset \"readme\" sheet."
        return None
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        #if(str(bea_readme.cell_value(cur_row, cur_col)) != ""):
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') !=
                ""):
            # for rownum in xrange(sh.nrows):
            #wr.writerow([unicode(c).encode('utf8') for c in sh.row_values(rownum)])
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets:
    bea_codes1 = np.zeros(num_shts - 1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index - 1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet:
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') !=
                ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col + 1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk:
    naics_counter = 0
    for i in range(0, num_shts - 2):
        for cur_row in range(sht_pos[0] + 1, bea_readme.nrows):
            bea_code = unicode(bea_readme.cell_value(cur_row, cur_col +
                                                     1)).encode('utf8')
            if (str(bea_codes1[i]) == bea_code):
                bea_ind = unicode(bea_readme.cell_value(
                    cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_inds[i] = bea_ind
                bea_chart["BEA Code"][i] = bea_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if (naics_inds[naics_counter] == bea_chart["Industry"][i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
            # If they match except one has ".0" at the end:
            elif (str(bea_codes1[i]) == str(
                    bea_readme.cell_value(cur_row, cur_col + 1))[:-2]):
                bea_ind = unicode(bea_readme.cell_value(
                    cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_chart["Industry"][i] = bea_ind
                cur_code = str(bea_readme.cell_value(cur_row,
                                                     cur_col + 1))[:-2]
                bea_chart["BEA Code"][i] = cur_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if (naics_inds[naics_counter] == bea_inds[i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
    # Initializing the table of assets:
    #cur_sht = bea_book.sheet_by_name(bea_chart["BEA Code"][0])
    #sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
    bea_table = pd.DataFrame(np.zeros(
        (asset_list.shape[0], bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, calculating
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(
                0, len(asset_list)):  #xrange(sht_pos[0]+2, cur_sht.nrows):
            cur_asset = asset_list.iloc[j, 0]
            for k in xrange(sht_pos[0] + 2, cur_sht.nrows):
                cur_cell = unicode(cur_sht.cell_value(k, sht_pos[1] +
                                                      1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if (cur_asset == cur_cell):
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
        #bea_table[i] = np.array(cur_sht.col_values(cur_sht.ncols-1, sht_pos[0]+2, cur_sht.nrows))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(
            ("All",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Corp",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Non-Corp",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table:
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, asset_tree,
                                                "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(cur_codes, asset_tree, "FA",
                                                 _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(cur_codes, asset_tree,
                                                     "FA",
                                                     _NCORP_NMS).iloc[1, :]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = (enum_index + 1) % len(fixed_asset_tree.enum_inds)
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                if (sum(out_dfs["FA"].iloc[0, :]) == 0):
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0] /
                                   sum(out_dfs["FA"].iloc[0, :]))
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0] /
                                       sum(out_dfs["FA"].iloc[0, :]))
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if (share == 0):
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs[
                    "All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0,
                                             k] = (bea_table.iloc[k, i] *
                                                   all_ratio *
                                                   all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                break
            if (tot_share == 1):
                break
    #
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["All"],
                      blueprint="FA",
                      blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["Corp"],
                      blueprint="FA",
                      blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["Non-Corp"],
                      blueprint="FA",
                      blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree
Exemplo n.º 20
0
def load_soi_nonfarm_prop(data_tree=naics.generate_tree(),
                          blue_tree=None,
                          blueprint=None,
                          from_out=False,
                          out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:
    
    :param data_tree: The NAICS tree to read the data into.
    :param cols_dict: A dictionary mapping dataframe columns to the name of
           the column names in the input file
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0, 0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20, True,
                           np.array(pos2) + np.array([0, 1]))
    #
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    #
    cross_index = cross.shape[0] - 1
    enum_index = len(data_tree.enum_inds) - 1
    for i in xrange(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cross.shape[0]):
            cross_index = (cross_index + 1) % cross.shape[0]
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if (cur_cell == cur_ind_name):
                if pd.isnull(cross.iloc[cross_index, 1]):
                    continue
                ind_codes = str(cross.iloc[cross_index, 1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index + 1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(
                        ind_codes, cur_codes.iloc[:, 0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions *
                                   (ws.cell_value(i, pos2[1]) +
                                    ws.cell_value(i, pos3[1])))
            if (tot_proportions == 1):
                break
    # Default:
    if blueprint == None and _TOT_CORP_DF_NM in data_tree.enum_inds[
            0].data.dfs.keys():
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree,
                      df_list=[_NFARM_DF_NM],
                      blueprint=blueprint,
                      blue_tree=blue_tree)
    #
    return data_tree
def init_depr_rates(data_tree=naics.generate_tree(), get_econ=False, 
                    get_tax_est=False, get_tax_150=False,
                    get_tax_200=False, get_tax_sl=False,
                    get_tax_ads=False, soi_from_out=False,
                    output_data=False):
    # Reading in the SOI Tax Stats-Corporation data:
    soi_tree = naics.generate_tree()
    soi_tree = read_soi.load_corporate(soi_tree=soi_tree, 
                                       from_out=soi_from_out,
                                       output_data=(not soi_from_out))
    # Reading in the SOI Tax Stats-Partnership data:
    soi_tree = read_soi.load_partner(soi_tree=soi_tree, 
                                     from_out=soi_from_out,
                                     output_data=(not soi_from_out))
    # Reading in the SOI Tax Stats-Proprietorship data:
    soi_tree = read_soi.load_soi_proprietorship(soi_tree=soi_tree, 
                                                from_out=soi_from_out,
                                                output_data=(not soi_from_out))
    '''
    Many industries are not listed in the SOI datasets. The data for these missing
        industries are interpolated.
    '''
    # Get a list of the names of all the pd dfs besides the list of codes:
    #cur_names = soi_tree.enum_inds[0].data.dfs.keys()
    #cur_names.remove(_CODE_DF_NM)
    # Populate missing industry data backwards throught the tree:
    #naics.pop_back(data_tree, cur_names)
    # Populate the missing total corporate data forwards through the tree:
    #naics.pop_forward(data_tree, ["tot_corps"])
    # Populate other missing data using tot_corps as a "blueprint":
    #cur_names = ["c_corps", "s_corps", "PA_inc_loss", "PA_assets", "soi_prop"]
    #naics.pop_forward(data_tree, cur_names, "tot_corps")
    # Calculate c_corps data:
    #read_soi.calc_c_corp(data_tree)
    #naics.pop_back(data_tree,["c_corps"])
    #naics.pop_forward(data_tree, ["c_corps"], "tot_corps")
    # Populate pa05 using pa01:
    #naics.pop_forward(data_tree, ["PA_types"], "PA_inc_loss")
    #
    #naics.pop_back(data_tree, ["farm_prop"])
    #naics.pop_forward(data_tree, ["farm_prop"], "tot_corps")
    
    #Create an output tree containing only the final data on FA, INV, and LAND.
    output_tree = calc_assets.summary_tree(data_tree, _DATA_DIR)
    # Create a tree with all the FA's broken down by type of asset:
    asset_tree = read_bea.read_bea(output_tree, _DATA_DIR)
    naics.pop_back(asset_tree, ["All", "Corp", "Non-Corp"])
    #
    corp_types = ["C Corporations",
                  "Corporate general partners", 
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners", 
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    naics.pop_forward(asset_tree, ["All"], "FA", output_tree)
    naics.pop_forward(asset_tree, ["Corp"], "FA", output_tree, corp_types)
    naics.pop_forward(asset_tree, ["Non-Corp"], "FA", output_tree, non_corp_types)
    #
    inv_tree = read_inv.read_inventories(output_tree, _DATA_DIR)
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"])
    #
    land_tree = read_land.read_land(output_tree, _DATA_DIR)
    naics.pop_back(land_tree, ["Land"])
    naics.pop_forward(land_tree, ["Land"], "LAND", output_tree)
    #
    econ_depr_tree = calc_rates.calc_depr_rates(asset_tree, inv_tree, land_tree, _DATA_DIR)
    tax_depr_tree = calc_rates.calc_tax_depr_rates(asset_tree, inv_tree, land_tree, _DATA_DIR)
    naics.pop_rates(tax_depr_tree)
    return {"Econ": econ_depr_tree, "Tax": tax_depr_tree}
Exemplo n.º 22
0
def load_soi_tot_corp(data_tree = None, cols_dict = None, 
                      blue_tree = None, blueprint = None):
    """This function pulls SOI total corporate data.

    :param data_tree: A string to be converted?
    :returns: A bar formatted string?huh
    """
    if data_tree == None:
        data_tree = naics.generate_tree()
    # The aggregate 1120 filings data for all corporations:
    tot_corp_file = ""
    for i in os.listdir(corp_dir):
        if(i[4:] == "sb1.csv"):
            tot_corp_file = os.path.abspath(corp_dir + "\\" + i)
            break
    try:
        tot_corp_data = pd.read_csv(tot_corp_file).fillna(0)
    except IOError:
        print "IOError: Could not find tot-corp soi data file."
        return None
    # Listing the relevant columns that are being extracted from the dataset:
    if cols_dict == None:
        # Default:
        cols_dict = dict([("Depreciable Assets","DPRCBL_ASSTS"),
                      ("Accumulated Depreciation", "ACCUM_DPR"),
                      ("Land", "LAND"),
                      ("Inventories", "INVNTRY"),
                      ("Interest Paid", "INTRST_PD"), 
                      ("Capital Stock", "CAP_STCK"),
                      ("Additional paid-in Capital", "PD_CAP_SRPLS"),
                      ("Earnings (rtnd appr)", "RTND_ERNGS_APPR"),
                      ("Earnings (rtnd unappr.)", "COMP_RTND_ERNGS_UNAPPR"),
                      ("Cost of Treasury Stock", "CST_TRSRY_STCK")])
    data_cols = cols_dict.keys()
    # Initializing data on all corporations:
    for i in data_tree.enum_inds:
        i.append_dfs(("tot_corps", pd.DataFrame(np.zeros((1,len(data_cols))),
                                                columns = data_cols)))
    # Loading total-corporation data:
    enum_index = 0
    for code_num in np.unique(tot_corp_data["INDY_CD"]):
        # Find the industry with a code that matches "code_num":
        ind_found = False
        for i in range(0, len(data_tree.enum_inds)):
            enum_index = (enum_index + 1) % len(data_tree.enum_inds)
            cur_dfs = data_tree.enum_inds[i].data.dfs["Codes:"]
            for j in range(0, cur_dfs.shape[0]):
                if(cur_dfs.iloc[j,0] == code_num):
                    # Industry with the matching code has been found:
                    ind_found = True
                    cur_dfs = data_tree.enum_inds[i].data.dfs["tot_corps"]
                    break
            # If the matching industry has been found stop searching for it.
            if ind_found:
                break
        # If no match was found, then ignore data.
        if not ind_found:
            continue
        # Indicators for if rows in tot_corp_data match current industry code:
        indicators = (tot_corp_data["INDY_CD"] == code_num)
        # Filling in every column in the dataframe:
        for j in cols_dict:
            cur_dfs[j][0] = sum(indicators * tot_corp_data[cols_dict[j]])
    #
    naics.pop_back(tree=data_tree, df_list=["tot_corps"])
    naics.pop_forward(tree=data_tree, df_list=["tot_corps"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 23
0
def load_soi_prop_data(data_tree = None, blue_tree = None, blueprint = None):
    #
    prop_fctr = 10**3
    #
    if data_tree == None:
        data_tree = naics.generate_tree()
    # Finding the "\**sp01br" file in the proprietorships folder:
    for i in os.listdir(prop_dir):
        if(i[2:] == "sp01br.xls"):
            sp01br_file = os.path.abspath(prop_dir + "\\" + i)
        if(i[2:] == "sp01br_Crosswalk.csv"):
            sp01br_cross_file = os.path.abspath(prop_dir + "\\" + i)
    # Opening nonfarm proprietor data:
    cur_wb = xlrd.open_workbook(sp01br_file)
    cur_ws = cur_wb.sheet_by_index(0)
    cur_cross = pd.read_csv(sp01br_cross_file)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(cur_ws,"Industrial sector",20, True, [0,0], True)
    pos2 = naics.search_ws(cur_ws,"Depreciation\ndeduction",20)
    pos3 = naics.search_ws(cur_ws,"Depreciation\ndeduction",20,
                         True, np.array(pos2) + np.array([0,1]))
    #
    for i in data_tree.enum_inds:
        i.append_dfs(("soi_prop", pd.DataFrame(np.zeros((1,1)),
                                    columns = ["Depr Deductions"])))
    #
    cross_index = cur_cross.shape[0]-1
    enum_index = len(data_tree.enum_inds)-1
    for i in xrange(pos1[0],cur_ws.nrows):
        cur_cell = str(cur_ws.cell_value(i,pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cur_cross.shape[0]):
            cross_index = (cross_index+1) % cur_cross.shape[0]
            cur_ind_name = str(cur_cross.iloc[cross_index,0]).lower().strip()
            if(cur_cell == cur_ind_name):
                if pd.isnull(cur_cross.iloc[cross_index,1]):
                    continue
                ind_codes = str(cur_cross.iloc[cross_index,1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index+1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs["Codes:"]
                    #
                    #print ind_codes
                    #print cur_codes
                    cur_proportions = naics.compare_codes(ind_codes, cur_codes.iloc[:,0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    cur_dfs = cur_data.dfs["soi_prop"]["Depr Deductions"]
                    cur_dfs[0] += (prop_fctr * cur_proportions 
                                        * (cur_ws.cell_value(i,pos2[1]) 
                                        + cur_ws.cell_value(i,pos3[1])))
            if(tot_proportions == 1):
                break
    # Default:
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["soi_prop"])
    naics.pop_forward(tree=data_tree, df_list=["soi_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 24
0
def read_bea(asset_tree):
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if(sht_pos == [-1,-1]):
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0,0], True)
        sht_pos[1] = sht_pos[1] - 1
    if(sht_pos == [-1,-1]):
        print "Error in reading BEA fixed asset \"readme\" sheet."
        return None
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        #if(str(bea_readme.cell_value(cur_row, cur_col)) != ""):
        if(unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') != ""):    
       # for rownum in xrange(sh.nrows):
    #wr.writerow([unicode(c).encode('utf8') for c in sh.row_values(rownum)])    
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets:
    bea_codes1 = np.zeros(num_shts-1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index-1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet:
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if(unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') != ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i,0] = asset_list.iloc[i,0].replace("\xa0", " ")
        asset_list.iloc[i,0] = asset_list.iloc[i,0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry","BEA Code","NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2,3), dtype=object),
                             columns = chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk:
    naics_counter = 0
    #for i in range(0, num_shts-2):
    i = 0
    for cur_row in range(sht_pos[0]+1, bea_readme.nrows):
        bea_code = unicode(bea_readme.cell_value(cur_row,cur_col+1)).encode('utf8')
        if(str(bea_codes1[i]) == bea_code):
            bea_ind = unicode(bea_readme.cell_value(cur_row,cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_inds[i] = bea_ind
            bea_chart["BEA Code"][i] = bea_code
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if(naics_inds[naics_counter] == bea_chart["Industry"][i]):
                   bea_naics[i] = naics_cross["NAICS"][naics_counter]
                   i += 1
                   break
        # If they match except one has ".0" at the end:
        elif(str(bea_codes1[i]) == str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]):
            bea_ind = unicode(bea_readme.cell_value(cur_row,cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_chart["Industry"][i] = bea_ind
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]
            bea_chart["BEA Code"][i] = cur_code
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if(naics_inds[naics_counter] == bea_inds[i]):
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    i += 1
                    break
    # Initializing the table of assets:
    #cur_sht = bea_book.sheet_by_name(bea_chart["BEA Code"][0])
    #sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])), 
                             columns = bea_chart["BEA Code"])
    # For each industry, calculating 
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)): #xrange(sht_pos[0]+2, cur_sht.nrows):
            cur_asset = asset_list.iloc[j,0]
            for k in xrange(sht_pos[0]+2, cur_sht.nrows):
                cur_cell = unicode(cur_sht.cell_value(k, sht_pos[1]+1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if(cur_asset == cur_cell):
                    bea_table[i][j] = float(cur_sht.cell_value(k, cur_sht.ncols-1))
        #bea_table[i] = np.array(cur_sht.col_values(cur_sht.ncols-1, sht_pos[0]+2, cur_sht.nrows))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(("All", 
                pd.DataFrame(np.zeros((1, asset_list.shape[0])), 
                             columns = asset_list.iloc[:,0])))
        fixed_asset_tree.enum_inds[i].data.append(("Corp", 
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
        fixed_asset_tree.enum_inds[i].data.append(("Non-Corp", 
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
    # Fill in data from BEA's fixed asset table:
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, asset_tree, 
                                          "FA").iloc[1,:]
        corp_proportions = naics.get_proportions(cur_codes, asset_tree, "FA", 
                                           _CORP_NMS).iloc[1,:]
        non_corp_proportions = naics.get_proportions(cur_codes, asset_tree, 
                                               "FA", _NCORP_NMS).iloc[1,:]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(fixed_asset_tree.enum_inds)
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                if(sum(out_dfs["FA"].iloc[0,:]) == 0):
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0]/
                                        sum(out_dfs["FA"].iloc[0,:]))
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0]/
                                            sum(out_dfs["FA"].iloc[0, :]))
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:,0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if(share == 0):
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs["All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            all_ratio*
                                            all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            corp_ratio*
                                            corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            non_corp_ratio*
                                            non_corp_proportions[code_index])
                break
            if(tot_share == 1):
                break
    #
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree, 
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree
Exemplo n.º 25
0
def load_pa_03_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constants:
    pa_03_fctr = 10 ** 3
    #
    if data_tree == None:
        data_tree = naics.generate_tree()
    #
    for i in os.listdir(prt_dir):
        if("pa03.xls" in i):
            pa_03_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa03_Crosswalk.csv" in i):
            pa_03_cross_file = os.path.abspath(prt_dir + "\\" + i)
    # Inputting data on depreciable fixed assets, inventories, and land:
    book_03 = xlrd.open_workbook(pa_03_file)
    sheet_03 = book_03.sheet_by_index(0)
    # Finding the relevant details about the table, e.g. dimensions:
    cur_rows = sheet_03.nrows
    # The following categories of data to be extracted:
    cols_03 = ["Depreciable assets (Net)", "Accumulated depreciation (Net)", 
               "Inventories (Net)", "Land (Net)", 
               "Depreciable assets (Income)", 
               "Accumulated depreciation (Income)", "Inventories (Income)",
               "Land (Income)"]
    # The more general column names that are used in the input file:
    gen_03 = ["Depreciable assets", "Accumulated depreciation", 
                    "Inventories", "Land"]
    # The data to be extracted on partnerships as a whole:
    tot_data_03 = [None]*len(gen_03)
    # The data to be extracted on partnerships with income:
    inc_data_03 = [None]*len(gen_03)
    # Extracting the data (note that the rows with total data appear first):
    for i in xrange(0, len(gen_03)):
        for row1 in xrange(0, cur_rows):
            if(gen_03[i].lower() in str(sheet_03.cell_value(row1,0)).lower()):
                tot_data_03[i] = sheet_03.row_values(row1,1)
                for row2 in xrange(row1+1, cur_rows):
                    cur_cell = str(sheet_03.cell_value(row2,0)).lower()
                    if(gen_03[i].lower() in cur_cell):
                        inc_data_03[i] = sheet_03.row_values(row2,1)
                        break
                break
    # Reformatting the data:
    data_03 = pd.concat([pd.DataFrame(tot_data_03).T,
                         pd.DataFrame(inc_data_03).T], axis = 1)
    # Data is in the thousands:
    data_03 = data_03 * pa_03_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa03cross = pd.read_csv(pa_03_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_03,
                    cross_df = pa03cross, data_cols = cols_03,
                    df_name = "PA_assets"
                    )
    #
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_assets"])
    naics.pop_forward(tree=data_tree, df_list=["PA_assets"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
Exemplo n.º 26
0
def load_asset(data_tree=naics.generate_tree(),
             blue_tree=None, blueprint=None,
             from_out=False, out_path=None):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _AST_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path,
                                        tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols-1,len(df_cols))), columns=df_cols)
    ''' Extracting the data (note that the rows with total data appear first).
    For each input row:'''
    for in_row_nm in _AST_IN_ROW_NMS:
        # Key corresponding to total asset column:
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        # Asset dataframes net income column name:
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Key corresponding to assets of net income partnerships column:
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        # Asset dataframes total income column name:
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        in_row_nm = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in xrange(0, num_rows):
            in_net_row_nm = str(ws.cell_value(in_row1,0)).lower()
            if(in_row_nm in in_net_row_nm):
                # Total asset data:
                ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
                # Finding the second input row with in_row_nm:
                for in_row2 in xrange(in_row1+1, num_rows):
                    in_inc_row_nm = str(ws.cell_value(in_row2,0)).lower()
                    if(in_row_nm in in_inc_row_nm):
                        # Asset data for companies with net income:
                        ast_df[df_inc_col_nm] = ws.row_values(in_row2,1)
                        break
                break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=ast_df,
                    cross_df=ast_cross, df_nm=_AST_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
Exemplo n.º 27
0
data_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv")
# Reading in the SOI Tax Stats-Corporation Data:
naics.load_soi_corporate_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Partnership Data:
naics.load_soi_partner_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Proprietorship Data:
naics.load_soi_proprietor_data(data_tree, data_folder)
'''
Many industries are not listed in the SOI datasets. The data for these missing
    industries are interpolated.
'''
# Get a list of the names of all the pd dfs besides the list of codes:
cur_names = data_tree.enum_inds[0].data.dfs.keys()
cur_names.remove("Codes:")
# Populate missing industry data backwards throught the tree:
naics.pop_back(data_tree, cur_names)
# Populate the missing total corporate data forwards through the tree:
naics.pop_forward(data_tree, ["tot_corps"])
# Populate other missing data using tot_corps as a "blueprint":
cur_names = ["c_corps", "s_corps", "PA_inc/loss", "PA_assets", "soi_prop"]
naics.pop_forward(data_tree, cur_names, "tot_corps")
# Populate pa05 using pa01:
naics.pop_forward(data_tree, ["PA_types"], "PA_inc/loss")
#
naics.pop_back(data_tree, ["farm_prop"])
naics.pop_forward(data_tree, ["farm_prop"], "tot_corps")

#Create an output tree containing only the final data on FA, INV, and LAND.
output_tree = naics.summary_tree(data_tree, data_folder)

# Create a tree with all the FA's broken down by type of asset: