Python search_ws 예제들, naics_processing.search_ws Python 예제들

예제 #1

0

파일 보기

def read_inventories(asset_tree):
    """Build a NAICS tree of inventories from the BEA inventory worksheet.

    The first sheet of the workbook at ``_INV_IN_PATH`` is matched row by
    row against the crosswalk at ``_INV_IN_CROSS_PATH``. Each matched value
    is then spread over the industries returned by
    ``naics.get_proportions``, weighted by the "INV" dataframes found in
    ``asset_tree``.

    :param asset_tree: A NAICS tree whose industries hold an "INV" dataframe.
    :returns: The tree made by ``naics.generate_tree()`` with an
        "Inventories" dataframe (columns "All", "Corp", "Non-Corp") added
        through ``append_all`` and then populated.
    """
    workbook = xlrd.open_workbook(_INV_IN_PATH)
    sheet = workbook.sheet_by_index(0)
    n_rows = sheet.nrows
    last_col = sheet.ncols - 1
    # Locate the first data cell and compare its column with the column of
    # the "line" header; a mismatch is only reported, not treated as fatal.
    start = naics.search_ws(sheet, 1, 25, True, [0, 0], True)
    header = naics.search_ws(sheet, "line", 20)
    if start[1] != header[1]:
        print("ERROR")
    # Crosswalk between worksheet rows and NAICS codes:
    crosswalk = pd.read_csv(_INV_IN_CROSS_PATH)
    n_cross = crosswalk.shape[0]
    # Tree that will receive the inventory data:
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories",
                        df_cols=["All", "Corp", "Non-Corp"])
    inv_data = np.zeros(n_cross)
    # Walk down the sheet, consuming crosswalk entries in order as their
    # list number and industry name are found.
    list_col = start[1]
    matched = 0
    for row in xrange(start[0], n_rows):
        if matched >= n_cross:
            break
        row_list = str(sheet.cell_value(row, list_col)).strip()
        row_name = str(sheet.cell_value(row, list_col + 1)).strip()
        if row_list != str(crosswalk["List"][matched]):
            continue
        if row_name != str(crosswalk["Industry"][matched]):
            continue
        slot = matched
        matched += 1
        try:
            raw_value = float(sheet.cell_value(row, last_col))
        except ValueError:
            # Non-numeric cell: the entry is consumed but stays at zero.
            continue
        # Scale the worksheet value by the file's unit factor:
        inv_data[slot] = _INV_IN_FILE_FCTR * raw_value
    # Distribute every crosswalk entry over its industries.
    for entry in xrange(0, n_cross):
        codes = crosswalk["NAICS"][entry].strip().split(".")
        proportions = naics.get_proportions(codes, asset_tree, "INV")
        for col in xrange(0, proportions.shape[1]):
            ind_pos = int(proportions.iloc[0, col])
            target = inv_tree.enum_inds[ind_pos]
            weights = asset_tree.enum_inds[ind_pos].data.dfs["INV"]
            weight_total = sum(weights.iloc[0, :])
            if weight_total == 0:
                # Nothing to weight by for this industry.
                continue
            scaled = ((weights / weight_total) *
                      (inv_data[entry] * proportions.iloc[1, col]))
            inv_df = target.data.dfs["Inventories"]
            inv_df["All"] += sum(scaled.iloc[0, :])
            for name in _CORP_NMS:
                inv_df["Corp"] += scaled[name][0]
            for name in _NCORP_NMS:
                inv_df["Non-Corp"] += scaled[name][0]
    # Populate the remaining levels of the tree:
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree

예제 #2

0

파일 보기

파일: read_inventories.py 프로젝트: talumbau/B-Tax

def read_inventories(asset_tree):
    """Read BEA inventory data and return it as a populated NAICS tree.

    Rows of the first worksheet in ``_INV_IN_PATH`` are paired, in order,
    with the rows of the crosswalk csv at ``_INV_IN_CROSS_PATH``. The
    value in the last worksheet column is scaled by ``_INV_IN_FILE_FCTR``
    and then split across industries in proportion to the "INV"
    dataframes held by ``asset_tree``.

    :param asset_tree: NAICS tree providing the "INV" dataframes used as
        weights.
    :returns: A tree from ``naics.generate_tree()`` carrying the filled
        "Inventories" dataframe ("All", "Corp", "Non-Corp").
    """
    # BEA workbook; only its first sheet is used.
    sheet = xlrd.open_workbook(_INV_IN_PATH).sheet_by_index(0)
    total_rows = sheet.nrows
    total_cols = sheet.ncols
    # Starting cell of the data, checked against the "line" header column.
    first_cell = naics.search_ws(sheet, 1, 25, True, [0, 0], True)
    line_header = naics.search_ws(sheet, "line", 20)
    columns_agree = first_cell[1] == line_header[1]
    if not columns_agree:
        print("ERROR")
    cross = pd.read_csv(_INV_IN_CROSS_PATH)
    # Output tree with an empty "Inventories" dataframe:
    tree_cols = ["All", "Corp", "Non-Corp"]
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories", df_cols=tree_cols)
    inv_data = np.zeros(cross.shape[0])
    # "filled" counts how many crosswalk entries have been consumed so far.
    filled = 0
    for row_num in xrange(first_cell[0], total_rows):
        if filled >= cross.shape[0]:
            break
        list_txt = str(sheet.cell_value(row_num, first_cell[1])).strip()
        name_txt = str(sheet.cell_value(row_num, first_cell[1] + 1)).strip()
        same_list = list_txt == str(cross["List"][filled])
        if not (same_list and name_txt == str(cross["Industry"][filled])):
            continue
        filled += 1
        try:
            amount = float(sheet.cell_value(row_num, total_cols - 1))
        except ValueError:
            # Leave the entry at zero when the cell is not a number.
            continue
        else:
            # Apply the unit factor of the input file:
            inv_data[filled - 1] = _INV_IN_FILE_FCTR * amount
    # Allocate each crosswalk value to the matching industries.
    for i in xrange(0, cross.shape[0]):
        naics_codes = cross["NAICS"][i].strip().split(".")
        shares = naics.get_proportions(naics_codes, asset_tree, "INV")
        for j in xrange(0, shares.shape[1]):
            tree_idx = int(shares.iloc[0, j])
            dest_ind = inv_tree.enum_inds[tree_idx]
            src_ind = asset_tree.enum_inds[tree_idx]
            base_df = src_ind.data.dfs["INV"]
            denom = sum(base_df.iloc[0, :])
            if denom != 0:
                amount_for_ind = inv_data[i] * shares.iloc[1, j]
                split_df = (base_df / denom) * amount_for_ind
                dest_df = dest_ind.data.dfs["Inventories"]
                dest_df["All"] += sum(split_df.iloc[0, :])
                for corp_nm in _CORP_NMS:
                    dest_df["Corp"] += split_df[corp_nm][0]
                for ncorp_nm in _NCORP_NMS:
                    dest_df["Non-Corp"] += split_df[ncorp_nm][0]
    # Fill in the other levels of specificity:
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree

예제 #3

0

파일 보기

def load_income(data_tree=None,
                blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """ This function loads the soi partnership income data.

    :param data_tree: The NAICS tree to read the data into. When omitted,
           a fresh tree is created with naics.generate_tree() on every
           call, so separate calls never share (and mutate) one tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
           When omitted, _TOT_CORP_DF_NM is used if the first industry of
           the data tree has a dataframe under that key.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path read when from_out is set; defaults to
           _INC_OUT_PATH.
    :returns: The NAICS tree holding the income data.
    """
    # A default written as ``data_tree=naics.generate_tree()`` would be
    # evaluated once, at definition time, and then shared by all calls.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path,
                                        tree=data_tree)
        return data_tree
    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]
    # Initializing dataframe to hold pertinent income/loss data:
    data_df = pd.DataFrame(np.zeros((ws.ncols-start_col, 3)),
                           columns=_INC_PRT_DF_COL_NMS)
    # Extracting the data from the worksheet:
    for row in xrange(0, ws.nrows):
        # Going through each row of excel file, looking for input rows:
        row_label = str(ws.cell_value(row, 0)).lower()
        if _INC_NET_INC_ROW_NM in row_label:
            # Net income and net loss sit on the two rows below the label.
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row+1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row+2, start_col)
            # The scan stops here, so depreciation is only picked up when
            # its row comes before the net income row.
            break
        if _INC_DEPR_ROW_NM in row_label:
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)
    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_df,
                    cross_df=pa01cross, df_nm=_INC_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)

    return data_tree

예제 #4

0

파일 보기

파일: pull_soi.py 프로젝트: evan-magnusson/dynamic

def load_pa_01_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load partnership net income/loss and depreciation data ("pa01").

    The input workbook and its crosswalk are looked up by name inside the
    module-level directory ``prt_dir``.

    :param data_tree: The NAICS tree to read the data into.
    :param blue_tree: A NAICS tree passed to naics.pop_forward as the
           source of the "blueprint" dataframe.
    :param blueprint: The key of the dataframe used as a blueprint by
           naics.pop_forward. When omitted, "tot_corps" is used if the
           first industry of the tree has a dataframe under that key.
    :returns: The tree returned by naics.load_data_with_cross, populated
           with the "PA_inc_loss" dataframe.
    :raises IOError: If the workbook or the crosswalk is not in prt_dir.
    """
    # The worksheet values are multiplied by this factor:
    pa_01_fctr = 10 ** 3
    # NOTE(review): data_tree is forwarded unchanged even when it is None;
    # confirm that naics.load_data_with_cross accepts a missing tree.
    # Names of the files with the partnership data:
    pa_01_file = None
    pa_01_cross_file = None
    for file_name in os.listdir(prt_dir):
        # os.path.join keeps the lookup working on any platform, not only
        # where "\\" is the path separator.
        if "pa01.xls" in file_name:
            pa_01_file = os.path.abspath(os.path.join(prt_dir, file_name))
        elif "pa01_Crosswalk.csv" in file_name:
            pa_01_cross_file = os.path.abspath(
                                    os.path.join(prt_dir, file_name))
    if pa_01_file is None or pa_01_cross_file is None:
        raise IOError("Could not find the \"pa01.xls\" workbook and/or the "
                      "\"pa01_Crosswalk.csv\" crosswalk in %s" % prt_dir)
    # Inputting data on net income/loss by industry from "**pa01.xls":
    book_01 = xlrd.open_workbook(pa_01_file)
    sheet_01 = book_01.sheet_by_index(0)
    num_rows = sheet_01.nrows
    # The data to be extracted:
    cols_01 = ["Total net income", "Total net loss", "Depreciation"]
    num_cols = sheet_01.ncols
    start_col = naics.search_ws(sheet_01, "All\nindustries", 20)[1]
    data_01 = pd.DataFrame(np.zeros((num_cols-start_col, 3)),
                           columns=cols_01)
    # Extracting the data:
    for i in xrange(0, num_rows):
        row_label = str(sheet_01.cell_value(i, 0)).lower()
        if "total net income" in row_label:
            # Income and loss are read from the two rows below the label.
            data_01["Total net income"] = sheet_01.row_values(i+1, start_col)
            data_01["Total net loss"] = sheet_01.row_values(i+2, start_col)
            # The scan ends here, so depreciation is only read when its
            # row comes before this one.
            break
        if "depreciation" in row_label:
            data_01["Depreciation"] = sheet_01.row_values(i, start_col)
    # Scaling the data:
    data_01 = data_01 * pa_01_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(pa_01_cross_file)
    # Processing the data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_01,
                    cross_df=pa01cross, df_name="PA_inc_loss"
                    )
    # Default blueprint is "tot_corps":
    has_tot_df = "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_inc_loss"])
    naics.pop_forward(tree=data_tree, df_list=["PA_inc_loss"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

예제 #5

0

파일 보기

파일: read_income_data.py 프로젝트: salimfurth/OG-USA

def load_nipa_ind(data_file, cross_file):
    """Pull one worksheet value per crosswalk industry.

    :param data_file: Path of the excel workbook; its first sheet is read.
    :param cross_file: Path of a csv crosswalk with "Industry" and
           "NAICS_Code" columns.
    :returns: A two-column dataframe (columns "NAICS Codes:" and "") that
           pairs every crosswalk code with the value found for its
           industry; the value stays 0 when no usable row was found.
    """
    sheet = xlrd.open_workbook(data_file).sheet_by_index(0)
    crosswalk = pd.read_csv(cross_file)
    n_industries = crosswalk.shape[0]
    values = np.zeros(n_industries)

    # Find the "Line" header, then move down to the row labelled "1".
    start_pos = naics.search_ws(sheet, "Line", 25, True, [0, 0], True)
    line_col = start_pos[1]
    for row in xrange(start_pos[0] + 1, sheet.nrows):
        if str(sheet.cell_value(row, line_col)) == "1":
            start_pos[0] = row
            break

    first_row = start_pos[0]
    name_col = line_col + 1

    # Step left from the last column until the cell on the first data row
    # converts to a float.
    value_col = sheet.ncols - 1
    for _ in xrange(0, sheet.ncols):
        try:
            float(sheet.cell_value(first_row, value_col))
        except ValueError:
            value_col -= 1
        else:
            break

    # The cursor keeps its position between industries and wraps from the
    # last sheet row back to the first data row.
    n_rows = sheet.nrows
    cursor = first_row
    for idx in xrange(0, n_industries):
        for _ in xrange(first_row, n_rows):
            matched = False
            try:
                if (crosswalk["Industry"][idx]
                        in sheet.cell_value(cursor, name_col)):
                    values[idx] = sheet.cell_value(cursor, value_col)
                    matched = True
            except ValueError:
                # The row could not be used; keep searching.
                matched = False
            cursor = first_row + ((cursor + 1 - first_row) %
                                  (n_rows - first_row))
            if matched:
                break

    paired = np.column_stack((crosswalk["NAICS_Code"], values))
    return pd.DataFrame(paired, columns=["NAICS Codes:", ""])

예제 #6

0

파일 보기

파일: read_wages_data.py 프로젝트: chrisrytting/OG-USA

def load_nipa_ind(data_file, cross_file):
    """Look up a value for each crosswalk industry in a NIPA worksheet.

    :param data_file: Path of the excel workbook (first sheet is used).
    :param cross_file: Path of the csv crosswalk; its "Industry" column is
           searched for in the sheet and its "NAICS_Code" column is
           returned alongside the values.
    :returns: A dataframe with the columns "NAICS Codes:" and "", holding
           the crosswalk codes and the values found (0 where none was
           stored).
    """
    data_book = xlrd.open_workbook(data_file)
    data_sht = data_book.sheet_by_index(0)
    data_cross = pd.read_csv(cross_file)
    output = np.zeros(data_cross.shape[0])

    # Header cell "Line"; the data begin on the row labelled "1" below it.
    start_pos = naics.search_ws(data_sht, "Line", 25, True, [0,0], True)
    row_candidates = xrange(start_pos[0]+1, data_sht.nrows)
    for candidate in row_candidates:
        label = str(data_sht.cell_value(candidate, start_pos[1]))
        if label == "1":
            start_pos[0] = candidate
            break

    top = start_pos[0]
    ind_col = start_pos[1] + 1

    def _next_row(row):
        # Following row, wrapping from the sheet's last row back to "top".
        return top + ((row + 1 - top) % (data_sht.nrows - top))

    # Right-most column whose cell on the first data row is numeric:
    data_col = data_sht.ncols - 1
    attempts = 0
    while attempts < data_sht.ncols:
        attempts += 1
        try:
            float(data_sht.cell_value(top, data_col))
            break
        except ValueError:
            data_col -= 1

    cur_row = top
    for i in xrange(0, data_cross.shape[0]):
        for _ in xrange(top, data_sht.nrows):
            # The position always moves on, whatever happens to this row.
            here = cur_row
            cur_row = _next_row(here)
            try:
                industry = data_cross["Industry"][i]
                if industry not in data_sht.cell_value(here, ind_col):
                    continue
                output[i] = data_sht.cell_value(here, data_col)
            except ValueError:
                continue
            break

    stacked = np.column_stack((data_cross["NAICS_Code"], output))
    return pd.DataFrame(stacked, columns = ["NAICS Codes:", ""])

예제 #7

0

파일 보기

파일: pull_soi_proprietorship.py 프로젝트: chrisrytting/OG-USA

def load_soi_nonfarm_prop(data_tree=None,
                          blue_tree=None, blueprint=None,
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:

    :param data_tree: The NAICS tree to read the data into. When omitted,
           a fresh tree is created with naics.generate_tree() on every
           call, so separate calls never share (and mutate) one tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
           When omitted, _TOT_CORP_DF_NM is used if the first industry of
           the data tree has a dataframe under that key.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path read when from_out is set.
    :returns: The NAICS tree holding the nonfarm proprietorship data.
    """
    # A default written as ``data_tree=naics.generate_tree()`` would be
    # evaluated once, at definition time, and then shared by all calls.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0,0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    # The second deduction column is searched for starting one column to
    # the right of the first one.
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20,
                           True, np.array(pos2) + np.array([0,1]))
    #
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both indexes persist across rows and wrap around, so every search
    # resumes just after the previous hit.
    cross_index = cross.shape[0]-1
    enum_index = len(data_tree.enum_inds)-1
    for i in xrange(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cross.shape[0]):
            cross_index = (cross_index+1) % cross.shape[0]
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell == cur_ind_name:
                # Crosswalk rows without codes are skipped:
                if pd.isnull(cross.iloc[cross_index, 1]):
                    continue
                ind_codes = str(cross.iloc[cross_index, 1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index+1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(
                                            ind_codes, cur_codes.iloc[:,0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    # Add this industry's share of the two deduction
                    # columns, scaled by the file's unit factor.
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions
                                        * (ws.cell_value(i, pos2[1])
                                        + ws.cell_value(i, pos3[1])))
            # Stop scanning the crosswalk once the row is fully allocated:
            if tot_proportions == 1:
                break
    # Default blueprint:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

예제 #8

0

파일 보기

파일: pull_soi.py 프로젝트: evan-magnusson/dynamic

def load_soi_prop_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load soi nonfarm proprietorship depreciation deductions.

    The input workbook and its crosswalk are looked up by name inside the
    module-level directory ``prop_dir``.

    :param data_tree: The NAICS tree to read the data into. When omitted,
           a new tree is created with naics.generate_tree().
    :param blue_tree: A NAICS tree passed to naics.pop_forward as the
           source of the "blueprint" dataframe.
    :param blueprint: The key of the dataframe used as a blueprint by
           naics.pop_forward. When omitted, "tot_corps" is used if the
           first industry of the tree has a dataframe under that key.
    :returns: The tree with a "soi_prop" dataframe (column
           "Depr Deductions") on its industries.
    :raises IOError: If the workbook or the crosswalk is not in prop_dir.
    """
    # The worksheet values are multiplied by this factor:
    prop_fctr = 10**3
    #
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Finding the "\**sp01br" file in the proprietorships folder:
    sp01br_file = None
    sp01br_cross_file = None
    for file_name in os.listdir(prop_dir):
        # The first two characters of the file name are ignored.
        # os.path.join keeps the lookup working on any platform, not only
        # where "\\" is the path separator.
        if file_name[2:] == "sp01br.xls":
            sp01br_file = os.path.abspath(os.path.join(prop_dir, file_name))
        if file_name[2:] == "sp01br_Crosswalk.csv":
            sp01br_cross_file = os.path.abspath(
                                    os.path.join(prop_dir, file_name))
    if sp01br_file is None or sp01br_cross_file is None:
        raise IOError("Could not find the \"sp01br.xls\" workbook and/or "
                      "the \"sp01br_Crosswalk.csv\" crosswalk in %s"
                      % prop_dir)
    # Opening nonfarm proprietor data:
    cur_wb = xlrd.open_workbook(sp01br_file)
    cur_ws = cur_wb.sheet_by_index(0)
    cur_cross = pd.read_csv(sp01br_cross_file)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(cur_ws, "Industrial sector", 20, True, [0,0], True)
    pos2 = naics.search_ws(cur_ws, "Depreciation\ndeduction", 20)
    # The second deduction column is searched for starting one column to
    # the right of the first one.
    pos3 = naics.search_ws(cur_ws, "Depreciation\ndeduction", 20,
                           True, np.array(pos2) + np.array([0,1]))
    # Giving every industry an empty dataframe for the deductions:
    for ind in data_tree.enum_inds:
        ind.append_dfs(("soi_prop", pd.DataFrame(np.zeros((1,1)),
                                    columns=["Depr Deductions"])))
    # Both indexes persist across rows and wrap around, so every search
    # resumes just after the previous hit.
    cross_index = cur_cross.shape[0]-1
    enum_index = len(data_tree.enum_inds)-1
    for i in xrange(pos1[0], cur_ws.nrows):
        cur_cell = str(cur_ws.cell_value(i, pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cur_cross.shape[0]):
            cross_index = (cross_index+1) % cur_cross.shape[0]
            cur_ind_name = str(cur_cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell == cur_ind_name:
                # Crosswalk rows without codes are skipped:
                if pd.isnull(cur_cross.iloc[cross_index, 1]):
                    continue
                ind_codes = str(cur_cross.iloc[cross_index, 1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index+1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs["Codes:"]
                    cur_proportions = naics.compare_codes(
                                            ind_codes, cur_codes.iloc[:,0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    # Add this industry's share of the two deduction
                    # columns, scaled by the unit factor.
                    cur_dfs = cur_data.dfs["soi_prop"]["Depr Deductions"]
                    cur_dfs[0] += (prop_fctr * cur_proportions
                                        * (cur_ws.cell_value(i, pos2[1])
                                        + cur_ws.cell_value(i, pos3[1])))
            # Stop scanning the crosswalk once the row is fully allocated:
            if tot_proportions == 1:
                break
    # Default blueprint:
    has_tot_df = "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["soi_prop"])
    naics.pop_forward(tree=data_tree, df_list=["soi_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

예제 #9

0

파일 보기

파일: read_bea.py 프로젝트: evan-magnusson/dynamic

def read_bea(output_tree, data_folder):
    """Read BEA fixed asset data into a new NAICS asset tree.

    Inputs read from ``data_folder``:
      * "BEA/detailnonres_stk1.xlsx": a "readme" sheet (or, failing that,
        the first sheet) listing industry titles and BEA codes, plus
        sheets named after the BEA codes.
      * "BEA/detailnonres_list.csv": the list of assets (first column).
      * "BEA/detailnonres_naics.csv": crosswalk with "Industry" and
        "NAICS" columns.
      * "2012_NAICS_Codes.csv": passed to naics.load_naics.

    :param output_tree: A NAICS tree whose industries hold an "FA"
           dataframe; it supplies the proportions and corporate /
           non-corporate ratios used to split the asset amounts.
    :param data_folder: The folder containing the files listed above.
    :returns: The tree from naics.load_naics with "All", "Corp" and
           "Non-Corp" asset dataframes appended to every industry, or
           None when the "readme" sheet headers cannot be located.
    """
    # The directory with BEA data (os.path.join keeps this portable):
    bea_folder = os.path.abspath(os.path.join(data_folder, "BEA"))
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(os.path.abspath(
                        os.path.join(bea_folder, "detailnonres_stk1.xlsx")))
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if(sht_pos == [-1,-1]):
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0,0], True)
        # The failure check has to come before the column shift below:
        # once shifted, a failed search reads [-1,-2] and would no longer
        # be recognized.
        if(sht_pos == [-1,-1]):
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry titles are one column left of the BEA codes.
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if(str(bea_readme.cell_value(cur_row, cur_col)) != ""):
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts-1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index-1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet:
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if(str(bea_readme.cell_value(cur_row, cur_col)) != ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.abspath(
                        os.path.join(bea_folder, "detailnonres_list.csv"))
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i,0] = asset_list.iloc[i,0].replace("\xa0", " ")
        asset_list.iloc[i,0] = asset_list.iloc[i,0].strip()

    # Reading in the corresponding naics codes:
    naics_file = os.path.abspath(
                        os.path.join(bea_folder, "detailnonres_naics.csv"))
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry","BEA Code","NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2,3), dtype=object),
                             columns = chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk:
    # (naics_counter persists and wraps, so each crosswalk search resumes
    # just after the previous hit)
    naics_counter = 0
    for i in range(0, num_shts-2):
        for cur_row in range(sht_pos[0]+1, bea_readme.nrows):
            bea_code = str(bea_readme.cell_value(cur_row,cur_col+1))
            if(str(bea_codes1[i]) == bea_code):
                bea_ind = str(bea_readme.cell_value(cur_row,cur_col))
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_inds[i] = bea_ind
                bea_chart["BEA Code"][i] = bea_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter+1) % num_naics
                    if(naics_inds[naics_counter] == bea_chart["Industry"][i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
            # If they match except one has ".0" at the end:
            elif(str(bea_codes1[i]) ==
                    str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]):
                bea_ind = str(bea_readme.cell_value(cur_row, cur_col))
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_chart["Industry"][i] = bea_ind
                cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]
                bea_chart["BEA Code"][i] = cur_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter+1) % num_naics
                    if(naics_inds[naics_counter] == bea_inds[i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
    # Initializing the table of assets (one row per asset, one column per
    # BEA code):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns = bea_chart["BEA Code"])
    # For each industry, copying the value in the last column of its
    # worksheet for every asset in the asset list:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j,0]
            for k in xrange(sht_pos[0]+2, cur_sht.nrows):
                cur_cell = str(cur_sht.cell_value(k, sht_pos[1]+1))
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if(cur_asset == cur_cell):
                    bea_table[i][j] = float(
                                        cur_sht.cell_value(k, cur_sht.ncols-1)
                                        )
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * 1000000
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Initialize tree for assets data:
    asset_tree = naics.load_naics(
                        os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for i in xrange(0, len(asset_tree.enum_inds)):
        asset_tree.enum_inds[i].data.append(("All",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
        asset_tree.enum_inds[i].data.append(("Corp",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
        asset_tree.enum_inds[i].data.append(("Non-Corp",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
    # Fill in data from BEA's fixed asset table:
    # (enum_index persists and wraps, so each search through the tree
    # resumes just after the previous hit)
    enum_index = len(output_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, output_tree,
                                          "FA").iloc[1,:]
        corp_proportions = naics.get_proportions(cur_codes, output_tree, "FA",
                                           corp_types).iloc[1,:]
        non_corp_proportions = naics.get_proportions(cur_codes, output_tree,
                                               "FA", non_corp_types).iloc[1,:]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(asset_tree.enum_inds)
                out_dfs = output_tree.enum_inds[enum_index].data.dfs
                # Industries without any "FA" amounts are skipped:
                if(sum(out_dfs["FA"].iloc[0,:]) == 0):
                    continue
                # Share of the industry's "FA" total held by corporate and
                # by non-corporate categories:
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in corp_types:
                    corp_ratio += (out_dfs["FA"][category][0]/
                                        sum(out_dfs["FA"].iloc[0,:]))
                for category in non_corp_types:
                    non_corp_ratio += (out_dfs["FA"][category][0]/
                                            sum(out_dfs["FA"].iloc[0,:]))
                cur_data = asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:,0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if(share == 0):
                    continue
                num_assets = asset_tree.enum_inds[0].data.dfs["All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            all_ratio*
                                            all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            corp_ratio*
                                            corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            non_corp_ratio*
                                            non_corp_proportions[code_index])
                # Only the first matching industry is filled for each code.
                break
            if(tot_share == 1):
                break
    return asset_tree

예제 #10

0

파일 보기

파일: read_bea.py 프로젝트: talumbau/B-Tax

def read_bea(asset_tree):
    """Read BEA fixed-asset data from the workbook into a new NAICS tree.

    Opens the workbook at ``_BEA_ASSET_PATH``, builds a chart that matches
    each industry worksheet to a NAICS code (using the "readme" sheet and
    the ``detailnonres_naics.csv`` crosswalk), reads the last column of every
    matched worksheet into a table of assets by industry, and allocates those
    amounts to the industries of a tree created by ``naics.generate_tree()``.

    :param asset_tree: A NAICS tree whose industries carry an "FA" dataframe.
           It supplies the proportions and the corporate / non-corporate
           ratios that are applied to the BEA amounts.
    :returns: The new tree, with "All", "Corp" and "Non-Corp" dataframes on
           every industry, or None when the header cell cannot be located in
           the readme sheet.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet, falling back to the first sheet by position:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if(sht_pos == [-1,-1]):
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0,0], True)
        # The "not found" sentinel has to be tested BEFORE the column is
        # shifted; otherwise [-1,-1] becomes [-1,-2] and is never detected.
        if(sht_pos == [-1,-1]):
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry titles sit one column to the left of the BEA codes.
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if(unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') != ""):
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (every sheet except the first one):
    bea_codes1 = np.zeros(num_shts-1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index-1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet:
    # NOTE(review): bea_codes2 is filled here but not read again below.
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if(unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') != ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i,0] = asset_list.iloc[i,0].replace("\xa0", " ")
        asset_list.iloc[i,0] = asset_list.iloc[i,0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry","BEA Code","NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2,3), dtype=object),
                             columns = chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk.
    # "i" is the chart row being filled; it only advances once the industry
    # name of the current readme row has been found in the crosswalk.
    naics_counter = 0
    i = 0
    for cur_row in range(sht_pos[0]+1, bea_readme.nrows):
        bea_code = unicode(bea_readme.cell_value(cur_row,cur_col+1)).encode('utf8')
        if(str(bea_codes1[i]) == bea_code):
            bea_ind = unicode(bea_readme.cell_value(cur_row,cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_inds[i] = bea_ind
            bea_chart["BEA Code"][i] = bea_code
            # Circular scan of the crosswalk, resuming after the last match:
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if(naics_inds[naics_counter] == bea_chart["Industry"][i]):
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    i += 1
                    break
        # If they match except one has ".0" at the end:
        elif(str(bea_codes1[i]) == str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]):
            bea_ind = unicode(bea_readme.cell_value(cur_row,cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_ind = bea_ind.replace('\xc2', '').strip()
            bea_chart["Industry"][i] = bea_ind
            cur_code = str(bea_readme.cell_value(cur_row, cur_col+1))[:-2]
            bea_chart["BEA Code"][i] = cur_code
            for k in xrange(0, num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if(naics_inds[naics_counter] == bea_inds[i]):
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    i += 1
                    break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns = bea_chart["BEA Code"])
    # For each industry, copying the last column of its worksheet into the
    # table, matching rows by asset name:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j,0]
            for k in xrange(sht_pos[0]+2, cur_sht.nrows):
                cur_cell = unicode(cur_sht.cell_value(k, sht_pos[1]+1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if(cur_asset == cur_cell):
                    bea_table[i][j] = float(cur_sht.cell_value(k, cur_sht.ncols-1))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(("All",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
        fixed_asset_tree.enum_inds[i].data.append(("Corp",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
        fixed_asset_tree.enum_inds[i].data.append(("Non-Corp",
                pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                             columns = asset_list.iloc[:,0])))
    # Fill in data from BEA's fixed asset table:
    # enum_index starts on the last industry so that the first "+1 % len"
    # step below lands on index 0.
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, asset_tree,
                                          "FA").iloc[1,:]
        corp_proportions = naics.get_proportions(cur_codes, asset_tree, "FA",
                                           _CORP_NMS).iloc[1,:]
        non_corp_proportions = naics.get_proportions(cur_codes, asset_tree,
                                               "FA", _NCORP_NMS).iloc[1,:]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(fixed_asset_tree.enum_inds)
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                # Industries with no "FA" data cannot be split by ratio:
                fa_total = sum(out_dfs["FA"].iloc[0,:])
                if(fa_total == 0):
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0]/fa_total)
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0]/fa_total)
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:,0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if(share == 0):
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs["All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            all_ratio*
                                            all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            corp_ratio*
                                            corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0,k] = (bea_table.iloc[k,i]*
                                            non_corp_ratio*
                                            non_corp_proportions[code_index])
                # Stop scanning industries after the first one that matched:
                break
            if(tot_share == 1):
                break
    # Propagating the filled-in values through the rest of the tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree

예제 #11

0

파일 보기

파일: pull_soi_proprietorship.py 프로젝트: salimfurth/OG-USA

def load_soi_nonfarm_prop(data_tree=None,
                          blue_tree=None,
                          blueprint=None,
                          from_out=False,
                          out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:

    :param data_tree: The NAICS tree to read the data into. When omitted, a
           new tree is created with ``naics.generate_tree()`` on every call
           (a tree built once in the signature would be shared, and mutated,
           across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. It is
           passed unchanged to ``naics.pop_forward``.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the dataframe forward. When
           None, ``_TOT_CORP_DF_NM`` is used if the first industry of the
           tree has a dataframe under that key.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file.
    :returns: The data tree with the nonfarm proprietorship dataframe added.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0, 0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    # The second deduction column is searched for starting one column to the
    # right of the first one:
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20, True,
                           np.array(pos2) + np.array([0, 1]))
    # Adding the (empty) nonfarm dataframe to every industry in the tree:
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both cursors start on the last element so that the first "+1 % len"
    # step lands on index 0; they then resume where the last search ended.
    cross_index = cross.shape[0] - 1
    enum_index = len(data_tree.enum_inds) - 1
    for i in xrange(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for j in xrange(0, cross.shape[0]):
            cross_index = (cross_index + 1) % cross.shape[0]
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if (cur_cell == cur_ind_name):
                # Crosswalk rows without codes cannot be allocated:
                if pd.isnull(cross.iloc[cross_index, 1]):
                    continue
                ind_codes = str(cross.iloc[cross_index, 1]).split(".")
                for k in xrange(0, len(data_tree.enum_inds)):
                    enum_index = (enum_index + 1) % len(data_tree.enum_inds)
                    cur_data = data_tree.enum_inds[enum_index].data
                    cur_codes = cur_data.dfs[_CODE_DF_NM]
                    cur_proportions = naics.compare_codes(
                        ind_codes, cur_codes.iloc[:, 0])
                    if cur_proportions == 0:
                        continue
                    tot_proportions += cur_proportions
                    cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                    # Adding this industry's share of the two summed columns:
                    cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions *
                                   (ws.cell_value(i, pos2[1]) +
                                    ws.cell_value(i, pos3[1])))
            if (tot_proportions == 1):
                break
    # Default:
    if blueprint is None and _TOT_CORP_DF_NM in data_tree.enum_inds[
            0].data.dfs.keys():
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree,
                      df_list=[_NFARM_DF_NM],
                      blueprint=blueprint,
                      blue_tree=blue_tree)
    #
    return data_tree

예제 #12

0

파일 보기

파일: read_inventories.py 프로젝트: evan-magnusson/dynamic

def read_inventories(output_tree, data_folder):
    """Read BEA inventory data and allocate it across a new NAICS tree.

    Reads the first sheet of ``Inventories.xls`` and the crosswalk
    ``Inventories_Crosswalk.csv`` from the ``Inventories`` sub-folder of
    ``data_folder``, then distributes each crosswalk row's inventory amount
    over the industries of a tree loaded from ``2012_NAICS_Codes.csv``.

    :param output_tree: A NAICS tree whose industries carry an "INV"
           dataframe; it supplies the proportions and the breakdown by
           type of owner.
    :param data_folder: The directory containing ``2012_NAICS_Codes.csv``
           and the ``Inventories`` sub-folder.
    :returns: The new tree, with an "Inventories" dataframe (columns "All",
           "Corp", "Non-Corp") on every industry.
    """
    # The directory with inventory data. Paths are built with os.path.join
    # rather than hard-coded "\\" so they also resolve outside Windows:
    inv_folder = os.path.abspath(os.path.join(data_folder, "Inventories"))
    # Opening BEA's excel file on depreciable assets by industry:
    inv_book = xlrd.open_workbook(os.path.join(inv_folder, "Inventories.xls"))
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    num_cols = sht0.ncols
    #Find the starting index in worksheet.
    cur_index = naics.search_ws(sht0, 1, 25, True, [0,0], True)
    check_index = naics.search_ws(sht0, "line", 20)
    # The two searches are expected to land in the same column; a mismatch
    # is only reported, not treated as fatal:
    if(cur_index[1] != check_index[1]):
        print("ERROR")
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(os.path.join(inv_folder,
                                         "Inventories_Crosswalk.csv"))
    # Creating a tree for the inventory data:
    inv_tree = naics.load_naics(os.path.join(data_folder,
                                             "2012_NAICS_Codes.csv"))
    #
    data_cols = ["All", "Corp", "Non-Corp"]
    for i in inv_tree.enum_inds:
        i.data.append(("Inventories",
                       pd.DataFrame(np.zeros((1, len(data_cols))),
                                    columns = data_cols)))
    # One inventory amount per crosswalk row:
    inv_data = np.zeros(inv_cross.shape[0])
    # Walking the worksheet and the crosswalk in step: the crosswalk cursor
    # only advances when both the line number and the name match.
    cross_index = 0
    for i in xrange(cur_index[0], num_rows):
        if(cross_index >= inv_cross.shape[0]):
            break
        cur_list = str(sht0.cell_value(i, cur_index[1])).strip()
        cur_name = str(sht0.cell_value(i, cur_index[1]+1)).strip()
        checks = ((str(cur_list) == str(inv_cross["List"][cross_index])) and
                    (str(cur_name) == str(inv_cross["Industry"][cross_index])))
        if(checks):
            cross_index += 1
            try:
                cur_value = float(sht0.cell_value(i, num_cols-1))
            except ValueError:
                # Non-numeric cell: the entry for this row stays at zero.
                continue
            # Data is in billions:
            inv_data[cross_index-1] = (10**9) * cur_value
    #
    for i in xrange(0, inv_cross.shape[0]):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, output_tree, "INV")
        for j in xrange(0, proportions.shape[1]):
            # Row 0 of "proportions" is used as an industry index into both
            # trees, row 1 as the share of the amount going to it:
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0,j])]
            prev_ind = output_tree.enum_inds[int(proportions.iloc[0,j])]
            prev_df = prev_ind.data.dfs["INV"]
            prev_total = sum(prev_df.iloc[0, :])
            if(prev_total != 0):
                cur_dfs = ((prev_df/prev_total) *
                                (inv_data[i] * proportions.iloc[1,j]))
                inv_df = cur_ind.data.dfs["Inventories"]
                inv_df["All"] += sum(cur_dfs.iloc[0,:])
                for k in corp_types:
                    inv_df["Corp"] += cur_dfs[k][0]
                for k in non_corp_types:
                    inv_df["Non-Corp"] += cur_dfs[k][0]
    #
    return inv_tree

예제 #13

0

파일 보기

파일: read_bea.py 프로젝트: salimfurth/OG-USA

def read_bea(asset_tree):
    """Read BEA fixed-asset data from the workbook into a new NAICS tree.

    Opens the workbook at ``_BEA_ASSET_PATH``, builds a chart that matches
    each industry worksheet to a NAICS code (using the "readme" sheet and
    the ``detailnonres_naics.csv`` crosswalk), reads the last column of every
    worksheet listed in the chart into a table of assets by industry, and
    allocates those amounts to the industries of a tree created by
    ``naics.generate_tree()``.

    :param asset_tree: A NAICS tree whose industries carry an "FA" dataframe.
           It supplies the proportions and the corporate / non-corporate
           ratios that are applied to the BEA amounts.
    :returns: The new tree, with "All", "Corp" and "Non-Corp" dataframes on
           every industry, or None when the header cell cannot be located in
           the readme sheet.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet, falling back to the first sheet by position:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if (sht_pos == [-1, -1]):
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0, 0],
                                  True)
        # The "not found" sentinel has to be tested BEFORE the column is
        # shifted; otherwise [-1, -1] becomes [-1, -2] and is never detected.
        if (sht_pos == [-1, -1]):
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry titles sit one column to the left of the BEA codes.
        sht_pos[1] = sht_pos[1] - 1
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Finding the number of industries (includes those without bea codes):
    number_of_industries = 0
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') !=
                ""):
            number_of_industries += 1
        cur_row += 1
    # Making a list of BEA codes based on the names of the worksheets
    # (every sheet except the first one):
    bea_codes1 = np.zeros(num_shts - 1, dtype=object)
    for index in xrange(1, num_shts):
        bea_codes1[index - 1] = str(sht_names[index])
    # Making a list of BEA codes based on info in the readme sheet:
    # NOTE(review): bea_codes2 is filled here but not read again below.
    code_index = 0
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    bea_codes2 = np.zeros(number_of_industries, dtype=object)
    while cur_row < bea_readme.nrows:
        if (unicode(bea_readme.cell_value(cur_row, cur_col)).encode('utf8') !=
                ""):
            cur_code = str(bea_readme.cell_value(cur_row, cur_col + 1))
            cur_code = cur_code.replace("\xa0", " ").strip()
            bea_codes2[code_index] = cur_code
            code_index += 1
        cur_row += 1
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in xrange(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in xrange(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    cur_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk.
    # For every chart row, the readme sheet is scanned until the first row
    # whose code matches the worksheet name.
    naics_counter = 0
    for i in range(0, num_shts - 2):
        for cur_row in range(sht_pos[0] + 1, bea_readme.nrows):
            bea_code = unicode(bea_readme.cell_value(cur_row, cur_col +
                                                     1)).encode('utf8')
            if (str(bea_codes1[i]) == bea_code):
                bea_ind = unicode(bea_readme.cell_value(
                    cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_inds[i] = bea_ind
                bea_chart["BEA Code"][i] = bea_code
                # Circular scan of the crosswalk, resuming after the last
                # position visited:
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if (naics_inds[naics_counter] == bea_chart["Industry"][i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
            # If they match except one has ".0" at the end:
            elif (str(bea_codes1[i]) == str(
                    bea_readme.cell_value(cur_row, cur_col + 1))[:-2]):
                bea_ind = unicode(bea_readme.cell_value(
                    cur_row, cur_col)).encode('utf8')
                bea_ind = bea_ind.replace('\xa0', ' ').strip()
                bea_chart["Industry"][i] = bea_ind
                cur_code = str(bea_readme.cell_value(cur_row,
                                                     cur_col + 1))[:-2]
                bea_chart["BEA Code"][i] = cur_code
                for k in xrange(0, num_naics):
                    naics_counter = (naics_counter + 1) % num_naics
                    if (naics_inds[naics_counter] == bea_inds[i]):
                        bea_naics[i] = naics_cross["NAICS"][naics_counter]
                        break
                break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros(
        (asset_list.shape[0], bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, copying the last column of its worksheet into the
    # table, matching rows by asset name:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        for j in xrange(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k in xrange(sht_pos[0] + 2, cur_sht.nrows):
                cur_cell = unicode(cur_sht.cell_value(k, sht_pos[1] +
                                                      1)).encode('utf8')
                cur_cell = cur_cell.replace("\xa0", " ").strip()
                if (cur_asset == cur_cell):
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
    # The dollar amounts are in millions:
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in xrange(0, len(fixed_asset_tree.enum_inds)):
        fixed_asset_tree.enum_inds[i].data.append(
            ("All",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Corp",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
        fixed_asset_tree.enum_inds[i].data.append(
            ("Non-Corp",
             pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                          columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table:
    # enum_index starts on the last industry so that the first "+1 % len"
    # step below lands on index 0.
    enum_index = len(asset_tree.enum_inds) - 1
    for i in xrange(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, asset_tree,
                                                "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(cur_codes, asset_tree, "FA",
                                                 _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(cur_codes, asset_tree,
                                                     "FA",
                                                     _NCORP_NMS).iloc[1, :]
        for code_index in xrange(0, len(cur_codes)):
            for j in xrange(0, len(fixed_asset_tree.enum_inds)):
                enum_index = (enum_index + 1) % len(fixed_asset_tree.enum_inds)
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                # Industries with no "FA" data cannot be split by ratio:
                fa_total = sum(out_dfs["FA"].iloc[0, :])
                if (fa_total == 0):
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += (out_dfs["FA"][category][0] / fa_total)
                for category in _NCORP_NMS:
                    non_corp_ratio += (out_dfs["FA"][category][0] / fa_total)
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if (share == 0):
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs[
                    "All"].shape[1]
                for k in xrange(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # Stop scanning industries after the first one that matched:
                break
            if (tot_share == 1):
                break
    # Propagating the filled-in values through the rest of the tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["All"],
                      blueprint="FA",
                      blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["Corp"],
                      blueprint="FA",
                      blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree,
                      df_list=["Non-Corp"],
                      blueprint="FA",
                      blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree