Example #1
def write_naics_2012_crosswalk():
    """
    Create a NAICS 2- to 6-digit crosswalk
    :return: None, writes the crosswalk to csv
    """

    # load the useeior mastercrosswalk subset to the naics timeseries
    cw_load = load_crosswalk('sector_timeseries')

    # load BEA codes that will act as NAICS
    house = load_crosswalk('household')
    govt = load_crosswalk('government')
    bea = pd.concat([house, govt], ignore_index=True).rename(
        columns={'Code': 'NAICS_2012_Code',
                 'NAICS_Level_to_Use_For': 'secLength'})
    bea = bea[['NAICS_2012_Code', 'secLength']]

    # extract naics 2012 code column and drop duplicates and empty cells
    cw = cw_load[['NAICS_2012_Code']].drop_duplicates()
    cw = replace_NoneType_with_empty_cells(cw)
    cw = cw[cw['NAICS_2012_Code'] != '']
    # also drop the existing household and government codes because they are
    # not all-inclusive and do not conform to NAICS length standards
    cw = cw[~cw['NAICS_2012_Code'].str.startswith(
        tuple(['F0', 'S0']))].reset_index(drop=True)

    # add column of sector length
    cw['secLength'] = cw['NAICS_2012_Code'].apply(
        lambda x: f"NAICS_{str(len(x))}")
    # add bea codes subbing for NAICS
    cw2 = pd.concat([cw, bea], ignore_index=True)

    # create dictionary of dataframes
    d = dict(tuple(cw2.groupby('secLength')))

    for l in range(2, 9):
        d[f'NAICS_{l}'] = d[f'NAICS_{l}'][['NAICS_2012_Code']].reset_index(
            drop=True).rename(
            columns={'NAICS_2012_Code': f'NAICS_{l}'})

    naics_cw = d['NAICS_2']
    for l in range(3, 7):
        naics_cw = (d[f'NAICS_{l}'].assign(temp=d[f'NAICS_{l}'][
            f'NAICS_{l}'].str.extract(
            pat=f"({'|'.join(naics_cw[f'NAICS_{l-1}'])})")).merge(
            naics_cw, how='right', left_on='temp',
            right_on=f'NAICS_{l-1}',
            suffixes=['', '_y'])).drop(columns=['temp'])

    # reorder
    naics_cw = naics_cw.reindex(sorted(naics_cw.columns), axis=1)
    # save as csv
    naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)
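
The key step above maps each n-digit code to its (n-1)-digit parent by extracting whichever parent code prefixes it, then right-merging so parent codes are always kept. A minimal, self-contained sketch of that pattern with toy codes (not the actual flowsa crosswalk):

import pandas as pd

parents = pd.DataFrame({'NAICS_2': ['11', '21']})
children = pd.DataFrame({'NAICS_3': ['111', '112', '212']})

# extract whichever 2-digit parent code prefixes each 3-digit code
pattern = f"({'|'.join(parents['NAICS_2'])})"
children = children.assign(
    temp=children['NAICS_3'].str.extract(pat=pattern, expand=False))

# a right merge keeps every parent, even one without children in the subset
cw = children.merge(parents, how='right', left_on='temp',
                    right_on='NAICS_2').drop(columns=['temp'])
# cw pairs each 3-digit code with its 2-digit parent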
Example #2
def addSectorNames(df):
    """
    Add column to an FBS df with the sector names
    :param df: FBS df with singular "Sector" column
    :return: FBS df with a new 'SectorName' column combining the Sector
        code and name
    """
    # load crosswalk and add names
    cw = load_crosswalk('sector_name')
    cw['SectorName'] = cw['NAICS_2012_Code'].map(
        str) + ' (' + cw['NAICS_2012_Name'] + ')'
    cw = cw.rename(columns={'NAICS_2012_Code': 'Sector'})

    df = df.merge(cw[['Sector', 'SectorName']], how='left')

    df = df.reset_index(drop=True)

    return df
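
A quick usage illustration of the name construction above, assuming a toy crosswalk stands in for load_crosswalk('sector_name'):

import pandas as pd

cw = pd.DataFrame({'NAICS_2012_Code': ['111', '112'],
                   'NAICS_2012_Name': ['Crop Production', 'Animal Production']})
cw['SectorName'] = cw['NAICS_2012_Code'].map(str) + ' (' + cw['NAICS_2012_Name'] + ')'
cw = cw.rename(columns={'NAICS_2012_Code': 'Sector'})

fbs = pd.DataFrame({'Sector': ['111', '112'], 'FlowAmount': [1.0, 2.0]})
fbs = fbs.merge(cw[['Sector', 'SectorName']], how='left')
# SectorName now holds values such as '111 (Crop Production)'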
Example #3
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be like sectors
    are actually sectors
    :param fba: a flow by activity dataset
    :return: list of activities not matching the default sector list, or
        None if all activities match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])

    # Get list of module default sectors
    flowsa_sector_list = list(
        load_crosswalk('sector_timeseries')[SECTOR_SOURCE_NAME])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        vLog.debug("%s activities not matching sectors in default %s list",
                   str(len(activities_missing_sectors)), SECTOR_SOURCE_NAME)
        return activities_missing_sectors
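
The check itself is a set difference between the activity values and the sector list; for example, with made-up values:

activities = ['1111', '112', '999999']
flowsa_sector_list = ['1111', '112', '21']

activities_missing_sectors = set(activities) - set(flowsa_sector_list)
# {'999999'}: sector-like activity values that are not valid sectors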
Example #4
def assign_naics(df_load):
    """
    Function to assign NAICS codes to each dataframe activity
    :param df_load: df, a FlowByActivity subset that contains unique activity names
    :return: df with assigned Sector columns
    """
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # keep only 6-digit NAICS codes
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)

    df = pd.merge(df_load,
                  cw,
                  left_on='Activity',
                  right_on='BEA_2012_Detail_Code')
    df = df.drop(columns=["BEA_2012_Detail_Code"])
    df = df.rename(columns={"NAICS_2012_Code": "Sector"})
    df['SectorSourceName'] = 'NAICS_2012_Code'

    return df
Example #5
def melt_naics_crosswalk():
    """
    Create a melt version of the naics 07 to 17 crosswalk to map
    naics to naics 2012
    :return: df, naics crosswalk melted
    """
    # load the NAICS timeseries mastercrosswalk
    cw_load = load_crosswalk('sector_timeseries')

    # create melt table of possible 2007 and 2017 naics that can
    # be mapped to 2012
    cw_melt = cw_load.melt(id_vars='NAICS_2012_Code',
                           var_name='NAICS_year',
                           value_name='NAICS')
    # drop rows with missing values; the naics year itself is not
    # relevant for replacement purposes
    cw_replacement = cw_melt.dropna(how='any')
    cw_replacement = cw_replacement[['NAICS_2012_Code',
                                     'NAICS']].drop_duplicates()
    # drop rows where contents are equal
    cw_replacement = cw_replacement[
        cw_replacement['NAICS_2012_Code'] != cw_replacement['NAICS']]
    # drop rows where length > 6
    cw_replacement = cw_replacement[cw_replacement['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)
    # order by naics 2012
    cw_replacement = cw_replacement.sort_values(['NAICS', 'NAICS_2012_Code'
                                                 ]).reset_index(drop=True)

    # create allocation ratios by determining number of
    # NAICS 2012 to other naics when not a 1:1 ratio
    cw_replacement_2 = cw_replacement.assign(
        naics_count=cw_replacement.groupby(
            ['NAICS'])['NAICS_2012_Code'].transform('count'))
    cw_replacement_2 = cw_replacement_2.assign(allocation_ratio=1 /
                                               cw_replacement_2['naics_count'])

    return cw_replacement_2
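
To see what the ratio step produces, consider a hypothetical row where one older-year NAICS maps to two NAICS 2012 codes; each 2012 code then receives half of any flow that gets remapped (a code that maps 1:1 keeps a ratio of 1.0):

import pandas as pd

cw_replacement = pd.DataFrame({
    'NAICS_2012_Code': ['111110', '111120'],
    'NAICS': ['111100', '111100']})  # a toy older-year code split in 2012

cw_replacement_2 = cw_replacement.assign(
    naics_count=cw_replacement.groupby(
        ['NAICS'])['NAICS_2012_Code'].transform('count'))
cw_replacement_2 = cw_replacement_2.assign(
    allocation_ratio=1 / cw_replacement_2['naics_count'])
# '111100' maps to two 2012 codes, so each row gets allocation_ratio 0.5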
Example #6
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry sites,
    marshes, swamps, sand dunes, bare rocks, deserts, tundra,
    rural residential, and other unclassified land. In this report,
    urban land is reported as a separate category."

    Mining data is calculated using a separate source (BLM PLS).
    The rural residential land area is extracted from the total value of
    'Other Land'.
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the activity
                     set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_crosswalk('household')
    household = household['Code'].drop_duplicates().tolist()

    # in df, where SectorConsumedBy is a household code, replace the
    # flow amount with the rural residential land value
    vLogDetailed.info('The only category for MLU other land use is rural land '
                      'occupation. All other land area in this category is '
                      'unassigned to sectors, resulting in unaccounted land '
                      'area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
Example #7
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non-sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the mastercrosswalk and subset to the sectorsourcename column,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug(
            'Checking if sectors represent a different '
            'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [
            e for e in df.columns.values.tolist()
            if e not in possible_column_headers
        ]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull())
                     & (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
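
The replacement-and-allocation step in the loop above can be read in isolation: after merging the melted crosswalk, a sector from another NAICS year is swapped for its 2012 equivalent and the flow is scaled by the allocation ratio. A toy single-column sketch (the codes are hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame({'SectorProducedBy': ['111100'], 'FlowAmount': [10.0]})
cw_melt = pd.DataFrame({'NAICS': ['111100', '111100'],
                        'NAICS_2012_Code': ['111110', '111120'],
                        'allocation_ratio': [0.5, 0.5]})
non_naics = ['111100']
c = 'SectorProducedBy'
sectorsourcename = 'NAICS_2012_Code'

df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                 df[sectorsourcename], df[c])
df.loc[df[c] == df[sectorsourcename],
       'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
df = df.drop(columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
# the single 10.0 row becomes two rows of 5.0, one per 2012 code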
Example #8
# write_Crosswalk_BEA_2012_Detail.py (scripts)
#!/usr/bin/env python3
# coding=utf-8

"""
Create a crosswalk linking BEA to NAICS for 2012 Detail

"""
from flowsa.common import load_crosswalk
from flowsa.settings import datapath

if __name__ == '__main__':

    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # keep only 6-digit NAICS codes
    cw = cw[cw['NAICS_2012_Code'].apply(lambda x: len(str(x)) == 6)].reset_index(drop=True)

    df = cw.rename(columns={"NAICS_2012_Code": "Sector",
                            "BEA_2012_Detail_Code":"Activity"})
    df['SectorSourceName'] = 'NAICS_2012_Code'
    df['ActivitySourceName'] = 'BEA_2012_Detail_Code'
    df.dropna(subset=["Sector"], inplace=True)
    # assign sector type
    df['SectorType'] = None
    # sort df
    df = df.sort_values('Sector')
    # reset index
    df.reset_index(drop=True, inplace=True)
    # set order
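    # a possible completion sketch: the column order below is the assumed
    # standard flowsa crosswalk order and the output filename is
    # illustrative, not the script's actual code
    df = df[['ActivitySourceName', 'Activity',
             'SectorSourceName', 'Sector', 'SectorType']]
    df.to_csv(datapath + "NAICS_Crosswalk_BEA_2012_Detail.csv", index=False)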
Example #9
def update_naics_crosswalk():
    """
    update the useeior crosswalk with crosswalks created for
    flowsa datasets - want to add any NAICS > 6 digits

    Add NAICS 2002
    :return: df of NAICS that include any unofficial NAICS
    """

    # read useeior master crosswalk, subset NAICS columns
    naics_load = load_crosswalk('BEA')
    naics = naics_load[['NAICS_2007_Code', 'NAICS_2012_Code', 'NAICS_2017_Code'
                        ]].drop_duplicates().reset_index(drop=True)
    # convert all rows to string
    naics = naics.astype(str)
    # ensure all None are NoneType
    naics = replace_strings_with_NoneType(naics)
    # drop rows where all None
    naics = naics.dropna(how='all')

    # drop naics > 6 in mastercrosswalk (all manufacturing) because unused
    # and slows functions
    naics = naics[naics['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)

    # find any NAICS where length > 6 that are used for allocation purposes
    # and add to naics list
    missing_naics_df_list = []
    # read in all the sector crosswalk csv files (NAICS_Crosswalk_*.csv)
    for file_name in glob.glob(
            datapath + "activitytosectormapping/" + 'NAICS_Crosswalk_*.csv'):
        # skip Statistics Canada GDP because not all sectors relevant
        if file_name != crosswalkpath + 'Crosswalk_StatCan_GDP_toNAICS.csv':
            df = pd.read_csv(file_name, low_memory=False, dtype=str)
            # convert all rows to string
            df = df.astype(str)
            # determine sector year
            naics_year = df['SectorSourceName'][0]
            if naics_year == 'nan':
                log.info(f'Missing SectorSourceName for {file_name}')
                continue
            # subset dataframe so only sector
            df = df[['Sector']]
            # trim whitespace and cast as string, rename column
            df['Sector'] = df['Sector'].astype(str).str.strip()
            df = df.rename(columns={'Sector': naics_year})
            # extract sector year column from master crosswalk
            df_naics = naics[[naics_year]]
            # find any NAICS that are in source crosswalk but not in
            # mastercrosswalk
            common = df.merge(df_naics, on=naics_year)
            missing_naics = df[(~df[naics_year].isin(common[naics_year]))]
            # extract sectors where len > 6 and that does not include a '-'
            missing_naics = missing_naics[missing_naics[naics_year].apply(
                lambda x: len(x) > 6)]
            if len(missing_naics) != 0:
                missing_naics = missing_naics[
                    ~missing_naics[naics_year].str.contains('-')]
                # append to df list
                missing_naics_df_list.append(missing_naics)
    # concat df list and drop duplications
    missing_naics_df = \
        pd.concat(missing_naics_df_list, ignore_index=True,
                  sort=False).drop_duplicates().reset_index(drop=True)
    # sort df
    missing_naics_df = missing_naics_df.sort_values(['NAICS_2012_Code'])
    missing_naics_df = missing_naics_df.reset_index(drop=True)

    # add missing naics to master naics crosswalk
    total_naics = pd.concat([naics, missing_naics_df], ignore_index=True)

    # sort df
    total_naics = total_naics.sort_values(
        ['NAICS_2012_Code', 'NAICS_2007_Code']).drop_duplicates()
    total_naics = total_naics[~total_naics['NAICS_2012_Code'].isin(
        ['None', 'unknown', 'nan', 'Unknown', np.nan])].reset_index(drop=True)

    # convert all columns to string
    total_naics = total_naics.astype(str)

    # add naics 2002
    naics_02 = load_naics_02_to_07_crosswalk()
    naics_cw = pd.merge(total_naics, naics_02, how='left')

    # ensure NoneType
    naics_cw = replace_strings_with_NoneType(naics_cw)

    # reorder
    naics_cw = naics_cw[['NAICS_2002_Code', 'NAICS_2007_Code',
                         'NAICS_2012_Code', 'NAICS_2017_Code']]

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_Crosswalk_TimeSeries.csv", index=False)
Example #10
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data
    """

    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated = df_allocated.reset_index(drop=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
Example #11
def naics_expansion(facility_NAICS):
    """
    Modeled after sector_disaggregation in flowbyfunctions; updates NAICS
    to more granular sectors if there is only one naics at the next level
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    :return: df
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS 4 and NAICS 5, appending the next naics level
    # to the df where the parent has only one child
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length = i
        df_subset = facility_NAICS.loc[facility_NAICS["NAICS"].apply(
            lambda x: len(x) == i)]

        # subset the df to the rows where the tmp sector columns are
        # in naics list
        df_subset = df_subset.loc[(df_subset['NAICS'].isin(sector_list))]

        # merge the naics cw
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['NAICS'],
                             right_on=[sector_merge])
        # drop columns and rename new sector columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records with NAICS that have now been expanded
        facility_NAICS = facility_NAICS[~facility_NAICS['NAICS'].
                                        isin(sector_list)]

        # append new naics to df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
Example #12
def assign_naics(df_load):
    """
    Function to assign NAICS codes to each dataframe activity
    :param df_load: df, a FlowByActivity subset that contains unique activity names
    :return: df with assigned Sector columns
    """

    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # keep only 6-digit NAICS codes
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)

    cw = cw.sort_values(['BEA_2012_Detail_Code', 'NAICS_2012_Code'])

    df = pd.merge(df_load,
                  cw,
                  left_on='Activity',
                  right_on='BEA_2012_Detail_Code',
                  how='left')
    df = df.drop(columns=["BEA_2012_Detail_Code"])
    df = df.rename(columns={"NAICS_2012_Code": "Sector"})

    # reset sector value for sand, gravel, clay
    df.loc[df['Activity'] == '212320', 'Sector'] = '212321'
    df.loc[df['Activity'] == '212390', 'Sector'] = '212391'
    # add the remaining child sectors for the sand/gravel/clay and other
    # nonmetallic mineral activities (DataFrame.append is removed in
    # pandas 2.0, so use pd.concat)
    additional_rows = pd.DataFrame(
        [['Blackhurst_IO', '212320', '212322'],
         ['Blackhurst_IO', '212320', '212324'],
         ['Blackhurst_IO', '212320', '212325'],
         ['Blackhurst_IO', '212390', '212392'],
         ['Blackhurst_IO', '212390', '212393'],
         ['Blackhurst_IO', '212390', '212399']],
        columns=['ActivitySourceName', 'Activity', 'Sector'])
    df = pd.concat([df, additional_rows], ignore_index=True, sort=True)

    # drop two rows where Blackhurst's IO vectors do not align with the
    # NAICS to BEA mapping because a NAICS code is its own activity rather
    # than a subset of an activity
    # 'iron ore mining' is its own row
    df = df[~((df['Activity'] == '2122A0') & (df['Sector'] == '212210'))]
    # 'support activities for oil and gas operations' is its own row
    df = df[~((df['Activity'] == '21311A') & (df['Sector'] == '213112'))]

    # ensure the sector column has value for iron ore and support activities
    df.loc[df['Activity'] == '212210', 'Sector'] = '212210'
    df.loc[df['Activity'] == '213112', 'Sector'] = '213112'

    df['SectorSourceName'] = 'NAICS_2012_Code'

    return df
Example #13
def equally_allocate_parent_to_child_naics(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if the data were subset at the
    target sector level.
    Equally allocate parent NAICS to child NAICS where child NAICS are missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]] == '')]
        df_x2 = \
            df.loc[(df[fbs_activity_fields[0]] == '') &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        if len(df_x) > 0:
            # create df of i + 1 length
            df_y1 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
                | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y2 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
                & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

            # create temp sector columns in df y, that are i digits in length
            df_y.loc[:, 'spb_tmp'] = \
                df_y[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            df_y.loc[:, 'scb_tmp'] = \
                df_y[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # don't modify household sector lengths or gov't transport
            df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

            # merge the two dfs
            if 'Context' in df_y.columns:
                merge_cols = [
                    'Class', 'Context', 'FlowType', 'Flowable', 'Location',
                    'LocationSystem', 'Unit', 'Year'
                ]
            else:
                merge_cols = [
                    'Class', 'FlowType', 'Location', 'LocationSystem', 'Unit',
                    'Year'
                ]

            df_m = pd.merge(df_x,
                            df_y[merge_cols + ['spb_tmp', 'scb_tmp']],
                            how='left',
                            left_on=merge_cols +
                            ['SectorProducedBy', 'SectorConsumedBy'],
                            right_on=merge_cols + ['spb_tmp', 'scb_tmp'])

            # extract the rows that are not disaggregated to more
            # specific naics
            rl = df_m[(df_m['scb_tmp'].isnull())
                      & (df_m['spb_tmp'].isnull())].reset_index(drop=True)
            # clean df
            rl = replace_strings_with_NoneType(rl)
            rl_list = rl[['SectorProducedBy', 'SectorConsumedBy']]\
                .drop_duplicates().values.tolist()

            # match sectors with target sector length sectors
            # import cw and subset to current sector length
            # and target sector length
            cw_load = load_crosswalk('sector_length')
            nlength = list(sector_level_key.keys())[list(
                sector_level_key.values()).index(i)]
            cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
            # add column with counts
            cw['sector_count'] = \
                cw.groupby(nlength)[nlength].transform('count')

            # merge df & conditionally replace sector produced/consumed columns
            rl_m = pd.merge(rl,
                            cw,
                            how='left',
                            left_on=[fbs_activity_fields[0]],
                            right_on=[nlength])
            rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                     fbs_activity_fields[0]] = rl_m[target_sector_level]
            rl_m = rl_m.drop(columns=[nlength, target_sector_level])

            rl_m2 = pd.merge(rl_m,
                             cw,
                             how='left',
                             left_on=[fbs_activity_fields[1]],
                             right_on=[nlength])
            rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                      fbs_activity_fields[1]] = rl_m2[target_sector_level]
            rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

            # create one sector count column
            rl_m2['sector_count_x'] = \
                rl_m2['sector_count_x'].fillna(rl_m2['sector_count_y'])
            rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
            rl_m3 = rl_m3.drop(columns=['sector_count_y'])

            # calculate new flow amounts, based on sector count,
            # allocating equally to the new sector length codes
            rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
            rl_m3 = rl_m3.drop(columns=['sector_count', 'spb_tmp', 'scb_tmp'])

            # append to df
            if len(rl) != 0:
                vLogDetailed.warning(
                    'Data found at %s digit NAICS not represented in '
                    'current data subset: %s',
                    str(i), ' '.join(map(str, rl_list)))
                rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to '
            'each %s associated with the sectors previously '
            'dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
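
A compact, standalone illustration of the equal-allocation step, using a toy crosswalk in which '21222' has two 6-digit children:

import pandas as pd

# a parent flow reported only at the 5-digit level
rl = pd.DataFrame({'SectorProducedBy': ['21222'], 'FlowAmount': [100.0]})

# toy crosswalk from NAICS_5 to the target NAICS_6 level
cw = pd.DataFrame({'NAICS_5': ['21222', '21222'],
                   'NAICS_6': ['212221', '212222']})
cw['sector_count'] = cw.groupby('NAICS_5')['NAICS_5'].transform('count')

rl_m = rl.merge(cw, how='left', left_on='SectorProducedBy', right_on='NAICS_5')
rl_m['SectorProducedBy'] = rl_m['NAICS_6']
rl_m['FlowAmount'] = rl_m['FlowAmount'] / rl_m['sector_count']
rl_m = rl_m.drop(columns=['NAICS_5', 'NAICS_6', 'sector_count'])
# the 100.0 at '21222' becomes 50.0 at each of '212221' and '212222'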
Example #14
def equally_allocate_suppressed_parent_to_child_naics(df_load, sector_column,
                                                      groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop them from the df and
    # add them back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols if e not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(id_vars=["NAICS_6"],
                           var_name="NAICS_Length",
                           value_name="NAICS_Match").drop(
                               columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup,
                         cw_melt,
                         how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols],
                   df,
                   how='left',
                   on=groupcols,
                   indicator=True).query('_merge=="left_only"').drop('_merge',
                                                                     axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)
    # add length column and subset the data
    # subtract out existing data at NAICS6 from total data
    # at a length where no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # for each state, determine the sector length at which there are no
    # suppressed values (one digit shorter than the shortest suppressed code)
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment',
         'Location'])['secLength'].agg(lambda x: x.min() - 1).reset_index(
             name='secLengthsup'))

    # merge the dfs and subset to the sector length with complete data for
    # each state; states that don't have suppressed data are dropped
    df1 = df.merge(df_sup2)

    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) /
                     dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff
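
The merge with indicator=True followed by query('_merge=="left_only"') above is an anti-join: it keeps only candidate NAICS 6 rows that are not already present in the data. A standalone sketch of the pattern:

import pandas as pd

candidates = pd.DataFrame({'Sector': ['212221', '212222'],
                           'Location': ['06000', '06000']})
existing = pd.DataFrame({'Sector': ['212221'], 'Location': ['06000'],
                         'FlowAmount': [5.0]})

new_rows = pd.merge(candidates, existing, how='left',
                    on=['Sector', 'Location'],
                    indicator=True).query('_merge=="left_only"').drop(
                        '_merge', axis=1)
# only the '212222' row survives, because '212221' already exists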
Example #15
def sector_disaggregation(df_load):
    """
    function to disaggregate sectors if there is only one naics at the next,
    more granular level; works from naics 2 up to naics 6
    :param df_load: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like, if aggregating
    # a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop them from the df and
    # add them back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')

    # for loop min length to 6 digits, where min length cannot be less than 2
    fields_list = []
    for i in range(2):
        if not (df[fbs_activity_fields[i]] == "").all():
            fields_list.append(fbs_activity_fields[i])

    length = df[fields_list].apply(lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value
        # in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i)
            | df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=df_subset[
            fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=df_subset[
            fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns
        # are in naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list))
            & (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") & (
            df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list))
            & (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)
        # drop all rows with duplicate temp values, as a less aggregated
        # naics exists
        # possible column headers used to identify duplicate rows, if they
        # exist in the df
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [
            e for e in possible_column_headers
            if e in df_subset.columns.values.tolist()
        ]

        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=[
            "SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp",
            "SectorConsumed_tmp"
        ])
        new_naics = new_naics.rename(columns={
            "SPB": "SectorProducedBy",
            "SCB": "SectorConsumedBy"
        })
        # append new naics to df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True, ignore_index=True)
    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    # if activities are source-like, set col values
    # as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    return df
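
The disaggregation above hinges on drop_duplicates(subset=[sector_merge], keep=False): keep=False removes every parent code that appears more than once, leaving only parents with a single child that can be expanded unambiguously. For example:

import pandas as pd

cw = pd.DataFrame({'NAICS_5': ['21222', '21222', '21223'],
                   'NAICS_6': ['212221', '212222', '212231']})

# '21222' appears twice (two children) and is dropped entirely;
# only the unambiguous ('21223', '212231') pair remains
unambiguous = cw.drop_duplicates(subset=['NAICS_5'], keep=False)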
Example #16
def convert_statcan_data_to_US_water_use(df, attr, download_FBA_if_missing):
    """
    Use Canadian and US GDP data to convert 3-digit Canadian water use to
    US water use:
    - Canadian GDP
    - US GDP
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :param download_FBA_if_missing: bool, True if would like to download
        missing FBAs from Data Commons, False if FBAs should be generated
        locally
    :return: df, FBA format, flowamounts converted
    """

    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(
        datasource='StatCan_GDP',
        year=attr['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=download_FBA_if_missing)

    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "USD"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df,
                    gdp[['USD', 'ActivityProducedBy']],
                    how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['USD'] = df_m['USD'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['USD'] != 0]
    # convert to kg/USD
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / df_m['USD']
    df_m.loc[:, 'Unit'] = 'kg/USD'

    df_m = df_m.drop(columns=["USD"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=download_FBA_if_missing)

    # load bea crosswalk
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load,
                      cw,
                      how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use
    df_m2 = pd.merge(df_m,
                     us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left',
                     left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'kg'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
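
Numerically, the conversion is a two-step intensity calculation: Canadian water use divided by Canadian GDP gives kg of water per USD, and multiplying by US gross output for the matching 3-digit NAICS gives an estimated US water use in kg. With toy numbers (not StatCan or BEA values):

# illustrative numbers only
canadian_water_kg = 2.0e9   # Canadian water use for a 3-digit sector, kg
canadian_gdp_usd = 4.0e8    # Canadian GDP for the same sector, USD
us_gdp_usd = 3.2e9          # US gross output for the matching NAICS 3 sector

intensity_kg_per_usd = canadian_water_kg / canadian_gdp_usd  # 5.0 kg/USD
us_water_kg = intensity_kg_per_usd * us_gdp_usd              # 1.6e10 kg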