Example #1
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be sector-like are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list, or None if all activities match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])
    #activities.remove("None")

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[sector_source_name])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        log.info(
            str(len(activities_missing_sectors)) +
            " activities not matching sectors in default " +
            sector_source_name + " list.")
        return activities_missing_sectors
    else:
        log.info("All activities match sectors in " + sector_source_name +
                 " list.")
        return None
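The core of this check is a plain set difference between the activity values and the crosswalk's sector column. A minimal, self-contained sketch of that step, with a toy frame standing in for a real FBA and an inline list standing in for load_sector_crosswalk() (the text activity names are made up):

import pandas as pd

# toy flowbyactivity frame; in flowsa the activity columns come from fba_activity_fields
fba = pd.DataFrame({
    'ActivityProducedBy': ['111110', '111120', 'Irrigation'],
    'ActivityConsumedBy': ['111110', '212310', 'Livestock'],
})

# stand-in for the crosswalk's sector column
flowsa_sector_list = ['111110', '111120', '212310']

activities = set(fba['ActivityProducedBy']) | set(fba['ActivityConsumedBy'])
missing = activities - set(flowsa_sector_list)
print(missing)  # {'Irrigation', 'Livestock'}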
Example #2
def melt_naics_crosswalk():
    """
    Create a melted version of the NAICS 2007 to 2017 crosswalk for mapping NAICS codes to NAICS 2012
    :return: df of NAICS 2012 codes mapped to NAICS codes from other years, with allocation ratios
    """

    # load the master crosswalk
    cw_load = load_sector_crosswalk()

    # create melt table of possible 2007 and 2017 naics that can be mapped to 2012
    cw_melt = cw_load.melt(id_vars='NAICS_2012_Code',
                           var_name='NAICS_year',
                           value_name='NAICS')
    # drop rows with missing values
    cw_replacement = cw_melt.dropna(how='any')
    # drop the NAICS year column, as it is not needed for replacement purposes
    cw_replacement = cw_replacement[['NAICS_2012_Code',
                                     'NAICS']].drop_duplicates()
    # drop rows where contents are equal
    cw_replacement = cw_replacement[
        cw_replacement['NAICS_2012_Code'] != cw_replacement['NAICS']]
    # drop rows where length > 6
    cw_replacement = cw_replacement[cw_replacement['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)
    # order by naics 2012
    cw_replacement = cw_replacement.sort_values(['NAICS', 'NAICS_2012_Code'
                                                 ]).reset_index(drop=True)

    # create allocation ratios: count how many NAICS 2012 codes map to each other-year NAICS code (handles non-1:1 mappings)
    cw_replacement_2 = cw_replacement.assign(
        naics_count=cw_replacement.groupby(
            ['NAICS'])['NAICS_2012_Code'].transform('count'))
    cw_replacement_2 = cw_replacement_2.assign(allocation_ratio=1 /
                                               cw_replacement_2['naics_count'])

    return cw_replacement_2
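To see what the melt produces, here is a runnable sketch on a toy two-year crosswalk; the split of the 2007 code '111100' into two 2012 codes is hypothetical, chosen to show a non-1:1 mapping:

import pandas as pd

# toy crosswalk; the 2007 code '111100' splitting into two 2012 codes is made up
cw = pd.DataFrame({
    'NAICS_2012_Code': ['111110', '111120', '212310'],
    'NAICS_2007_Code': ['111100', '111100', '212310'],
})

cw_melt = cw.melt(id_vars='NAICS_2012_Code',
                  var_name='NAICS_year', value_name='NAICS')
cw_melt = cw_melt.dropna(how='any')[['NAICS_2012_Code', 'NAICS']].drop_duplicates()
# keep only rows where the code actually changed between years
cw_melt = cw_melt[cw_melt['NAICS_2012_Code'] != cw_melt['NAICS']]

# when one old code maps to n 2012 codes, each 2012 code receives 1/n of the flow
cw_melt = cw_melt.assign(
    naics_count=cw_melt.groupby('NAICS')['NAICS_2012_Code'].transform('count'))
cw_melt = cw_melt.assign(allocation_ratio=1 / cw_melt['naics_count'])
print(cw_melt)  # '111100' maps to '111110' and '111120', allocation_ratio 0.5 each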
Example #3
File: mapping.py Project: bl-young/flowsa
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the crosswalk.
    :param df: df with a 'Sector' column of (possibly aggregated) codes
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df with disaggregated sectors
    """
    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]
    # Create mapping df that's just the sectors at first
    sectors = sectors.drop_duplicates().dropna()

    # fill null values
    df['Sector'] = df['Sector'].astype('str')

    naics_df = pd.DataFrame([])
    for i in df['Sector']:
        dig = len(str(i))
        n = sectors.loc[sectors[sectorsourcename].apply(
            lambda x: str(x[0:dig])) == i]
        n = n.assign(Sector=i)  # assign avoids writing to a slice view
        naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[[
        'ActivitySourceName', 'Activity', 'Sector', 'SectorType'
    ]]

    return naics_expanded
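The expansion step is a prefix match: every crosswalk code whose leading digits equal an aggregated code gets pulled in. A hedged sketch on toy codes, using pandas string slicing in place of the apply above:

import pandas as pd

# toy sector column; codes are illustrative
sectors = pd.DataFrame(
    {'NAICS_2012_Code': ['11', '111', '1111', '111110', '111120', '212310']})

aggregated_code = '1111'
dig = len(aggregated_code)
# every code whose leading digits equal the aggregated code is pulled in
expanded = sectors[sectors['NAICS_2012_Code'].str[0:dig] == aggregated_code]
print(expanded['NAICS_2012_Code'].tolist())  # ['1111', '111110', '111120']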
Example #4
def replace_naics_w_naics_2012(df, sectorsourcename):
    """
    Check if activity-like sectors are in fact sectors. Also works for the Sector column
    :return:
    """
    # test
    # df = mapping.copy()
    # drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # load the mastercrosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    possible_column_headers = [
        'Sector', 'SectorProducedBy', 'SectorConsumedBy'
    ]
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics2012) != 0:
        log.info(
            'Checking if sectors represent a different NAICS year, if so, replace with NAICS 2012'
        )
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the 'NAICS_2012_Code' column, use that value to replace sector in column c
            df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df['NAICS_2012_Code'],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
        log.info('Replaced NAICS with NAICS 2012 Codes')

        # check if there are any sectors that are not in the naics 2012 crosswalk
        log.info('Check again for non NAICS 2012 Codes')
        check_if_sectors_are_naics(df, cw, column_headers)

    else:
        log.info('No sectors require substitution')

    return df
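A compact illustration of the merge-replace-rescale loop above, with a toy frame and a pre-melted toy crosswalk (codes and ratios made up). It scales FlowAmount before overwriting the sector; that is equivalent to the ordering above because the melted crosswalk only contains codes that actually changed:

import pandas as pd

# toy frame whose Sector column holds a non-2012 code '111100' (made up)
df = pd.DataFrame({'Sector': ['111100', '212310'],
                   'FlowAmount': [100.0, 50.0]})

# toy pre-melted crosswalk, as produced by melt_naics_crosswalk above
cw_melt = pd.DataFrame({'NAICS_2012_Code': ['111110', '111120'],
                        'NAICS': ['111100', '111100'],
                        'allocation_ratio': [0.5, 0.5]})

df = df.merge(cw_melt, left_on='Sector', right_on='NAICS', how='left')
matched = df['Sector'] == df['NAICS']
# scale the flow, then swap in the 2012 code
df.loc[matched, 'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
df.loc[matched, 'Sector'] = df['NAICS_2012_Code']
df = df.drop(columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
print(df)  # '111100' splits into '111110' and '111120' at 50.0 each; '212310' keeps 50.0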
Example #5
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the crosswalk.
    :param df: df with a 'Sector' column
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df with disaggregated sectors
    """

    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]
    # drop duplicates
    sectors = sectors.drop_duplicates().dropna()
    # add non-naics to sector list
    household = load_household_sector_codes()
    household = pd.DataFrame(household['Code'].drop_duplicates())
    household.columns = [sectorsourcename]
    sectors = sectors.append(
        household, sort=False).drop_duplicates().reset_index(drop=True)
    # drop rows that contain hyphenated sectors
    sectors = sectors[~sectors[sectorsourcename].str.
                      contains("-")].reset_index(drop=True)
    # Ensure 'None' not added to sectors
    sectors = sectors[sectors[sectorsourcename] != "None"]

    # create a list of the sectors that exist in the original df; only these codes are expanded below
    existing_sectors = df[['Sector']]
    existing_sectors = existing_sectors.drop_duplicates()

    naics_df = pd.DataFrame([])
    for i in existing_sectors['Sector']:
        dig = len(str(i))
        n = sectors.loc[sectors[sectorsourcename].apply(lambda x: x[0:dig]) ==
                        i]
        if len(n) != 0:
            n = n.assign(Sector=i)
            naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[[
        'ActivitySourceName', 'Activity', 'Sector', 'SectorType'
    ]]

    return naics_expanded
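Two details distinguish this version from Example #3: household codes are appended to the sector list, and hyphenated range entries are dropped. A small sketch of those two steps; the 'F010' household code is illustrative, and pd.concat stands in for the since-removed DataFrame.append:

import pandas as pd

# toy sector list with a hyphenated range; 'F010' household code is illustrative
sectors = pd.DataFrame({'NAICS_2012_Code': ['31-33', '311', '44511']})
household = pd.DataFrame({'NAICS_2012_Code': ['F010']})

# pd.concat stands in for the now-removed DataFrame.append used above
sectors = pd.concat([sectors, household],
                    sort=False).drop_duplicates().reset_index(drop=True)
# hyphenated entries such as '31-33' are ranges, not codes, so drop them
sectors = sectors[~sectors['NAICS_2012_Code']
                  .str.contains('-')].reset_index(drop=True)
print(sectors['NAICS_2012_Code'].tolist())  # ['311', '44511', 'F010']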
Example #6
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be sector-like are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list, or None if all activities match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])
    #activities.remove("None")

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[SECTOR_SOURCE_NAME])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        vLog.debug("%s activities not matching sectors in default %s list",
                   str(len(activities_missing_sectors)), SECTOR_SOURCE_NAME)
        return activities_missing_sectors
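This variant differs from Example #1 mainly in its logging: it passes %-style arguments to the logger instead of concatenating strings, so formatting is deferred until a DEBUG record is actually emitted, and it implicitly returns None when everything matches. A standalone illustration (the logger name here is assumed):

import logging

logging.basicConfig(level=logging.DEBUG)
vLog = logging.getLogger('flowsa.validation')  # logger name assumed

missing = {'Irrigation', 'Livestock'}
# %-style arguments defer formatting until the record is actually emitted
vLog.debug("%s activities not matching sectors in default %s list",
           len(missing), 'NAICS_2012_Code')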
Example #7
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name):
    """
    Add Sectors from the Activity fields and map them to Sector from the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :return: a df with activity fields mapped to 'sectors'
    """

    mappings = []

    # First check if source activities are NAICS like - if so make it into a mapping file

    cat = load_source_catalog()

    for s in pd.unique(flowbyactivity_df['SourceName']):
        src_info = cat[s]
        # read the pre-determined level of sector aggregation of each crosswalk from the source catalog
        levelofSectoragg = src_info['sector_aggregation_level']
        # if data are provided in NAICS format, use the mastercrosswalk
        if src_info['sector-like_activities']:
            cw = load_sector_crosswalk()
            sectors = cw.loc[:, [sector_source_name]]
            # Create mapping df that's just the sectors at first
            mapping = sectors.drop_duplicates()
            # Add the sector twice as activities so mapping is identical
            mapping = mapping.assign(Activity=sectors[sector_source_name])
            mapping = mapping.rename(columns={sector_source_name: "Sector"})
            # add columns needed to run expand_naics_list();
            # when sector-like_activities = True these columns are missing
            mapping['ActivitySourceName'] = s
            # tmp assignment
            mapping['SectorType'] = None
            # Include all digits of naics in mapping, if levelofSectoragg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        else:
            # if source data activities are text strings, call on the manually created source crosswalks
            mapping = get_activitytosector_mapping(s)
            # filter by SectorSourceName of interest
            mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
            # drop SectorSourceName
            mapping = mapping.drop(columns=['SectorSourceName'])
            # Include all digits of naics in mapping, if levelofSectoragg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        mappings.append(mapping)
    mappings_df = pd.concat(mappings, sort=False)
    # Merge the mappings into flowbyactivity, once per activity field
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mappings_df.rename(
            columns={
                'Activity': flowbyactivity_field,
                'Sector': flowbysector_field,
                'SectorType': sector_type_field
            })
        # column doesn't exist for sector-like activities, so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp,
                                             how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})

    return flowbyactivity_wsector_df
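The closing loop renames the generic Activity/Sector columns once per direction and left-merges, so unmapped activities survive with empty sectors rather than being dropped. A toy sketch of one direction (column names follow the renaming pattern above; activity names are made up):

import pandas as pd

# toy mapping and FBA rows; activity names are made up
mapping = pd.DataFrame({'Activity': ['Irrigation'],
                        'Sector': ['111110'],
                        'SectorType': [None]})
fba = pd.DataFrame({'ActivityProducedBy': ['Irrigation', 'Unmapped activity'],
                    'FlowAmount': [10.0, 5.0]})

# rename once per direction; the ProducedBy names follow the pattern above
m = mapping.rename(columns={'Activity': 'ActivityProducedBy',
                            'Sector': 'SectorProducedBy',
                            'SectorType': 'ProducedBySectorType'})
# the left merge keeps 'Unmapped activity' with a NaN sector instead of dropping it
fba = fba.merge(m, how='left', on='ActivityProducedBy')
print(fba)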
Example #8
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name,
                                  **kwargs):
    """
    Add Sectors from the Activity fields and map them to Sector from the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod', which modifies function behavior when set to 'direct'
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like - if so make it into a mapping file

    cat = load_source_catalog()

    # for s in pd.unique(flowbyactivity_df['SourceName']):
    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source
    src_info = cat[s]
    # if activities are sector-like, check if need to modify mapping
    if 'modify_sector-like_activities' in src_info:
        modify_sector_like_activities = src_info[
            'modify_sector-like_activities']
    else:
        modify_sector_like_activities = False
    # read the pre-determined level of sector aggregation of each crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # if the allocation method is 'direct', or if specified in the fxn call, overwrite levelofSectoragg
    if kwargs != {}:
        if 'allocationmethod' in kwargs:
            if kwargs['allocationmethod'] == 'direct':
                levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info[
            'sector-like_activities'] and modify_sector_like_activities is False:
        cw = load_sector_crosswalk()
        sectors = cw.loc[:, [sector_source_name]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sector twice as activities so mapping is identical
        mapping = mapping.assign(Activity=sectors[sector_source_name])
        mapping = mapping.rename(columns={sector_source_name: "Sector"})
        # add columns needed to run expand_naics_list();
        # when sector-like_activities = True these columns are missing
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping, if levelofSectoragg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like activities
        # should be modified, call on the manually created source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping, if levelofSectoragg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    # Merge the mappings into flowbyactivity, once per activity field
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={
                'Activity': flowbyactivity_field,
                'Sector': flowbysector_field,
                'SectorType': sector_type_field
            })
        # column doesn't exist for sector-like activities, so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp,
                                             how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.assign(
        SectorSourceName=sectorsourcename)

    # if activities are sector-like check that the sectors are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df = replace_naics_w_naics_2012(
            flowbyactivity_wsector_df, sectorsourcename)

    return flowbyactivity_wsector_df
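The kwargs handling above can be isolated into a tiny helper to make the precedence visible: 'direct' allocation forces disaggregated mapping, and an explicit overwrite_sectorlevel wins over both. resolve_sector_agg_level is a hypothetical name, not part of flowsa:

# resolve_sector_agg_level is a hypothetical helper, not part of flowsa
def resolve_sector_agg_level(default_level, **kwargs):
    level = default_level
    if kwargs.get('allocationmethod') == 'direct':
        level = 'disaggregated'
    # an explicit overwrite_sectorlevel wins, mirroring the ordering above
    if 'overwrite_sectorlevel' in kwargs:
        level = kwargs['overwrite_sectorlevel']
    return level

print(resolve_sector_agg_level('aggregated'))  # 'aggregated'
print(resolve_sector_agg_level('aggregated',
                               allocationmethod='direct'))  # 'disaggregated'
print(resolve_sector_agg_level('aggregated',
                               overwrite_sectorlevel='disaggregated'))  # 'disaggregated'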
Example #9
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non-sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the mastercrosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']
    # # list of column headers that do exist in the df being aggregated
    # column_headers = [e for e in possible_column_headers if e in df.columns.values.tolist()]

    # check if there are any sectors that are not in the sectorsourcename crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug(
            'Checking if sectors represent a different '
            'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check again for sectors that are not in the sectorsourcename crosswalk
        vLog.debug('Check again for codes not in %s', sectorsourcename)
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = ('FlowAmount', 'Spread', 'Min', 'Max',
                                   'DataReliability', 'TemporalCorrelation',
                                   'GeographicalCorrelation',
                                   'TechnologicalCorrelation',
                                   'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [
            e for e in df.columns.values.tolist()
            if e not in possible_column_headers
        ]
        # groupby_cols = list(df.select_dtypes(include=['object']).columns)
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull())
                     & (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
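The np.where replacement above is guarded twice: the merged NAICS value must match column c, and the current value must be in the known non-sector list, so valid codes are left untouched. A toy sketch of that step, picking up mid-loop after the merge (codes made up):

import numpy as np
import pandas as pd

# toy frame as it looks mid-loop, after the merge with the melted crosswalk
df = pd.DataFrame({'SectorProducedBy': ['111100', '212310'],
                   'NAICS': ['111100', None],
                   'NAICS_2012_Code': ['111110', None]})
non_naics = ['111100']  # codes flagged by check_if_sectors_are_naics

# replace only values that matched the crosswalk AND are known non-target codes
df['SectorProducedBy'] = np.where(
    (df['SectorProducedBy'] == df['NAICS']) &
    (df['SectorProducedBy'].isin(non_naics)),
    df['NAICS_2012_Code'], df['SectorProducedBy'])
print(df['SectorProducedBy'].tolist())  # ['111110', '212310']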