Example No. 1
def sector_disaggregation_generalized(fbs, group_cols):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df
    :return: A FBS df with missing naics5 and naics6
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()

    # loop from the shortest sector length present up to 6 digits
    length = fbs['Sector'].str.len().min()
    # appends missing naics levels to df
    for i in range(length, 6):

        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the crosswalk by naics length, first dropping the duplicate
        # parent/child pairs created by the more granular crosswalk rows
        cw = cw_load[[sector_merge, sector_add]].drop_duplicates()
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = fbs[fbs['Sector'].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(Sector_tmp=df_subset['Sector'].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset = df_subset.loc[df_subset['Sector_tmp'].isin(sector_list)]
        # drop all rows with duplicate temp values, as a less aggregated naics exists;
        # build the subset columns locally so group_cols is not mutated across iterations
        subset_cols = [e for e in group_cols if e != 'Sector'] + ['Sector_tmp']
        df_subset2 = df_subset.drop_duplicates(subset=subset_cols,
                                               keep=False).reset_index(drop=True)
        # merge the naics cw
        new_naics = pd.merge(df_subset2, cw[[sector_merge, sector_add]],
                             how='left', left_on=['Sector_tmp'], right_on=[sector_merge])
        # add a column counting the rows associated with each parent naics
        new_naics = new_naics.assign(sector_count=new_naics.groupby(
            ['Location', 'Sector_tmp'])['Sector_tmp'].transform('count'))
        # only keep the rows where the count is 1
        new_naics2 = new_naics[new_naics['sector_count'] == 1].drop(columns=['sector_count'])
        # warn if rows with more than one child naics are dropped - these will need a method of estimation
        missing_naics = new_naics[new_naics['sector_count'] > 1]
        if len(missing_naics) > 0:
            missing_naics = missing_naics[['Location', 'Sector']].values.tolist()
            log.warning('There is data at sector length ' + str(i) + ' that is lost at sector length ' + str(i+1) +
                        ' for ' + str(missing_naics))
        new_naics2 = new_naics2.rename(columns={sector_add: "ST"})
        new_naics2 = new_naics2.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics2 = new_naics2.drop(columns=["Sector", "Sector_tmp"])
        new_naics2 = new_naics2.rename(columns={"ST": "Sector"})
        # append any new naics rows to the df
        if len(new_naics2) > 0:
            fbs = pd.concat([fbs, new_naics2], sort=True)

    return fbs
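The core trick in this example is pandas' drop_duplicates with keep=False: a parent code that appears more than once in the parent/child crosswalk has multiple children and is discarded, leaving only the unambiguous 1:1 mappings. A minimal, self-contained sketch of that filter (toy codes for illustration, pandas only):

import pandas as pd

# toy parent/child crosswalk: '1111' has two children, '1112' has one
cw = pd.DataFrame({'NAICS_4': ['1111', '1111', '1112'],
                   'NAICS_5': ['11111', '11112', '11121']})
# keep=False drops every row whose parent appears more than once,
# leaving only the unambiguous 1:1 parent:child mappings
one_to_one = cw.drop_duplicates(subset=['NAICS_4'], keep=False)
print(one_to_one)  # only the 1112 -> 11121 row remains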
Example No. 2
def naics_expansion(facility_NAICS):
    """ modeled after sector_disaggregation in flowbyfunctions, updates NAICS 
    to more granular sectors if there is only one naics at a lower level
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns to unique pairs, then keep only the
    # NAICS_4 codes with exactly one NAICS_5 child
    cw4 = cw_load[['NAICS_4', 'NAICS_5']].drop_duplicates()
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns to unique pairs, then keep only the
    # NAICS_5 codes with exactly one NAICS_6 child
    cw5 = cw_load[['NAICS_5', 'NAICS_6']].drop_duplicates()
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop over naics 4 and naics 5, replacing each qualifying code
    # with its single child at the next level
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length = i
        df_subset = facility_NAICS.loc[facility_NAICS["NAICS"].apply(
            lambda x: len(x) == i)]

        # subset the df to the rows where the naics code is in the 1:1 sector list
        df_subset = df_subset.loc[df_subset['NAICS'].isin(sector_list)]

        # merge the naics cw
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['NAICS'],
                             right_on=[sector_merge])
        # drop columns and rename new sector columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records with NAICS that have now been expanded
        facility_NAICS = facility_NAICS[~facility_NAICS['NAICS'].
                                        isin(sector_list)]

        # append new naics to df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
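A sketch of the expansion step in isolation: merge the 1:1 crosswalk rows onto a toy facility table and adopt the single child code where one exists (toy codes and a hypothetical FacilityID column, pandas only; the function above instead drops the expanded records and concatenates the merged rows, which has the same effect):

import pandas as pd

facilities = pd.DataFrame({'FacilityID': ['A', 'B'],
                           'NAICS': ['1112', '3254']})
# assume cw4 already holds only the 1:1 NAICS_4 -> NAICS_5 rows
cw4 = pd.DataFrame({'NAICS_4': ['1112'], 'NAICS_5': ['11121']})
expanded = facilities.merge(cw4, how='left',
                            left_on='NAICS', right_on='NAICS_4')
# adopt the single child where one exists; otherwise keep the original code
expanded['NAICS'] = expanded['NAICS_5'].fillna(expanded['NAICS'])
expanded = expanded.drop(columns=['NAICS_4', 'NAICS_5'])
print(expanded)  # A -> 11121, B unchanged at 3254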
Example No. 3
def sector_disaggregation(sector_disaggregation, groupby_dict):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df
    :return: A FBS df with missing naics5 and naics6
    """

    # todo: modify by adding count column - only want to keep the 1:1 parent:child relationship

    sector_disaggregation = clean_df(sector_disaggregation, groupby_dict, fbs_fill_na_dict)

    # ensure None values are not strings
    sector_disaggregation['SectorConsumedBy'] = sector_disaggregation['SectorConsumedBy'].replace({'None': ""})
    sector_disaggregation['SectorProducedBy'] = sector_disaggregation['SectorProducedBy'].replace({'None': ""})

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()

    # find the shortest sector length present, ignoring empty cells
    lengths = sector_disaggregation[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len())
    length = int(lengths[lengths > 0].min().min())
    # appends missing naics levels to df
    for i in range(length, 6):

        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the crosswalk by naics length, first dropping the duplicate
        # parent/child pairs created by the more granular crosswalk rows
        cw = cw_load[[sector_merge, sector_add]].drop_duplicates()
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = sector_disaggregation.loc[sector_disaggregation[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                                              sector_disaggregation[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3], sort=False)
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        df_subset = df_subset.drop_duplicates(subset=['Flowable', 'Context', 'Location', 'SectorProduced_tmp',
                                                      'SectorConsumed_tmp'], keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp",
                                            "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # ensure missing child codes (unmatched in the crosswalk) are empty strings
        new_naics['SectorConsumedBy'] = new_naics['SectorConsumedBy'].replace({np.nan: "", 'nan': ""})
        new_naics['SectorProducedBy'] = new_naics['SectorProducedBy'].replace({np.nan: "", 'nan': ""})
        # append new naics to df
        sector_disaggregation = pd.concat([sector_disaggregation, new_naics], sort=True)
    # replace blank strings with None
    sector_disaggregation = sector_disaggregation.replace({'': None})
    sector_disaggregation = sector_disaggregation.replace({np.nan: None})

    return sector_disaggregation
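The todo at the top of this example (keep only the 1:1 parent:child relationships via a count column) can be sketched with groupby/transform, the same idiom Examples 1 and 4 use (toy codes, pandas only):

import pandas as pd

cw = pd.DataFrame({'NAICS_5': ['11111', '11121', '11121'],
                   'NAICS_6': ['111110', '111211', '111219']})
# count the children associated with each parent
cw['sector_count'] = cw.groupby('NAICS_5')['NAICS_5'].transform('count')
# rows with a count of 1 are the unambiguous disaggregations
print(cw[cw['sector_count'] == 1])  # only 11111 -> 111110 remains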
Example No. 4
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS is not target_level
    :param df:
    :param target_sector_level:
    :return:
    """

    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to the current and target sector lengths;
        # reverse-lookup the sector_level_key name for length i
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated'
                        ': {}'.format(' '.join(map(str, sector_list))))
            rows_lost = pd.concat([rows_lost, df_x], ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated = df_allocated.reset_index(drop=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
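The equal-allocation arithmetic reduces to a merge followed by division by the child count: a 90.0 flow at a 3-digit parent with three 6-digit children becomes 30.0 per child. A minimal sketch (toy codes, pandas only):

import pandas as pd

flows = pd.DataFrame({'SectorProducedBy': ['111'], 'FlowAmount': [90.0]})
cw = pd.DataFrame({'NAICS_3': ['111', '111', '111'],
                   'NAICS_6': ['111110', '111120', '111130']})
# count the target-level children under each parent
cw['sector_count'] = cw.groupby('NAICS_3')['NAICS_3'].transform('count')
out = flows.merge(cw, how='left',
                  left_on='SectorProducedBy', right_on='NAICS_3')
out['FlowAmount'] = out['FlowAmount'] / out['sector_count']  # 30.0 each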
Example No. 5
def check_if_losing_sector_data(df, df_subset, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target sector level
    In some instances, not all
    :param fbs:
    :return:
    """

    df = df.fillna(fbs_fill_na_dict)
    # exclude nonsectors
    df = df.replace({'nan': '', 'None': ''})

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length (a code of length i + 1 in either sector
        # column qualifies, so the union is sufficient on its own)
        df_y = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) & (df_m['spb_tmp'].isnull())]
        # clean df
        rl = clean_df(rl, flow_by_sector_fields, fbs_fill_na_dict)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors:
        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk_w_nonnaics()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS not represented in current '
                        'data subset: {}'.format(' '.join(map(str, rl_list))))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info(
            'No data loss from subsetting the dataframe by specified sector length'
        )
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level +
                 ' associated with the sectors previously being dropped')

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df_subset, rows_lost],
                               ignore_index=True,
                               sort=True)
    df_w_lost_data = df_w_lost_data.replace({'': None})

    return df_w_lost_data
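The detection step at the heart of this example is a left merge followed by a null check on the right-hand key: parent rows with no truncated child present in the subset come back null and are the rows at risk of being lost. A minimal sketch (toy codes, pandas only):

import pandas as pd

parents = pd.DataFrame({'Sector': ['111', '112']})
# truncated (length-3) prefixes of the more specific codes in the subset
children = pd.DataFrame({'tmp': ['111']})
m = parents.merge(children, how='left', left_on='Sector', right_on='tmp')
lost = m[m['tmp'].isnull()]  # '112' has no more specific representation
print(lost)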