Example #1
def link_non_bls_naics_to_naics(df):
    """
    BLS has 6 digit naics that are not recognized by Census. Map bls data to naics
    :param df:
    :return:
    """

    # load sector crosswalk
    cw = load_sector_length_crosswalk()
    cw_sub = cw[['NAICS_6']].drop_duplicates()

    # subset the df to the 6 digit naics
    df_sub = df[df['Activity'].apply(lambda x: len(x) == 6)]

    # create a list of 6 digit bls activities that are not naics
    unmapped_df = pd.merge(
        df_sub,
        cw_sub,
        indicator=True,
        how='outer',
        left_on='Activity',
        right_on='NAICS_6').query('_merge=="left_only"').drop(
            '_merge', axis=1).reset_index(drop=True)
    act_list = unmapped_df['Activity'].values.tolist()

    # if the activity is in the unmapped list, modify the sector so its last digit is 0

    df['Sector'] = np.where(df['Activity'].isin(act_list),
                            df['Activity'].apply(lambda x: x[0:5]) + '0',
                            df['Sector'])

    return df
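
# A minimal, self-contained sketch of the anti-join pattern used above: merge
# with indicator=True, then keep only the '_merge == "left_only"' rows. The
# activity codes and the toy crosswalk below are invented for illustration;
# the real function pulls the crosswalk from load_sector_length_crosswalk().
import pandas as pd

bls = pd.DataFrame({'Activity': ['111110', '999999']})
cw_sub = pd.DataFrame({'NAICS_6': ['111110', '111120']})

unmapped = (pd.merge(bls, cw_sub, indicator=True, how='outer',
                     left_on='Activity', right_on='NAICS_6')
            .query('_merge=="left_only"')
            .drop('_merge', axis=1)
            .reset_index(drop=True))
# unmapped['Activity'] -> ['999999'], the BLS code with no Census NAICS match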
Example #2
def naics_expansion(facility_NAICS):
    """ modeled after sector_disaggregation in flowbyfunctions, updates NAICS 
    to more granular sectors if there is only one naics at a lower level
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, replacing each code with the next
    # NAICS level where only one child NAICS exists
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length = i
        df_subset = facility_NAICS.loc[facility_NAICS["NAICS"].apply(
            lambda x: len(x) == i)]

        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset = df_subset.loc[(df_subset['NAICS'].isin(sector_list))]

        # merge the naics cw
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['NAICS'],
                             right_on=[sector_merge])
        # drop columns and rename new sector columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records with NAICS that have now been expanded
        facility_NAICS = facility_NAICS[~facility_NAICS['NAICS'].
                                        isin(sector_list)]

        # append new naics to df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
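
# Hedged illustration of the "only one child NAICS" filter used above:
# drop_duplicates with keep=False removes every parent code that appears more
# than once in the crosswalk, leaving only parents with a single, unambiguous
# child. The crosswalk values below are invented for illustration.
import pandas as pd

cw4 = pd.DataFrame({'NAICS_4': ['1111', '1111', '2211'],
                    'NAICS_5': ['11111', '11112', '22111']})
one_child = cw4.drop_duplicates(subset=['NAICS_4'], keep=False)
naics4 = one_child['NAICS_4'].values.tolist()
# naics4 -> ['2211'], because '1111' maps to two possible NAICS_5 children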
Example #3
def sector_disaggregation_generalized(fbs):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df
    :return: A FBS df with missing naics5 and naics6
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'], keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'], keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, appending rows at the next
    # NAICS level where only one child NAICS exists
    for i in range(4, 6):

        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to sectors with length = i and length = i + 1
        df_subset = fbs.loc[fbs['Sector'].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset.loc[:, 'Sector_tmp'] = df_subset['Sector'].apply(lambda x: x[0:i])
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset = df_subset.loc[df_subset['Sector_tmp'].isin(sector_list)]
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        df_subset = df_subset.drop_duplicates(subset=['FlowName', 'Compartment', 'Location', 'Sector_tmp'],
                                              keep=False).reset_index(drop=True)
        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['Sector_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "ST"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["Sector", "Sector_tmp"])
        new_naics = new_naics.rename(columns={"ST": "Sector"})
        # append new naics to df
        fbs = pd.concat([fbs, new_naics], sort=True)

    return fbs
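
# Sketch of the duplicate-key drop used above, with invented values: when two
# rows share the same flow key and truncated Sector_tmp, a more disaggregated
# NAICS already exists in the data, so neither row should be expanded.
# drop_duplicates with keep=False removes both.
import pandas as pd

sub = pd.DataFrame({'FlowName': ['Water', 'Water', 'Water'],
                    'Compartment': ['air', 'air', 'air'],
                    'Location': ['00000', '00000', '00000'],
                    'Sector_tmp': ['1111', '1111', '2211']})
eligible = sub.drop_duplicates(
    subset=['FlowName', 'Compartment', 'Location', 'Sector_tmp'],
    keep=False)
# only the '2211' row remains eligible for disaggregation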
Example #4
def get_sector_list(sector_level):
    """
    Return the unique sector codes in the sector length crosswalk at the
    given NAICS level
    :param sector_level: str, name of a crosswalk column (e.g. 'NAICS_6')
    :return: list of unique sector codes at that level
    """

    cw = load_sector_length_crosswalk()
    sector_list = cw[sector_level].unique().tolist()

    return sector_list
Example #5
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target sector level
    Equally allocate parent NAICS to child NAICS where child NAICS missing
    :param df: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull())
                  & (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            vLogDetailed.warning(
                'Data found at %s digit NAICS not represented in current '
                'data subset: %s', str(i), ' '.join(map(str, rl_list)))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to each %s associated with '
            'the sectors previously dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
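
# Minimal sketch of the equal-allocation step above: count how many
# target-level children each parent NAICS has (groupby + transform('count')),
# then divide the parent FlowAmount by that count. Codes and amounts are
# invented for illustration.
import pandas as pd

cw = pd.DataFrame({'NAICS_4': ['1111', '1111', '2211'],
                   'NAICS_6': ['111110', '111120', '221111']})
cw['sector_count'] = cw.groupby('NAICS_4')['NAICS_4'].transform('count')

parent = pd.DataFrame({'NAICS_4': ['1111'], 'FlowAmount': [100.0]})
alloc = pd.merge(parent, cw, how='left', on='NAICS_4')
alloc['FlowAmount'] = alloc['FlowAmount'] / alloc['sector_count']
# each of the two NAICS_6 children of '1111' now carries 50.0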
Example #6
def sector_disaggregation(sector_disaggregation):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df
    :return: A FBS df with missing naics5 and naics6
    """

    # todo: modify so this works with either an FBA with sectors or an FBS,
    #  because it is called from a function that accepts either

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'], keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'], keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, appending rows at the next
    # NAICS level where only one child NAICS exists
    for i in range(4, 6):
        sector_disaggregation = clean_df(sector_disaggregation, flow_by_sector_fields, fbs_fill_na_dict)

        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to sectors with length = i and length = i + 1
        df_subset = sector_disaggregation.loc[sector_disaggregation[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                                              sector_disaggregation[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset.loc[:, 'SectorProduced_tmp'] = df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i])
        df_subset.loc[:, 'SectorConsumed_tmp'] = df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i])
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) |
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2], sort=False)
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        df_subset = df_subset.drop_duplicates(subset=['Flowable', 'Context', 'Location', 'SectorProduced_tmp',
                                                      'SectorConsumed_tmp'], keep=False).reset_index(drop=True)
        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp",
                                            "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # append new naics to df
        sector_disaggregation = pd.concat([sector_disaggregation, new_naics], sort=True)

    return sector_disaggregation
Example #7
def sector_disaggregation(df):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df)

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()

    # for loop min length to 6 digits, where min length cannot be less than 2
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=
                                     df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=
                                     df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3], sort=False)
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        # list of column headers, that if exist in df, should be
        # aggregated using the weighted avg fxn
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers if e
                        in df_subset.columns.values.tolist()]

        df_subset = df_subset.drop_duplicates(subset=cols_to_drop,
                                              keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy",
                                            "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # append new naics to df
        new_naics['SectorConsumedBy'] = new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True)
    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    return df
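
# Small sketch of the minimum-sector-length calculation above, with invented
# data. Empty strings (standing in for NoneType) have length 0, which is why
# the loop start is floored at 2 before expanding sectors.
import pandas as pd

fbs = pd.DataFrame({'SectorProducedBy': ['1111', ''],
                    'SectorConsumedBy': ['22111', '311']})
length = fbs[['SectorProducedBy', 'SectorConsumedBy']].apply(
    lambda x: x.str.len()).min().min()
# length -> 0 because of the empty string, so it is raised to the floor of 2
length = max(length, 2)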
Example #8
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to the geoscale we want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # pad the Location column with zeros, if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5
                                                                    else x
                                                                    )

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                # aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both Sector columns from the flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
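
# Hedged sketch of the allocation-ratio merge used in main(), with invented
# values: ratios are merged once on SectorProducedBy and once on
# SectorConsumedBy, the two FlowAmountRatio columns are coalesced, and
# FlowAmount is scaled by the result.
import pandas as pd

fbs = pd.DataFrame({'Location': ['06000'], 'SectorProducedBy': ['1111'],
                    'SectorConsumedBy': [None], 'FlowAmount': [100.0]})
ratios = pd.DataFrame({'Location': ['06000'], 'Sector': ['1111'],
                       'FlowAmountRatio': [0.25]})

fbs = fbs.merge(ratios, left_on=['Location', 'SectorProducedBy'],
                right_on=['Location', 'Sector'], how='left')
fbs = fbs.merge(ratios, left_on=['Location', 'SectorConsumedBy'],
                right_on=['Location', 'Sector'], how='left')
fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio'].fillna(0)
# FlowAmount -> 25.0, scaled by the ratio matched on SectorProducedBy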
Example #9
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS is not target_level
    :param df:
    :param target_sector_level:
    :return:
    """

    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated'
                        ': {}'.format(' '.join(map(str, sector_list))))
            rows_lost = pd.concat([rows_lost, df_x], ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated