Example 1
def compare_FBS_results(fbs1_load, fbs2_load, ignore_metasources=False):
    """
    Compare a parquet on Data Commons to a parquet stored locally
    :param fbs1_load: str, name of the first FBS method (loaded via getFlowBySector)
    :param fbs2_load: str, name of the second FBS method (loaded via getFlowBySector)
    :param ignore_metasources: bool, True to compare fbs without
    matching metasources
    :return: df, comparison of the two dfs
    """
    import flowsa

    # load first file (must be saved locally)
    df1 = flowsa.getFlowBySector(fbs1_load).rename(
        columns={'FlowAmount': 'FlowAmount_fbs1'})
    df1 = replace_strings_with_NoneType(df1)
    # load second file (must be saved locally)
    df2 = flowsa.getFlowBySector(fbs2_load).rename(
        columns={'FlowAmount': 'FlowAmount_fbs2'})
    df2 = replace_strings_with_NoneType(df2)
    # compare df
    merge_cols = [
        'Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
        'SectorSourceName', 'Context', 'Location', 'LocationSystem', 'Unit',
        'FlowType', 'Year', 'MetaSources'
    ]
    if ignore_metasources:
        merge_cols.remove('MetaSources')
    # check units
    compare_df_units(df1, df2)
    df_m = pd.merge(df1[merge_cols + ['FlowAmount_fbs1']],
                    df2[merge_cols + ['FlowAmount_fbs2']],
                    how='outer')
    df_m = df_m.assign(FlowAmount_diff=df_m['FlowAmount_fbs2'] -
                       df_m['FlowAmount_fbs1'])
    df_m = df_m.assign(
        Percent_Diff=(df_m['FlowAmount_diff'] / df_m['FlowAmount_fbs1']) * 100)
    df_m = df_m[df_m['FlowAmount_diff'].apply(
        lambda x: round(abs(x), 2) != 0)].reset_index(drop=True)
    # if no differences, log that; otherwise sort and return the df subset of differences
    if len(df_m) == 0:
        vLog.debug('No differences between dataframes')
    else:
        vLog.debug('Differences exist between dataframes')
        df_m = df_m.sort_values([
            'Location',
            'SectorProducedBy',
            'SectorConsumedBy',
            'Flowable',
            'Context',
        ]).reset_index(drop=True)

    return df_m
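
A minimal usage sketch for the comparison above; the method names are placeholders for FBS parquets that are already saved locally:

# hypothetical FBS method names, both saved locally
df_diff = compare_FBS_results('Water_national_2015_m1',
                              'Water_national_2015_m1_v2',
                              ignore_metasources=True)
# rows where the rounded absolute FlowAmount difference is non-zero
print(df_diff[['Flowable', 'Location', 'FlowAmount_diff', 'Percent_Diff']])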
Example 2
def proportional_allocation_by_location(df):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run sector_aggregation()
    prior to using this function
    :param df: df, includes sector columns
    :return: df, with 'FlowAmountRatio' column
    """

    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # create the denominator based on the most aggregated (2-digit) sectors
    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]
    denom_df = denom_df.assign(Denominator=denom_df['FlowAmount'].groupby(
        denom_df['Location']).transform('sum'))
    denom_df_2 = denom_df[[
        'Location', 'LocationSystem', 'Year', 'Denominator'
    ]].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = allocation_df[
        'FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index()

    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
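
A rough illustration of the expected input and output, using a toy dataframe and assuming the flowsa helper functions referenced above are in scope (values are illustrative only):

import pandas as pd

toy = pd.DataFrame({
    'Location': ['06000', '06000'],
    'LocationSystem': ['FIPS_2015', 'FIPS_2015'],
    'Year': ['2015', '2015'],
    'SectorProducedBy': ['11', '1111'],
    'SectorConsumedBy': [None, None],
    'FlowAmount': [100.0, 40.0],
})
# the 2-digit row ('11') sets the Location denominator (100), so the
# 4-digit row receives FlowAmountRatio = 40 / 100 = 0.4
out = proportional_allocation_by_location(toy)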
Example 3
def collapse_activity_fields(df):
    """
    The 'activityconsumedby' and 'activityproducedby' columns from the
    allocation dataset do not always align with
    the dataframe being allocated. Generalize the allocation activity column.
    :param df: df, FBA used to allocate another FBA
    :return: df, single Activity column
    """

    df = replace_strings_with_NoneType(df)

    activity_consumed_list = df['ActivityConsumedBy'].drop_duplicates().values.tolist()
    activity_produced_list = df['ActivityProducedBy'].drop_duplicates().values.tolist()

    # if an activity field column is all 'None', drop that activity/sector
    # column pair and rename the remaining activity columns to generalize
    if all(v is None for v in activity_consumed_list):
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif all(v is None for v in activity_produced_list):
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')

    # drop other columns
    df = df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])

    return df
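
A small sketch of the generalization, assuming a toy allocation FBA where the produced-by columns are entirely empty and the flowsa helpers above are in scope (values are illustrative):

import pandas as pd

toy = pd.DataFrame({
    'ActivityProducedBy': [None, None],
    'ActivityConsumedBy': ['111110', '111120'],
    'SectorProducedBy': [None, None],
    'SectorConsumedBy': ['111110', '111120'],
    'ProducedBySectorType': [None, None],
    'ConsumedBySectorType': ['NAICS', 'NAICS'],
    'FlowAmount': [1.0, 2.0],
})
# ActivityProducedBy is all None, so the produced-by columns are dropped
# and the consumed-by columns are renamed to 'Activity' and 'Sector'
collapsed = collapse_activity_fields(toy)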
Example 4
def equal_allocation(fba_load):
    """
    Allocate an Activity in a FBA equally to all mapped sectors.
    Function only works if all mapped sectors are the same length

    :param fba_load: df, FBA with activity columns mapped to sectors
    :return: df, with FlowAmount equally allocated to all mapped sectors
    """
    # create groupby cols by which to determine allocation
    fba_cols = fba_load.select_dtypes([object]).columns.to_list()
    groupcols = [
        e for e in fba_cols
        if e not in ['SectorProducedBy', 'SectorConsumedBy', 'Description']
    ]
    # create counts of rows
    df_count = fba_load.groupby(groupcols, as_index=False,
                                dropna=False).size().astype(str)
    df_count = replace_strings_with_NoneType(df_count)

    # merge dfs
    dfm = fba_load.merge(df_count, how='left')
    # calc new flowamounts
    dfm['FlowAmount'] = dfm['FlowAmount'] / dfm['size'].astype(int)
    dfm = dfm.drop(columns='size')

    return dfm
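
A toy example of the equal allocation, assuming one activity mapped to two sectors of the same length (values are illustrative):

import pandas as pd

toy = pd.DataFrame({
    'ActivityProducedBy': ['Crop production', 'Crop production'],
    'SectorProducedBy': ['111110', '111120'],
    'Location': ['00000', '00000'],
    'FlowAmount': [100.0, 100.0],
})
# the two rows share the same groupby values (all object columns except
# the sector and Description columns), so each gets 100 / 2 = 50
allocated = equal_allocation(toy)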
Example 5
def proportional_allocation_by_location_and_activity(df, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated sector within a location
    :param df: df with sector columns
    :param sectorcolumn: str, sector column for which to create allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """

    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # denominator summed from highest level of sector grouped by location
    short_length = min(df[sectorcolumn].apply(lambda x: len(str(x))).unique())
    # want to create denominator based on short_length
    denom_df = df.loc[df[sectorcolumn].apply(
        lambda x: len(x) == short_length)].reset_index(drop=True)
    grouping_cols = [
        e for e in [
            'FlowName', 'Location', 'Activity', 'ActivityConsumedBy',
            'ActivityProducedBy'
        ] if e in denom_df.columns.values.tolist()
    ]
    denom_df.loc[:, 'Denominator'] = denom_df.groupby(
        grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers that, if present in the df, are used to
    # subset the denominator values
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers
        if e in denom_df.columns.values.tolist()
    ]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = denom_df[column_headers].drop_duplicates().reset_index(
        drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index(
        drop=True)

    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
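
A toy illustration of the ratios, assuming a collapsed helper df with a 'Sector' column and 'HelperFlow' values (values are illustrative):

import pandas as pd

toy = pd.DataFrame({
    'Location': ['06000', '06000', '06000'],
    'Activity': ['Irrigation', 'Irrigation', 'Irrigation'],
    'Sector': ['111', '1111', '1112'],
    'HelperFlow': [10.0, 6.0, 4.0],
    'FlowAmount': [50.0, 50.0, 50.0],
})
# the shortest sector length (3) defines the denominator (10.0), so the
# resulting FlowAmountRatio values are 1.0, 0.6 and 0.4
ratios = proportional_allocation_by_location_and_activity(toy, 'Sector')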
Example 6
def bls_clean_allocation_fba_w_sec_sat_table(df_w_sec, **kwargs):
    """
    Clean up the BLS df with sectors by estimating suppressed data
    :param df_w_sec: df, FBA format BLS QCEW data
    :param kwargs: additional arguments can include 'attr', a
    dictionary of FBA method yaml parameters
    :return: df, BLS QCEW FBA with estimated suppressed data
    """
    df_w_sec = df_w_sec.reset_index(drop=True)
    df2 = add_missing_flow_by_fields(
        df_w_sec, flow_by_activity_mapped_wsec_fields).reset_index(drop=True)
    df3 = replace_strings_with_NoneType(df2)

    return df3
Example 7
def proportional_allocation(df, attr):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run sector_aggregation()
    prior to using this function
    :param df: df, includes sector columns
    :param attr: dictionary, attributes for an activity set
    :return: df, with 'FlowAmountRatio' column
    """

    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # determine whether there are any additional columns beyond location
    # and sector on which to base the allocation ratios
    if 'allocation_merge_columns' in attr:
        groupby_cols = ['Location'] + attr['allocation_merge_columns']
        denom_subset_cols = [
            'Location', 'LocationSystem', 'Year', 'Denominator'
        ] + attr['allocation_merge_columns']
    else:
        groupby_cols = ['Location']
        denom_subset_cols = [
            'Location', 'LocationSystem', 'Year', 'Denominator'
        ]

    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]

    # generate denominator based on identified groupby cols
    denom_df = denom_df.assign(Denominator=denom_df.groupby(groupby_cols)
                               ['FlowAmount'].transform('sum'))
    # subset select columns by which to generate ratios
    denom_df_2 = denom_df[denom_subset_cols].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index()

    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
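
A sketch of how the attr dictionary steers the grouping; the key name comes from the function body, while the value and df_w_sectors are placeholders:

# with 'allocation_merge_columns', ratios are computed per Location plus
# the listed columns; without the key, per Location only
attr = {'allocation_merge_columns': ['FlowName']}   # illustrative value
allocation_df = proportional_allocation(df_w_sectors, attr)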
Example 8
def aggregator(df, groupbycols):
    """
    Aggregates the flowbyactivity or flowbysector 'FlowAmount' column in df
    and generates weighted average values, based on FlowAmount, for numeric
    columns
    :param df: df, Either flowbyactivity or flowbysector
    :param groupbycols: list, Either flowbyactivity or flowbysector columns
    :return: df, with aggregated columns
    """

    # reset index
    df = df.reset_index(drop=True)
    # tmp replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # drop rows where FlowAmount = 0
    df = df[df['FlowAmount'] != 0]

    # list of column headers, that if exist in df, should be
    # aggregated using the weighted avg fxn
    possible_column_headers = \
        ('Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
         'GeographicalCorrelation', 'TechnologicalCorrelation',
         'DataCollection')

    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    df_dfg = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})

    # run through other columns creating weighted average
    for e in column_headers:
        df_dfg[e] = get_weighted_average(df, e, 'FlowAmount', groupbycols)

    df_dfg = df_dfg.reset_index()
    df_dfg.columns = df_dfg.columns.droplevel(level=1)

    # if datatypes are strings, ensure that Null values remain NoneType
    df_dfg = replace_strings_with_NoneType(df_dfg)

    return df_dfg
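
A toy aggregation, assuming the flowsa weighted-average helper referenced above is in scope (values are illustrative):

import pandas as pd

toy = pd.DataFrame({
    'Location': ['06000', '06000'],
    'Sector': ['111', '111'],
    'FlowAmount': [30.0, 70.0],
    'DataReliability': [3.0, 1.0],
})
# FlowAmount is summed (100) and DataReliability becomes the
# FlowAmount-weighted average: (30*3 + 70*1) / 100 = 1.6
agg = aggregator(toy, ['Location', 'Sector'])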
Example 9
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links
    to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
                          modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland data
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) ==
                         '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year, flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) ==
                        '112']
        # drop rows containing '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=[
            'Location_tmp', sector_column +
            '_x', 'Location_y', 'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
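
An illustrative call, as this function would typically be invoked from a CoA Cropland FBS method (argument values are placeholders):

fba_w_sector = disaggregate_pastureland(
    fba_w_sector, attr, method, year='2017',
    sector_column='SectorConsumedBy')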
Example 10
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:

        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')
        # load bea codes that sub for naics
        bea = return_bea_codes_used_as_naics()
        # replace sector column and helperflow value if the sector column to
        # merge is in the bea list to prevent dropped data
        modified_fba_allocation['Sector'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation[sector_col_to_merge],
                     modified_fba_allocation['Sector'])
        modified_fba_allocation['HelperFlow'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation['FlowAmount'],
                     modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with 0 so that, after
        # multiplying, no incorrect value is associated with the new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                           modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation
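
A sketch of the activity-set attribute keys read above; the key names come from the function body, while the values shown are illustrative:

attr = {
    'helper_source': 'BLS_QCEW',              # secondary FBA used as helper
    'helper_source_class': 'Employment',
    'helper_source_year': 2015,
    'helper_from_scale': 'state',
    'helper_flow': ['Number of employees'],   # optional subset of flows
    'helper_method': 'proportional',          # or 'multiplication' /
                                              # 'proportional-flagged';
                                              # may also include 'scaled'
    'allocation_from_scale': 'county',
}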
Example 11
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine rows of data that would be lost if the data were subset at
    the target sector level.
    Equally allocate parent NAICS to child NAICS where child NAICS are missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull())
                  & (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            vLogDetailed.warning(
                'Data found at %s digit NAICS not represented in current '
                'data subset: {}'.format(' '.join(map(str, rl_list))), str(i))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to each %s associated with '
            'the sectors previously dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
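
As used in the cropland disaggregation further below, the call equally allocates any parent NAICS remaining above the target level, e.g.:

crop = allocate_dropped_sector_data(crop, 'NAICS_6')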
Example 12
def estimate_suppressed_data(df, sector_column, naics_level, sourcename):
    """
    Estimate data suppression, by equally allocating parent NAICS values to child NAICS
    :param df: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param naics_level: numeric, NAICS length on which to base the
                        estimated suppressed data (2 - 5)
    :param sourcename: str, sourcename
    :return: df, with estimated suppressed data
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    # find the longest length sector
    max_length = max(df[sector_column].apply(lambda x: len(str(x))).unique())
    # loop through starting at naics_level, use most detailed level possible to save time
    for i in range(naics_level, max_length):
        # create df of i length
        df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # create df of i + 1 length
        df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)]
        # create temp sector columns in df y, that are i digits in length
        df_y = df_y.assign(s_tmp=df_y[sector_column].apply(lambda x: x[0:i]))

        # create list of location and temp activity combos that contain a 0
        missing_sectors_df = df_y[df_y['FlowAmount'] == 0]
        missing_sectors_list = missing_sectors_df[['Location',
                                                   's_tmp']].drop_duplicates().values.tolist()
        # subset the y df
        if len(missing_sectors_list) != 0:
            # build a new df of sectors that share a parent with the
            # missing (zero-value) sectors, then set subsetting conditions
            suppressed_list = []
            for q, r in missing_sectors_list:
                c1 = df_y['Location'] == q
                c2 = df_y['s_tmp'] == r
                # subset data
                suppressed_list.append(df_y.loc[c1 & c2])
            suppressed_sectors = pd.concat(suppressed_list, sort=False, ignore_index=True)
            # add column of existing allocated data for length of i
            suppressed_sectors['alloc_flow'] =\
                suppressed_sectors.groupby(['Location', 's_tmp'])['FlowAmount'].transform('sum')
            # subset further so only keep rows of 0 value
            suppressed_sectors_sub = suppressed_sectors[suppressed_sectors['FlowAmount'] == 0]
            # add count
            suppressed_sectors_sub = \
                suppressed_sectors_sub.assign(sector_count=
                                              suppressed_sectors_sub.groupby(
                                                  ['Location', 's_tmp']
                                              )['s_tmp'].transform('count'))

            # merge suppressed sector subset with df x
            df_m = pd.merge(df_x,
                            suppressed_sectors_sub[['Class', 'Compartment', 'FlowType',
                                                    'FlowName', 'Location', 'LocationSystem',
                                                    'Unit', 'Year', sector_column, 's_tmp',
                                                    'alloc_flow', 'sector_count']],
                            left_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                     'Location', 'LocationSystem', 'Unit', 'Year', sector_column],
                            right_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                      'Location', 'LocationSystem', 'Unit', 'Year', 's_tmp'],
                            how='right')
            # drop any rows where flowamount is none
            df_m = df_m[~df_m['FlowAmount'].isna()]
            # calculate estimated flows by subtracting the flow
            # amount already allocated from total flow of
            # sector one level up and divide by number of sectors with suppressed data
            df_m.loc[:, 'FlowAmount'] = \
                (df_m['FlowAmount'] - df_m['alloc_flow']) / df_m['sector_count']
            # only keep the suppressed sector subset activity columns
            df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp', 'alloc_flow', 'sector_count'])
            df_m = df_m.rename(columns={sector_column + '_y': sector_column})
            # reset activity columns
            if load_source_catalog()[sourcename]['sector-like_activities']:
                df_m = df_m.assign(ActivityProducedBy=df_m['SectorProducedBy'])
                df_m = df_m.assign(ActivityConsumedBy=df_m['SectorConsumedBy'])

            # drop the existing rows with suppressed data and append the new estimates from fba df
            modified_df =\
                pd.merge(df, df_m[['FlowName', 'Location', sector_column]],
                         indicator=True,
                         how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
            df = pd.concat([modified_df, df_m], ignore_index=True)
    df_w_estimated_data = replace_strings_with_NoneType(df)

    return df_w_estimated_data
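
As called in the pastureland and cropland examples in this set, suppressed county values are estimated by equally allocating 3-digit parent NAICS to their children:

df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                'USDA_CoA_Cropland_NAICS')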
Example 13
def equally_allocate_suppressed_parent_to_child_naics(df_load, sector_column,
                                                      groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop them from the df and
    # add them back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols if e not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(id_vars=["NAICS_6"],
                           var_name="NAICS_Length",
                           value_name="NAICS_Match").drop(
                               columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup,
                         cw_melt,
                         how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols],
                   df,
                   how='left',
                   on=groupcols,
                   indicator=True).query('_merge=="left_only"').drop('_merge',
                                                                     axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)
    # add length column and subset the data
    # subtract out existing data at NAICS6 from total data
    # at a length where no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # for each flow and state, add a column with the longest sector
    # length that has no suppressed (missing) values
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment',
         'Location'])['secLength'].agg(lambda x: x.min() - 1).reset_index(
             name='secLengthsup'))

    # merge the dfs, keeping the last sector length with complete data
    # for each state; drop states that don't have suppressed data
    df1 = df.merge(df_sup2)

    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) /
                     dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are sector-like, set column values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff
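
An illustrative call, assuming the FBA-with-sector default grouping fields referenced elsewhere in these examples:

df_est = equally_allocate_suppressed_parent_to_child_naics(
    df_w_sec, 'SectorConsumedBy', fba_wsec_default_grouping_fields)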
Example 14
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and published
    national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
           to subset national level data
    :return: df, comparing published national level data to df subset
    """

    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) | (
            nat[fba_activity_fields[1]].isin(activity_names))].reset_index(
                drop=True)
        nat = replace_strings_with_NoneType(nat)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some
        # strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = [
            'Class', 'SourceName', 'FlowName', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Compartment',
            'Location', 'LocationSystem', 'Year'
        ]
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(FlowAmount_diff=df_m['FlowAmount_nat'] -
                           df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(
            abs(df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(drop=True)

        if len(df_m_sub) == 0:
            vLog.info(
                'No data loss greater than 1%% between national '
                'level data and %s subset', attr['allocation_from_scale'])
        else:
            vLog.info(
                'There are data differences between published national'
                ' values and %s subset, saving to validation log',
                attr['allocation_from_scale'])

            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated data '
                'subset for %s: \n {}'.format(df_m_sub.to_string()),
                activity_set)
Example 15
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the master crosswalk, subset by sectorsourcename,
    # and save the values to a list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug(
            'Checking if sectors represent a different '
            'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [
            e for e in df.columns.values.tolist()
            if e not in possible_column_headers
        ]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull())
                     & (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
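
For example, sector-like activities reported against an older NAICS vintage can be converted to the 2012 codes (the target name comes from the docstring above):

df = replace_naics_w_naics_from_another_year(df, 'NAICS_2012_Code')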
Example 16
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
                          df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3])
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year,
                                          flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # loop through naics lengths to determine which 4- and 5-digit naics to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if there are duplicate Location/Sector_tmp combinations, drop all such rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(lambda x:
                                                   len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 sector_column, 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x:
                                                           x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
Example 17
def determine_flows_requiring_disaggregation(df_load, attr, method,
                                             sector_column):
    """
    The MECS Land data provides FlowAmounts for NAICS3-6. BLS QCEW
    employment data are used to determine land use for different industries.
    To accurately estimate land use per industry, FlowAmounts already
    reported at a detailed NAICS level (for example, NAICS6) should be
    subtracted from the FlowAmounts available to the other NAICS6 that share
    the first five digits. For example, the 2014 dataset contains data for
    '311', '3112', and '311221'. FlowAmounts for allocation by employment to
    NAICS6 are based on the provided '3112' FlowAmounts. However, because
    data already exist for one NAICS6 (311221), that FlowAmount should be
    subtracted before allocation so the remaining 'FlowAmount' accurately
    reflects what still requires a secondary source (employment data).
    :param df_load: df, EIA MECS Land FBA
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param sector_column: str, sector column to flag ('SectorProducedBy',
        'SectorConsumedBy')
    :return: df with a 'disaggregate_flag' column; '1' means the row requires
        a secondary source to calculate FlowAmount, '0' means the FlowAmount
        does not require modification
    """

    from flowsa.sectormapping import add_sectors_to_flowbyactivity

    df_load = replace_NoneType_with_empty_cells(df_load)
    # drop rows where there is no value in sector column, which might occur if
    # sector-like activities have a "-" in them
    df_load = df_load[df_load[sector_column] != '']

    # determine activity column
    if sector_column == 'SectorConsumedBy':
        activity_column = 'ActivityConsumedBy'
    else:
        activity_column = 'ActivityProducedBy'

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', sector_column]]
    # min and max length
    min_length = min(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    max_length = max(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    # subset by sector length, creating a df
    for s in range(min_length, max_length + 1):
        df_name = 'dfo_naics' + str(s)
        vars()[df_name] = dfo[dfo[sector_column].apply(
            lambda x: len(x) == s)].reset_index(drop=True)
        vars()[df_name] = vars()[df_name].assign(SectorMatch=vars(
        )[df_name][sector_column].apply(lambda x: x[:len(x) - 1]))
    # loop through the dfs, merging by sector match. If there is a match,
    # subtract the value, if there is not a match, drop last digit in
    # sectormatch, add row to the next df, and repeat
    df_merged = pd.DataFrame()
    df_not_merged = pd.DataFrame()
    for s in range(max_length, min_length, -1):
        df_name_1 = 'dfo_naics' + str(s - 1)
        df_name_2 = 'dfo_naics' + str(s)
        # concat df 1 with df_not_merged
        df2 = pd.concat([vars()[df_name_2], df_not_merged])
        df2 = df2.rename(columns={
            'FlowAmount': 'SubtractFlow',
            sector_column: 'Sector'
        })
        df_m = pd.merge(
            vars()[df_name_1][['FlowAmount', 'Location', sector_column]],
            df2,
            left_on=['Location', sector_column],
            right_on=['Location', 'SectorMatch'],
            indicator=True,
            how='outer')
        # subset by merge and append to appropriate df
        df_both = df_m[df_m['_merge'] == 'both']
        if len(df_both) != 0:
            # drop columns
            df_both1 = df_both.drop(
                columns=['Sector', 'SectorMatch', '_merge'])
            # aggregate before subtracting
            df_both2 = df_both1.groupby(
                ['FlowAmount', 'Location', sector_column],
                as_index=False).agg({"SubtractFlow": sum})
            df_both3 = df_both2.assign(FlowAmount=df_both2['FlowAmount'] -
                                       df_both2['SubtractFlow'])
            df_both3 = df_both3.drop(columns=['SubtractFlow'])
            # drop rows where 0
            # df_both = df_both[df_both['FlowAmount'] != 0]
            df_merged = pd.concat([df_merged, df_both3], ignore_index=True)
        df_right = df_m[df_m['_merge'] == 'right_only']
        if len(df_right) != 0:
            df_right = df_right.drop(
                columns=['FlowAmount', sector_column, '_merge'])
            df_right = df_right.rename(columns={
                'SubtractFlow': 'FlowAmount',
                'Sector': sector_column
            })
            # remove another digit from Sectormatch
            df_right = df_right.assign(SectorMatch=df_right[sector_column].
                                       apply(lambda x: x[:(s - 2)]))
            # reorder
            df_right = df_right[[
                'FlowAmount', 'Location', sector_column, 'SectorMatch'
            ]]
            df_not_merged = pd.concat([df_not_merged, df_right],
                                      ignore_index=True)
    # rename the flowamount column
    df_merged = df_merged.rename(columns={
        'FlowAmount': 'FlowAmountNew',
        sector_column: activity_column
    })
    # In the original EIA MECS df, some of the NAICS 6-digit codes sum
    # to a value greater than published NAICS3, due to rounding. In these
    # cases, the new FlowAmount is a negative number. Reset neg numbers to 0
    df_merged.loc[df_merged['FlowAmountNew'] < 0, 'FlowAmountNew'] = 0
    # in the original df, drop sector columns and re-add sectors, this time
    # with overwrite_sectorlevel = 'aggregated'
    dfn = df_load.drop(columns=[
        'SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
        'ConsumedBySectorType', 'SectorSourceName'
    ])
    dfn = add_sectors_to_flowbyactivity(
        dfn,
        sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add column noting that these columns require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)
    # create lists of sectors to drop
    list_original = df_load[activity_column].drop_duplicates().tolist()
    # drop values in original df
    dfn2 = dfn[~dfn[sector_column].isin(list_original)].sort_values(
        [activity_column, sector_column]).reset_index(drop=True)
    # drop the sectors that are duplicated by different naics being
    # mapped to naics6
    if len(dfn2[dfn2.duplicated(subset=['Location', sector_column],
                                keep=False)]) > 0:
        dfn2.drop_duplicates(subset=['Location', sector_column],
                             keep='last',
                             inplace=True)
    # want to allocate at NAICS6, so drop all other sectors
    dfn2 = \
        dfn2[dfn2[sector_column].apply(lambda x: len(x) == 6)].reset_index(
            drop=True).sort_values([sector_column])

    # merge revised flowamounts back with modified original df
    df_to_allocate = dfn2.merge(df_merged, how='left')
    # replace FlowAmount with newly calculated FlowAmount,
    # which represents Flows that are currently unaccounted for at NAICS6
    df_to_allocate['FlowAmount'] = np.where(
        df_to_allocate['FlowAmountNew'].notnull(),
        df_to_allocate['FlowAmountNew'], df_to_allocate['FlowAmount'])
    # drop rows where flow amount = 0 - flows are captured through other NAICS6
    df_to_allocate2 = df_to_allocate[df_to_allocate['FlowAmount'] != 0].drop(
        columns='FlowAmountNew').reset_index(drop=True)

    # merge the original df with modified
    # add column to original df for disaggregate_flag
    df_load = df_load.assign(disaggregate_flag=0)

    # concat the two dfs and sort
    df_c = pd.concat([df_load, df_to_allocate2],
                     ignore_index=True).sort_values([sector_column
                                                     ]).reset_index(drop=True)

    df_c = replace_strings_with_NoneType(df_c).sort_values([sector_column])

    return df_c
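The core of the function above is subtracting FlowAmounts already reported at a detailed NAICS from the parent so only the unaccounted-for remainder is allocated with employment data. A minimal, standalone pandas sketch of that subtraction follows; the sector codes and amounts are hypothetical and it is not flowsa code.

# Illustrative sketch (hypothetical values): subtract a reported NAICS6
# FlowAmount from its parent so only the remainder is disaggregated.
import pandas as pd

parent = pd.DataFrame({'Location': ['00000'], 'Sector': ['31122'],
                       'FlowAmount': [500.0]})
reported_child = pd.DataFrame({'Location': ['00000'],
                               'SectorMatch': ['31122'],
                               'SubtractFlow': [120.0]})
m = parent.merge(reported_child, left_on=['Location', 'Sector'],
                 right_on=['Location', 'SectorMatch'], how='left')
m['SubtractFlow'] = m['SubtractFlow'].fillna(0)
# remainder to allocate across the other NAICS6 children; negatives caused
# by rounding are reset to zero, as in the function above
m['FlowAmountNew'] = (m['FlowAmount'] - m['SubtractFlow']).clip(lower=0)
print(m[['Location', 'Sector', 'FlowAmountNew']])  # 380.0 remains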
Exemplo n.º 18
0
def proportional_allocation_by_location_and_activity(df_load, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated
    sector within a location
    :param df_load: df with sector columns
    :param sectorcolumn: str, sector column for which to create
         allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """

    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # want to create denominator based on shortest length naics for each
    # activity/location
    grouping_cols = [
        e for e in [
            'FlowName', 'Location', 'Activity', 'ActivityConsumedBy',
            'ActivityProducedBy', 'Class', 'SourceName', 'Unit', 'FlowType',
            'Compartment', 'Year'
        ] if e in df.columns.values.tolist()
    ]
    activity_cols = [
        e for e in ['Activity', 'ActivityConsumedBy', 'ActivityProducedBy']
        if e in df.columns.values.tolist()
    ]
    # trim whitespace
    df[sectorcolumn] = df[sectorcolumn].str.strip()
    # to create the denominator dataframe first add a column that captures
    # the sector length
    denom_df = df.assign(sLen=df[sectorcolumn].str.len())
    denom_df = denom_df[denom_df['sLen'] == denom_df.groupby(activity_cols)
                        ['sLen'].transform(min)].drop(columns='sLen')
    denom_df.loc[:, 'Denominator'] = \
        denom_df.groupby(grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers, that if exist in df, should be aggregated
    # using the weighted avg fxn
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers
        if e in denom_df.columns.values.tolist()
    ]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = \
        denom_df[column_headers].drop_duplicates().reset_index(drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index(
        drop=True)
    # where parent NAICS are not found in the allocation dataset, make sure
    # those child NAICS are not dropped
    allocation_df['FlowAmountRatio'] = \
        allocation_df['FlowAmountRatio'].fillna(1)
    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
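The allocation ratio above is simply a helper flow divided by the sum of helper flows within a location/activity group. The following standalone sketch shows that grouping pattern with hypothetical columns and values; note the flowsa function additionally restricts the denominator to the shortest sector length per activity, which this sketch omits.

# Illustrative sketch (hypothetical helper data): build allocation ratios
# from a helper flow, grouped by location and activity.
import pandas as pd

df = pd.DataFrame({
    'Location': ['00000'] * 3,
    'ActivityProducedBy': ['A'] * 3,
    'Sector': ['11111', '11112', '11121'],
    'HelperFlow': [30.0, 10.0, 60.0],
})
df['Denominator'] = df.groupby(
    ['Location', 'ActivityProducedBy'])['HelperFlow'].transform('sum')
df['FlowAmountRatio'] = df['HelperFlow'] / df['Denominator']
print(df.drop(columns='Denominator'))  # ratios 0.3, 0.1, 0.6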
Exemplo n.º 19
0
def calculate_flowamount_diff_between_dfs(dfa_load, dfb_load):
    """
    Calculate the differences in FlowAmounts between two dfs
    :param dfa_load: df, initial df
    :param dfb_load: df, modified df
    :return: df, comparing changes in flowamounts between 2 dfs
    """

    # subset the dataframes, only keeping data for easy
    # comparison of flowamounts
    drop_cols = [
        'Year', 'MeasureofSpread', 'Spread', 'DistributionType', 'Min', 'Max',
        'DataReliability', 'DataCollection'
    ]
    # drop cols and rename, ignore error if a df does not
    # contain a column to drop
    dfa = dfa_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Original'})
    dfb = dfb_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Modified'})
    # create df dict for modified dfs created in for loop
    df_list = []
    for d in ['a', 'b']:
        df_name = f'df{d}'
        # assign new column of geoscale by which to aggregate
        vars()[df_name + '2'] = vars()[df_name].assign(geoscale=np.where(
            vars()[df_name]['Location'].apply(lambda x: x.endswith('000')),
            'state', 'county'))
        vars()[df_name + '2'] = vars()[df_name + '2'].assign(
            geoscale=np.where(vars()[df_name +
                                     '2']['Location'] == '00000', 'national',
                              vars()[df_name + '2']['geoscale']))
        # ensure all nan/nones filled/match
        vars()[df_name + '2'] = \
            replace_strings_with_NoneType(vars()[df_name+'2'])
        df_list.append(vars()[df_name + '2'])
    # merge the two dataframes
    df = df_list[0].merge(df_list[1], how='outer')

    # determine if any new data is negative
    dfn = df[df['FlowAmount_Modified'] < 0].reset_index(drop=True)
    if len(dfn) > 0:
        vLog.info('There are negative FlowAmounts in new dataframe, '
                  'see Validation Log')
        vLogDetailed.info('Negative FlowAmounts in new dataframe: '
                          '\n {}'.format(dfn.to_string()))

    # Because code will sometimes change terminology, aggregate
    # data by context and flowable to compare df differences
    # subset df
    dfs = df[[
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowAmount_Original', 'FlowAmount_Modified', 'Unit', 'geoscale'
    ]]
    agg_cols = [
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'Unit', 'geoscale'
    ]
    dfagg = dfs.groupby(agg_cols, dropna=False, as_index=False).agg({
        'FlowAmount_Original':
        sum,
        'FlowAmount_Modified':
        sum
    })
    # column calculating difference
    dfagg['FlowAmount_Difference'] = \
        dfagg['FlowAmount_Modified'] - dfagg['FlowAmount_Original']
    dfagg['Percent_Difference'] = (dfagg['FlowAmount_Difference'] /
                                   dfagg['FlowAmount_Original']) * 100
    # drop rows where difference = 0
    dfagg2 = dfagg[dfagg['FlowAmount_Difference'] != 0].reset_index(drop=True)
    if len(dfagg2) == 0:
        vLogDetailed.info('No FlowAmount differences')
    else:
        # subset df and aggregate, also print out the total
        # aggregate diff at the geoscale
        dfagg3 = replace_strings_with_NoneType(dfagg).drop(columns=[
            'ActivityProducedBy', 'ActivityConsumedBy',
            'FlowAmount_Difference', 'Percent_Difference'
        ])
        dfagg4 = dfagg3.groupby(['Flowable', 'Context', 'Unit', 'geoscale'],
                                dropna=False,
                                as_index=False).agg({
                                    'FlowAmount_Original': sum,
                                    'FlowAmount_Modified': sum
                                })
        # column calculating difference
        dfagg4['FlowAmount_Difference'] = \
            dfagg4['FlowAmount_Modified'] - dfagg4['FlowAmount_Original']
        dfagg4['Percent_Difference'] = (dfagg4['FlowAmount_Difference'] /
                                        dfagg4['FlowAmount_Original']) * 100
        # drop rows where difference = 0
        dfagg5 = dfagg4[dfagg4['FlowAmount_Difference'] != 0].reset_index(
            drop=True)
        vLogDetailed.info('Total FlowAmount differences between dataframes: '
                          '\n {}'.format(dfagg5.to_string(index=False)))

        # save detail output in log file
        vLogDetailed.info('Total FlowAmount differences by Activity Columns: '
                          '\n {}'.format(dfagg2.to_string(index=False)))
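The comparison above boils down to an outer merge of the original and modified flows followed by difference and percent-difference columns. A minimal, standalone sketch of that calculation is below; it is not flowsa code and the flowables and amounts are hypothetical.

# Illustrative sketch (hypothetical values): compare FlowAmounts before and
# after a modification and report percent differences.
import pandas as pd

orig = pd.DataFrame({'Flowable': ['Water', 'Land'],
                     'FlowAmount_Original': [100.0, 40.0]})
mod = pd.DataFrame({'Flowable': ['Water', 'Land'],
                    'FlowAmount_Modified': [90.0, 40.0]})
diff = orig.merge(mod, on='Flowable', how='outer')
diff['FlowAmount_Difference'] = (diff['FlowAmount_Modified'] -
                                 diff['FlowAmount_Original'])
diff['Percent_Difference'] = (diff['FlowAmount_Difference'] /
                              diff['FlowAmount_Original']) * 100
# only rows that actually changed are reported
print(diff[diff['FlowAmount_Difference'] != 0])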
Exemplo n.º 20
0
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data allocated equally to
        target_sector_level child NAICS
    """

    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
                rows_lost = pd.concat([rows_lost, df_x],
                                      ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(drop=True, inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
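The equal allocation step above counts how many target-level child NAICS map to each parent and divides the parent FlowAmount by that count. Here is a small standalone sketch with a hypothetical crosswalk and amounts; it is not flowsa code.

# Illustrative sketch (hypothetical crosswalk): split a parent FlowAmount
# equally across its child NAICS at the target sector level.
import pandas as pd

parent = pd.DataFrame({'SectorProducedBy': ['3112'], 'FlowAmount': [90.0]})
cw = pd.DataFrame({'NAICS_4': ['3112', '3112', '3112'],
                   'NAICS_6': ['311211', '311212', '311221']})
cw['sector_count'] = cw.groupby('NAICS_4')['NAICS_4'].transform('count')
out = parent.merge(cw, left_on='SectorProducedBy', right_on='NAICS_4')
out['SectorProducedBy'] = out['NAICS_6']
out['FlowAmount'] = out['FlowAmount'] / out['sector_count']
print(out[['SectorProducedBy', 'FlowAmount']])  # 30.0 for each NAICS6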
Exemplo n.º 21
0
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not, sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """

    # determine if activities are sector-like, if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        # for s in pd.unique(flowbyactivity_df['SourceName']):
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are sector-like, drop from df and group_cols,
    # then add back in later as copies of the sector columns
    if sector_like_activities:
        group_cols = [e for e in group_cols if e not in
                      ('ActivityProducedBy', 'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns if e not in
                   ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0],
             fbs_activity_fields[1]]].drop_duplicates().reset_index(drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2], sort=False)
        existing_sectors = df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = [e for e in sector_list if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows
            # set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = agg_sectors[fbs_activity_fields[0]].apply(
                lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = agg_sectors[fbs_activity_fields[1]].apply(
                lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector
    df.loc[:, 'SectorConsumedBy'] = np.where(df['SectorConsumedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorConsumedBy'])  # domestic/household
    df.loc[:, 'SectorProducedBy'] = np.where(df['SectorProducedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorProducedBy'])  # domestic/household
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are sector-like, set col values as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
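The aggregation loop above sums more detailed sectors up to a shorter NAICS length whenever that length is missing. A minimal, standalone pandas sketch of the truncate-and-sum pattern is below; the sector codes and amounts are hypothetical and it is not flowsa code.

# Illustrative sketch (hypothetical values): sum detailed sectors up to a
# shorter NAICS length by truncating the code and grouping.
import pandas as pd

df = pd.DataFrame({
    'Location': ['00000'] * 3,
    'SectorProducedBy': ['11111', '11112', '11121'],
    'FlowAmount': [5.0, 3.0, 2.0],
})
i = 4  # target sector length
agg = df.assign(SectorProducedBy=df['SectorProducedBy'].str[:i])
agg = agg.groupby(['Location', 'SectorProducedBy'],
                  as_index=False)['FlowAmount'].sum()
print(agg)  # 1111 -> 8.0, 1112 -> 2.0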
Exemplo n.º 22
0
def sector_disaggregation(df):
    """
    Function to disaggregate sectors when there is only one NAICS at a more
    detailed level; also applies to sectors shorter than NAICS 4
    :param df: df, a FBS df that must have sector columns
    :return: df, a FBS df with values for the missing NAICS5 and NAICS6
    """

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df)

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()

    # for loop min length to 6 digits, where min length cannot be less than 2
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=
                                     df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=
                                     df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3], sort=False)
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        # list of column headers, that if exist in df, should be
        # aggregated using the weighted avg fxn
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers if e
                        in df_subset.columns.values.tolist()]

        df_subset = df_subset.drop_duplicates(subset=cols_to_drop,
                                              keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy",
                                            "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # append new naics to df
        new_naics['SectorConsumedBy'] = new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True)
    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    return df
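The disaggregation above relies on the sector-length crosswalk: when a sector has exactly one child at the next NAICS level, the FlowAmount can be carried down unchanged. The standalone sketch below shows that single-child filter with a hypothetical crosswalk; it is not flowsa code.

# Illustrative sketch (hypothetical crosswalk): copy a FlowAmount to the
# next NAICS level when a sector has exactly one child.
import pandas as pd

cw = pd.DataFrame({'NAICS_5': ['11111', '11112', '11112'],
                   'NAICS_6': ['111110', '111120', '111125']})
# keep only parents with a single child at the next level
single = cw.drop_duplicates(subset=['NAICS_5'], keep=False)
df = pd.DataFrame({'SectorProducedBy': ['11111', '11112'],
                   'FlowAmount': [7.0, 4.0]})
new = df.merge(single, left_on='SectorProducedBy', right_on='NAICS_5',
               how='inner')
new['SectorProducedBy'] = new['NAICS_6']
print(new[['SectorProducedBy', 'FlowAmount']])  # only 111110 is added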
Exemplo n.º 23
0
def update_naics_crosswalk():
    """
    Update the useeior master crosswalk with the crosswalks created for
    flowsa datasets; add any NAICS codes longer than 6 digits.

    Also add NAICS 2002 codes.
    :return: None; writes NAICS_Crosswalk.csv to the flowsa datapath
    """

    # read useeior master crosswalk, subset NAICS columns
    naics_load = import_useeior_mastercrosswalk()
    naics = naics_load[[
        'NAICS_2007_Code', 'NAICS_2012_Code', 'NAICS_2017_Code'
    ]].drop_duplicates().reset_index(drop=True)
    # convert all rows to string
    naics = naics.astype(str)
    # ensure all None are NoneType
    naics = replace_strings_with_NoneType(naics)
    # drop rows where all None
    naics = naics.dropna(how='all')

    # drop naics > 6 in mastercrosswalk (all manufacturing) because unused and slows functions
    naics = naics[naics['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)

    # find any NAICS where length > 6 that are used for allocation purposes and add to naics list
    missing_naics_df_list = []
    # read in all the crosswalk csv files (ends in toNAICS.csv)
    for file_name in glob.glob(datapath + "activitytosectormapping/" +
                               '*_toNAICS.csv'):
        # skip Statistics Canada GDP because not all sectors relevant
        if file_name != crosswalkpath + 'Crosswalk_StatCan_GDP_toNAICS.csv':
            df = pd.read_csv(file_name, low_memory=False, dtype=str)
            # convert all rows to string
            df = df.astype(str)
            # determine sector year
            naics_year = df['SectorSourceName'].all()
            # subset dataframe so only sector
            df = df[['Sector']]
            # trim whitespace and cast as string, rename column
            df['Sector'] = df['Sector'].astype(str).str.strip()
            df = df.rename(columns={'Sector': naics_year})
            # extract sector year column from master crosswalk
            df_naics = naics[[naics_year]]
            # find any NAICS that are in source crosswalk but not in mastercrosswalk
            common = df.merge(df_naics, on=[naics_year, naics_year])
            missing_naics = df[(~df[naics_year].isin(common[naics_year]))]
            # extract sectors where len > 6 and that does not include a '-'
            missing_naics = missing_naics[missing_naics[naics_year].apply(
                lambda x: len(x) > 6)]
            if len(missing_naics) != 0:
                missing_naics = missing_naics[~missing_naics[naics_year].str.
                                              contains('-')]
                # append to df list
                missing_naics_df_list.append(missing_naics)
    # concat df list and drop duplications
    missing_naics_df = \
        pd.concat(missing_naics_df_list,
                  ignore_index=True, sort=False).drop_duplicates().reset_index(drop=True)
    # sort df
    missing_naics_df = missing_naics_df.sort_values(['NAICS_2012_Code'])
    missing_naics_df = missing_naics_df.reset_index(drop=True)

    # add missing naics to master naics crosswalk
    total_naics = pd.concat([naics, missing_naics_df], ignore_index=True)

    # sort df
    total_naics = total_naics.sort_values(
        ['NAICS_2012_Code', 'NAICS_2007_Code']).drop_duplicates()
    total_naics = \
        total_naics[~total_naics['NAICS_2012_Code'].isin(['None', 'unknown', 'nan',
                                                          'Unknown', np.nan]
                                                         )].reset_index(drop=True)

    # convert all columns to string
    total_naics = total_naics.astype(str)

    # add naics 2002
    naics_02 = load_naics_02_to_07_crosswalk()
    naics_cw = pd.merge(total_naics, naics_02, how='left')

    # ensure NoneType
    naics_cw = replace_strings_with_NoneType(naics_cw)

    # reorder
    naics_cw = naics_cw[[
        'NAICS_2002_Code', 'NAICS_2007_Code', 'NAICS_2012_Code',
        'NAICS_2017_Code'
    ]]

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_Crosswalk.csv", index=False)
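The key step above is an anti-join: sectors present in a source crosswalk but absent from the master crosswalk are collected, keeping only codes longer than six digits. A small standalone sketch of that filter follows; the codes are hypothetical and it is not flowsa code.

# Illustrative sketch (hypothetical codes): find sectors present in a
# source crosswalk but absent from the master crosswalk (an anti-join).
import pandas as pd

source = pd.DataFrame({'NAICS_2012_Code': ['111110', '1111100', '3112']})
master = pd.DataFrame({'NAICS_2012_Code': ['111110', '3112']})
missing = source[~source['NAICS_2012_Code'].isin(master['NAICS_2012_Code'])]
# keep only codes longer than 6 digits, as in the function above
missing = missing[missing['NAICS_2012_Code'].str.len() > 6]
print(missing)  # only 1111100 would be appended to the master crosswalk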
Exemplo n.º 24
0
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
       a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop from df and group_cols,
    # then add back in later as copies of the sector columns
    if sector_like_activities:
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]
                 ]].apply(lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length, 2, -1):
        # df where either sector column is length i, or both columns are
        df1 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))]

        # add new columns dropping last digit of sectors
        df1 = df1.assign(
            SPB=df1['SectorProducedBy'].apply(lambda x: x[0:i - 1]))
        df1 = df1.assign(
            SCB=df1['SectorConsumedBy'].apply(lambda x: x[0:i - 1]))

        # second dataframe where sector length is i - 1
        df2 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                    )].rename(columns={
                        'SectorProducedBy': 'SPB',
                        'SectorConsumedBy': 'SCB'
                    })

        # merge the dfs
        merge_cols = [col for col in df2.columns if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [
            c for c in merge_cols if c not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

        if len(df2) > 0:
            dfm = df1.merge(df2[merge_cols],
                            how='outer',
                            on=merge_cols,
                            indicator=True).query('_merge=="left_only"').drop(
                                '_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SCB and SPB columns then aggregate and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
    df = df.drop_duplicates()

    # if activities are sector-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)

    return df
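This revised version of the function finds missing parent sectors with a merge indicator and keeps only the 'left_only' rows, i.e. shortened sectors that do not already exist in the dataframe. The standalone sketch below shows that indicator-based anti-join with hypothetical values; it is not flowsa code.

# Illustrative sketch (hypothetical values): use a merge indicator to keep
# rows whose shortened sector is not already present, mirroring the
# query('_merge=="left_only"') step above.
import pandas as pd

detail = pd.DataFrame({'SPB': ['1111', '1112'], 'FlowAmount': [8.0, 2.0]})
existing = pd.DataFrame({'SPB': ['1112']})
dfm = detail.merge(existing, on='SPB', how='outer', indicator=True)
dfm = dfm.query('_merge == "left_only"').drop('_merge', axis=1)
print(dfm)  # only the 1111 row remains; it would be aggregated and appended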