Exemplo n.º 1
0
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Aggregate a StEWI inventory dataframe and format it as a flowbysector.

    The caller's dataframe is left untouched; all filtering and type
    conversion is done on a new dataframe.

    :param df: inventory dataframe; the code reads the columns 'Location',
        'NAICS_lvl', 'FlowName', 'Compartment', 'FlowAmount', 'Year', 'Unit'
        and 'DataReliability' ('MetaSources' is used for grouping if present)
    :param inventory_dict: dictionary whose keys are passed to
        map_elementary_flows as the mapping sources and whose first value is
        passed to assign_fips_location_system
        (presumably {inventory name: year} - verify against caller)
    :param NAICS_level: passed to check_for_missing_sector_data
    :param geo_scale: passed to update_geoscale
    :return: dataframe in flowbysector format
    """
    from stewi.globals import weighted_average

    # update location to appropriate geoscale prior to aggregating; rows
    # without a location are dropped. dropna/assign return new dataframes,
    # so the dataframe passed in by the caller is not modified in place
    df = df.dropna(subset=['Location'])
    df = df.assign(Location=df['Location'].astype(str))
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score, weighted by FlowAmount within each group
    # (assigned while fbs is still indexed by the grouping variables)
    fbs['DataReliability'] = weighted_average(df, 'DataReliability',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, list(inventory_dict.keys()))

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these fields
    # may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    # only the first value of inventory_dict is used for the location system
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    fbs = check_for_missing_sector_data(fbs, NAICS_level)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure NAICS 2012 codes
    fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code')

    return fbs
Exemplo n.º 2
0
def main(method_name):
    """
    Creates a flowbysector dataset and stores it via store_flowbysector.

    For every source listed under 'source_names' in the method, the source
    dataframe is loaded. Sources with data_format 'FBA' are processed one
    activity set at a time (sectors attached, flows mapped, flow amounts
    allocated to sectors, aggregated geographically and by sector level);
    any other source is treated as already being in flowbysector format and
    is appended after cleaning. All results are concatenated, aggregated,
    sorted and stored.

    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: None; the final dataframe is passed to store_flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # k is the source name, v the settings for that source
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba, if specified in yaml; the function is looked up
            # by name in this module
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            # (aset_names is only defined when 'activity_set_file' is set and
            # is only read below under the same condition)
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities, taken from the activity set
                # file if one is given, otherwise from the yaml
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity (match on either activity field)
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr=attr)

                # map df to elementary flows; the mapping defaults to the
                # source name unless 'fedefl_mapping' is given in the yaml
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like, check that there is no data loss
                    if load_source_catalog(
                    )[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            # NOTE(review): the first clause (both fields
                            # equal n) is already covered by the two
                            # single-field clauses, so this is equivalent to
                            # "either activity field equals n"
                            fbs_subset = fbs[(
                                (fbs[fba_activity_fields[0]] == n) &
                                (fbs[fba_activity_fields[1]] == n)) |
                                             (fbs[fba_activity_fields[0]] == n)
                                             |
                                             (fbs[fba_activity_fields[1]] == n
                                              )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)

                # if allocation method for an activity set requires a specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info(
                        'Calling on function specified in method yaml to allocate '
                        + ', '.join(map(str, names)) + ' to sectors')
                    # the function receives the list of flowbysector
                    # dataframes created so far
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                                      flow_subset_mapped, attr, fbs_list)

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec,
                        k,
                        names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation help for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    # group on the default mapped fba fields, excluding the
                    # two activity columns
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [
                        e for e in group_cols
                        if e not in ('ActivityProducedBy',
                                     'ActivityConsumedBy')
                    ]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset,
                            k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            # tag the ratios with the activity they belong to
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    # NOTE(review): pd.concat raises ValueError on an empty
                    # list, which happens here if no activity in names had
                    # allocation data
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset, first on the
                    # "produced by" columns, then on the "consumed by"
                    # columns; the overlapping allocation columns receive the
                    # pandas default suffixes _x (first merge) and _y (second)
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns, preferring the
                    # "produced by" ratio and falling back to "consumed by"
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    # NOTE(review): reset_index() without drop=True keeps the
                    # old index as an 'index' column; presumably removed by
                    # clean_df further down - TODO confirm
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                # use whichever of the two scales has the lower value in
                # fips_number_key as the scale to aggregate from
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector length, if so,
                # allocate values equally to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                # fbs_1: both sector fields are in the target sector list
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                # fbs_2: first sector field in the list, second is null
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                # fbs_3: first sector field is null, second in the list
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull())
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns (ignored if they are not present)
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo,
                                                       fbs_sector_subset, aset,
                                                       k, method_name, attr,
                                                       method, mapping_files)

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # sort rows by sector, flowable and context
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
Exemplo n.º 3
0
            ]
        activities = aset_names[aset_names['activity_set'].isin(asets)]['name']
        test_fba = test_fba[test_fba['ActivityProducedBy'].isin(activities)]
    return test_fba

def get_fbs_subset(name):
    """
    Load a FlowBySector dataset through flowsa.

    :param name: passed unchanged to flowsa.getFlowBySector
    :return: the object returned by flowsa.getFlowBySector for name
    """
    return flowsa.getFlowBySector(name)
    

if __name__ == '__main__':
    # FlowByActivity side: load the subset, aggregate it to the national
    # level, keep the columns of interest and map to elementary flows
    fba = get_fba_subset(fba_source, fba_year, fba_class)
    fba = agg_by_geoscale(fba, fba_agg, 'national',
                          fba_default_grouping_fields)

    fba = fba[[
        'FlowName',
        'ActivityProducedBy',
        'FlowAmount',
        'Unit',
        'Compartment',
    ]]
    fba = map_elementary_flows(fba, 'NEI')
    # flowable x activity table of summed flow amounts, with margin totals
    fba_pivot = pd.pivot_table(
        fba,
        values='FlowAmount',
        index=['Flowable'],
        columns='ActivityProducedBy',
        aggfunc='sum',
        margins=True,
    ).reset_index()

    # FlowBySector side: load and keep the columns of interest
    fbs = get_fbs_subset(fbs_source)
    fbs = fbs[['Flowable', 'SectorProducedBy', 'FlowAmount']]
    # flowable x sector table of summed flow amounts, with margin totals
    fbs_pivot = pd.pivot_table(
        fbs,
        values='FlowAmount',
        index=['Flowable'],
        columns='SectorProducedBy',
        aggfunc='sum',
        margins=True,
    ).reset_index()

    # total flow amount per flowable for each dataset
    fba = (fba.groupby('Flowable')
           .agg({'FlowAmount': 'sum'})
           .rename(columns={'FlowAmount': 'FBA_amount'}))
    fbs = (fbs.groupby('Flowable')
           .agg({'FlowAmount': 'sum'})
           .rename(columns={'FlowAmount': 'FBS_amount'}))

    # side-by-side totals and the FBS/FBA ratio per flowable
    comparison = fba.merge(fbs, how='outer', on='Flowable')
    comparison['Ratio'] = comparison['FBS_amount'] / comparison['FBA_amount']
    
    
Exemplo n.º 4
0
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Compare the loaded flowbyactivity total with the final flowbysector
    output total, log the percent difference per context and save the
    comparison as a csv in the "FlowBySectorMethodAnalysis/" output folder.

    The merge/report/save step is best effort: any error raised there is
    logged (with traceback) and not propagated to the caller.

    :param fba_load: flowbyactivity dataframe as loaded (before sectors
        are attached)
    :param fbs_load: flowbysector dataframe created from fba_load
    :param activity_set: str, name of the activity set, used in log messages
        and in the csv file name
    :param source_name: str, key into the source catalog, also used in log
        messages and in the csv file name
    :param method_name: str, name of the method, used in the csv file name
    :param attr: dictionary of activity set settings; 'allocation_from_scale'
        is read
    :param method: dictionary of method settings; 'target_geoscale' is read
    :param mapping_files: passed to map_elementary_flows
    :return: None
    """

    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info(
        'Comparing loaded FlowByActivity FlowAmount total to subset FlowBySector FlowAmount total'
    )

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba, keeping rows without a mapping so they still count
    # towards the total
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2 (sector codes of length 2);
        # None values are replaced first so len() can be applied
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals (on the columns the two frames share)
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                          df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of distinct contexts, in order of first appearance; only the
        # first row of each context is reported below, so a context that
        # appears on several rows must not be reported more than once
        context_list = df_merge['Context'].drop_duplicates().to_list()

        # loop through the contexts and print results of comparison
        for i in context_list:
            df_merge_subset = df_merge[df_merge['Context'] == i].reset_index(
                drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # a positive difference means FBA total > FBS total
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector total FlowAmounts for '
            + activity_set + ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv",
                        index=False)

    except Exception:
        # the comparison is diagnostic only, so a failure must not abort the
        # flowbysector run; Exception (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit through, and the traceback is logged
        # so the cause is not lost
        log.exception(
            'Error occured when comparing total FlowAmounts for FlowByActivity and FlowBySector'
        )

    return None
Exemplo n.º 5
0
def main(method_name):
    """
    Creates a flowbysector dataset and stores it.

    For every source listed under 'source_names' in the method, the source
    dataframe is loaded. Sources whose 'data_format' is 'FBA' are processed
    one activity set at a time (geographic aggregation, sector assignment,
    flow mapping, optional allocation to sectors via an allocation dataset,
    sector aggregation and subsetting to the target sector level). Sources
    in any other format are appended to the results unchanged. All results
    are then concatenated, aggregated, cleaned, sorted and passed to
    store_flowbysector.

    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: None - the function has no return statement; the final
        dataframe is handed to store_flowbysector(fbss, method_name)
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # load the method definition
    method = load_method(method_name)
    # dictionary of source names -> settings for each source
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fb.items():
        # k is the source name, v is the dictionary of settings for that source
        # load the data for this source
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            # (the string 'None' is the sentinel for "no function specified")
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                # the cleaning function is looked up by name in this module
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                # (one processed subset per activity name is collected here)
                flow_subset_list = []
                for n in names:
                    # subset the source data to rows where either activity column equals n
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n + " at the " +
                             v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(flow_subset, v['geoscale_to_use'], activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                        # geoscale, and sum to specified geoscale
                        log.info("Checking if flowbyactivity data exists for " + n + " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(flow_subset,
                                                                                               v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate the activity data to allocation geoscale
                    # (a larger fips_number_key value is treated as a less aggregated scale)
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to geoscale want to use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if the activity data is more aggregated than allocation table, filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # Add sectors to df activity, depending on level of specified sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                                     sectorsourcename=method['target_sector_source'],
                                                                     levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                # combine the per-activity subsets into one dataframe for this activity set
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    # function is looked up by name in this module and also receives the activity set settings
                    flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_wsec.copy()

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                              datasource=attr['allocation_source'],
                                                              years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    # (the return value of the check is not used here)
                    log.info("Checking if allocation data exists at the " + attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, keep the allocation data at its own scale and only filter the relevant rows
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                                   sectorsourcename=method['target_sector_source'],
                                                                   levelofSectoragg=attr['allocation_sector_aggregation'])

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " + attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                    # drop the 'Activity' column
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Futher disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_subset = getattr(sys.modules[__name__],
                                                        attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # keep only the activity rows whose produced-by or consumed-by sector appears in the
                    # allocation sector list
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    # first merge attaches ratios via SectorProducedBy, second via SectorConsumedBy;
                    # because both merges bring in 'Sector' and 'FlowAmountRatio', pandas suffixes
                    # the overlapping columns with _x (produced-by) and _y (consumed-by)
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    # merge the two ratio columns, preferring the produced-by ratio and falling back
                    # to the consumed-by ratio where the former is missing
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data (neither merge found a matching sector)
                    # NOTE(review): reset_index() is called without drop=True, so the previous index is
                    # kept as an 'index' column - confirm this is intended or removed downstream
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop the merge helper columns and the activity columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                            'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

                # from here on fbs comes from either the direct or the allocation branch above
                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # the starting scale is whichever of the two scales has the smaller fips_number_key value
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list, method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                # fbs_1: both sector columns are in the list; fbs_2: at least one sector column is in the list
                # NOTE(review): every row selected for fbs_1 also satisfies the fbs_2 condition, so the
                # concat below appears to contain those rows twice - confirm this is intended
                fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) &
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # replace the string 'nan' with None before the final clean and sort
    log.info("Sort and store dataframe")
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # hand the final dataframe to store_flowbysector (per the original note, saved as a parquet file)
    store_flowbysector(fbss, method_name)
Exemplo n.º 6
0
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_scale, compartments):
    """
    Returns emissions from stewicombo in fbs format
    :param inventory_dict: a dictionary of inventory types and years (e.g.,
                {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using sector_level_key,
                should match target_sector_level
    :param geo_scale: desired geographic aggregation level ('national', 'state',
                'county'), should match target_geoscale
    :param compartments: list of compartments to include (e.g., 'water', 'air',
                'soil'), use None to include all compartments
    :return: dataframe aggregated by levelized NAICS code, FlowName,
                Compartment and Location, with the flowbysector fields added
                and sorted by those fields
    """

    from stewi.globals import output_dir as stw_output_dir
    from stewi.globals import weighted_average
    import stewi
    import stewicombo
    import facilitymatcher
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    NAICS_level_value = sector_level_key[NAICS_level]
    ## run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)
    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    # load facility data from stewi output directory, keeping only the facility IDs, and geographic information
    inventory_list = list(inventory_dict.keys())
    facility_frames = []
    for inventory, year in inventory_dict.items():
        # define inventory name as inventory type + inventory year (e.g., NEI_2017)
        inventory_name = inventory + '_' + year
        facilities = pd.read_csv(stw_output_dir + 'facility/' +
                                 inventory_name + '.csv',
                                 usecols=['FacilityID', 'State', 'County'],
                                 dtype={'FacilityID': str})
        if facilities.duplicated(subset='FacilityID').any():
            log.info('Duplicate facilities in ' + inventory_name +
                     ' - keeping first listed')
            facilities = facilities.drop_duplicates(subset='FacilityID',
                                                    keep='first')
        facility_frames.append(facilities)
    # DataFrame.append was removed in pandas 2.0, so the per-inventory frames
    # are collected and concatenated once (pd.concat rejects an empty list)
    if facility_frames:
        facility_mapping = pd.concat(facility_frames)
    else:
        facility_mapping = pd.DataFrame()

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    ## merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    ## Access NAICS From facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    # keep primary NAICS only; reassign instead of mutating the filtered slice in place
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    all_NAICS = all_NAICS.drop(columns=['PRIMARY_INDICATOR'])
    all_NAICS = naics_expansion(all_NAICS)
    if all_NAICS.duplicated(subset=['FRS_ID', 'Source']).any():
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS = all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'],
                                              keep='first')
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    ## subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    if 'NEI' in inventory_list:
        log.info('Reassigning emissions from air transportation from airports')

        # obtain and prepare SCC dataset
        df_airplanes = stewi.getInventory('NEI',
                                          inventory_dict['NEI'],
                                          stewiformat='flowbySCC')
        # explicit copy so the column assignment below acts on an independent frame
        df_airplanes = df_airplanes[df_airplanes['SCC'] ==
                                    air_transportation_SCC].copy()
        df_airplanes['Source'] = 'NEI'
        df_airplanes = addChemicalMatches(df_airplanes)
        df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
        df_airplanes = df_airplanes.drop(columns=['SCC'])

        # one NAICS code per facility, taken from the combined inventory
        facility_mapping_air = df[['FacilityID', 'NAICS']].drop_duplicates(
            keep='first')
        df_airplanes = df_airplanes.merge(facility_mapping_air,
                                          how='left',
                                          on='FacilityID')

        df_airplanes['Year'] = inventory_dict['NEI']
        # keep only facilities whose NAICS code starts with the airport code
        df_airplanes = df_airplanes[
            df_airplanes['NAICS'].str[0:len(airport_NAICS)] ==
            airport_NAICS].copy()

        # subtract airplane emissions from airport NAICS at individual facilities
        df_planeemissions = df_airplanes[[
            'FacilityID', 'FlowName', 'FlowAmount'
        ]].rename(columns={'FlowAmount': 'PlaneEmissions'})
        df = df.merge(df_planeemissions,
                      how='left',
                      on=['FacilityID', 'FlowName'])
        # rows without a matching airplane record get nothing subtracted
        df['PlaneEmissions'] = df['PlaneEmissions'].fillna(value=0)
        df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
        df.drop(columns=['PlaneEmissions'], inplace=True)

        # add airplane emissions under air transport NAICS
        # NOTE(review): df_airplanes is only merged with FacilityID/NAICS above, so
        # unless the SCC inventory already carries a 'Location' column these rows
        # would have a missing Location and be removed by the dropna below - confirm
        df_airplanes['NAICS_lvl'] = air_transportation_naics[
            0:NAICS_level_value]
        df = pd.concat([df, df_airplanes], ignore_index=True)

    # update location to appropriate geoscale prior to aggregating
    df = df.dropna(subset=['Location'])
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score (ReliabilityScore weighted by FlowAmount within each group)
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, inventory_list)

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    # the location system is assigned from the year of the first inventory in the dictionary
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    return fbs