Example #1
def usgs_fba_data_cleanup(df):
    """
    Clean up the dataframe to prepare for flowbysector. Used in flowbysector.py
    :param df: df, FBA format
    :return: df, modified FBA
    """

    # drop rows of commercial data (because it only exists for 3 states),
    # which causes issues because it is linked with public supply
    # also drop closed-loop or once-through cooling (thermoelectric power)
    # to avoid double counting
    vLogDetailed.info('Removing all rows for Commercial Data because it does '
                      'not exist for all states and causes issues because it '
                      'is linked with information on Public Supply '
                      'deliveries.')
    dfa = df[~df['Description'].str.lower().str.
             contains('commercial|closed-loop cooling|once-through')]
    calculate_flowamount_diff_between_dfs(df, dfa)
    # calculate NET PUBLIC SUPPLY by subtracting out deliveries to domestic
    vLogDetailed.info('Modify the public supply values to generate '
                      'NET public supply by subtracting out deliveries '
                      'to domestic')
    dfb = calculate_net_public_supply(dfa)

    # check that golf + crop = total irrigation, if not,
    # assign all of total irrigation to crop
    vLogDetailed.info('If states do not distinguish between golf and crop '
                      'irrigation as a subset of total irrigation, assign '
                      'all of total irrigation to crop')
    dfc = check_golf_and_crop_irrigation_totals(dfb)

    # national
    df1 = dfc[dfc['Location'] == US_FIPS]

    # drop flowname = 'total' rows when possible to prevent double counting
    # subset data where flowname = total and where it does not
    vLogDetailed.info('Drop rows where the FlowName is total to prevent '
                      'double counting at the state and county levels. '
                      'Retain rows at national level')
    df2 = dfc[dfc['FlowName'] == 'total']
    # set conditions for data to keep when flowname = 'total'
    c1 = df2['Location'] != US_FIPS
    c2 = (~df2['ActivityProducedBy'].isnull()) & \
         (~df2['ActivityConsumedBy'].isnull())
    # subset data
    df2 = df2[c1 & c2].reset_index(drop=True)

    # second subset doesn't have total flowname or total compartment
    df3 = dfc[dfc['FlowName'] != 'total']
    df3 = df3[df3['Compartment'] != 'total']
    df3 = df3[df3['Location'] != US_FIPS]

    # concat the two df
    dfd = pd.concat([df1, df2, df3], ignore_index=True, sort=False)

    # In 2015, there is data for consumptive water use for
    # thermo and crop; drop these rows because consumptive water
    # loss is not calculated for all water categories
    dfd = dfd[dfd['Compartment'] != 'air'].reset_index(drop=True)

    return dfd
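
A minimal sketch of the subsetting logic above on toy data (simplified columns, not the full FBA schema): 'total' rows survive only when they are sub-national and both activity fields are populated, non-total rows survive sub-nationally when Compartment is not 'total', and national rows are always retained.

import pandas as pd

US_FIPS = '00000'  # assumed value of the US_FIPS constant used above

toy = pd.DataFrame({
    'Location': [US_FIPS, '06000', '06000', '06000'],
    'FlowName': ['total', 'total', 'total', 'fresh'],
    'ActivityProducedBy': ['Public Supply', 'Public Supply', None, None],
    'ActivityConsumedBy': ['Domestic'] * 4,
    'Compartment': ['total', 'ground', 'ground', 'ground'],
})

df1 = toy[toy['Location'] == US_FIPS]                   # national rows
df2 = toy[(toy['FlowName'] == 'total') &
          (toy['Location'] != US_FIPS) &
          toy['ActivityProducedBy'].notnull() &
          toy['ActivityConsumedBy'].notnull()]          # usable 'total' rows
df3 = toy[(toy['FlowName'] != 'total') &
          (toy['Compartment'] != 'total') &
          (toy['Location'] != US_FIPS)]                 # non-total rows
print(pd.concat([df1, df2, df3], ignore_index=True))    # 3 of 4 rows kept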
Example #2
def mecs_land_fba_cleanup_for_land_2012_fbs(fba):
    """
    The 'land_national_2012' FlowBySector uses MECS 2014 data, set
    MECS year to 2012
    :param fba: df, EIA MECS Land, FBA format
    :return: df, EIA MECS Land FBA modified
    """

    fba = mecs_land_fba_cleanup(fba)

    # reset the EIA MECS Land year from 2014 to 2012 to match
    # the USDA ERS MLU year
    vLogDetailed.info('Resetting year from 2014 to 2012')
    fba['Year'] = 2012

    return fba
Example #3
def calculate_total_facility_land_area(df):
    """
    In land use calculations, in addition to the provided floor area
    of buildings, estimate other related land area associated with
    commercial facilities (parking, signage, and landscaped area)
    :param df: df, eia cbecs land
    :return: df, modified eia cbecs land that incorporates additional land area
    for each activity
    """

    floor_space_to_land_area_ratio = \
        get_commercial_and_manufacturing_floorspace_to_land_area_ratio()

    vLogDetailed.info('Modifying FlowAmounts - Assuming the floor space to '
                      'land area ratio is 1:4')
    df = df.assign(
        FlowAmount=(df['FlowAmount'] / floor_space_to_land_area_ratio) -
        df['FlowAmount'])

    return df
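
Since the helper's return value is not shown here, a worked sketch assuming it returns 0.25 (the 1:4 floorspace-to-land-area ratio named in the log message): the assignment keeps only the additional land, not the total.

floor_space = 100.0                         # reported building floorspace
ratio = 0.25                                # assumed 1:4 floorspace:land
total_facility_land = floor_space / ratio   # 400.0
additional_land = total_facility_land - floor_space  # 300.0, the new FlowAmount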
Example #4
def cbecs_land_fba_cleanup(fba_load):
    """
    Clean up the land fba for use in allocation
    :param fba_load: df, eia cbecs land flowbyactivity format
    :return: df, flowbyactivity with modified values
    """

    # estimate floor space using number of floors
    fba = calculate_floorspace_based_on_number_of_floors(fba_load)

    # calculate the land area in addition to building footprint
    fba1 = calculate_total_facility_land_area(fba)

    # drop activities of 'all buildings' to avoid double counting
    fba2 = fba1[fba1['ActivityConsumedBy'] != 'All buildings'].reset_index(
        drop=True)
    vLogDetailed.info('Drop the principal building activity "All buildings" '
                      'to avoid double counting')
    calculate_flowamount_diff_between_dfs(fba1, fba2)

    return fba2
Example #5
def calculate_floorspace_based_on_number_of_floors(fba_load):
    """
    Estimate total floorspace for each building type based on data
    on the number of floors for each building type.
    Assumptions (Taken from Yang's static satellite tables):
    1. When floor range is 4-9, assume 6 stories
    2. When floor range is 10 or more, assume 15 stories
    :param fba_load: df, eia cbecs land flowbyactivity
    :return: df, eia cbecs land fba with estimated total floorspace
    """

    # disaggregate mercantile to malls and non malls
    fba = disaggregate_eia_cbecs_mercentile(fba_load)
    vLogDetailed.info('Calculate floorspace for mall and nonmall buildings '
                      'with different numbers of floors. Once calculated, '
                      'drop mercantile data from dataframe to avoid double '
                      'counting.')
    calculate_flowamount_diff_between_dfs(fba_load, fba)

    # disaggregate other and vacant
    fba2 = disaggregate_eia_cbecs_vacant_and_other(fba)
    vLogDetailed.info('Due to data suppression for floorspace by building '
                      'number of floors, some data is lost when dropping '
                      'floorspace for all buildings within a principal '
                      'building activity. To avoid this data loss, all '
                      'remaining floorspace for "All buildings" by number of '
                      'floors is allocated to "Vacant" and "Other" principal '
                      'building activities, as these activities are allocated '
                      'to all commercial building sectors. This assumption '
                      'results in a total floorspace increase for "Vacant" '
                      'and "Other" activities.')
    calculate_flowamount_diff_between_dfs(fba, fba2)

    # drop data for 'all buildings'
    fba3 = fba2[fba2['Description'] != 'All buildings']
    # add column 'DivisionFactor' based on description
    fba3 = fba3.assign(DivisionFactor=fba3['Description'].apply(
        lambda x: 1 if 'One' in x
        else 2 if 'Two' in x
        else 3 if 'Three' in x
        else 6 if 'Four' in x
        else 15 if 'Ten' in x
        else ""))
    # modify flowamounts to represent building footprint rather than
    # total floorspace
    fba3['FlowAmount'] = fba3['FlowAmount'] / fba3['DivisionFactor']
    # sum values for single flowamount for each building type
    vLogDetailed.info('Drop flows for "All Buildings" to avoid double '
                      'counting, as floorspace is maintained by building '
                      'type based on number of floors. Also divide total '
                      'floorspace by number of floors to calculate a '
                      'building footprint. These calculations result in '
                      'reduced FlowAmounts for all categories.')
    calculate_flowamount_diff_between_dfs(fba2, fba3)
    # rename the FlowNames and sum to a single total floorspace,
    # rather than multiple rows based on number of floors
    fba3 = fba3.assign(
        FlowName=fba3['FlowName'].apply(lambda x: ','.join(x.split(',')[:-1])))
    # modify the description
    fba3 = fba3.assign(Description='Building Footprint')
    groupbycols = fba_mapped_default_grouping_fields
    fba4 = aggregator(fba3, groupbycols)

    return fba4
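
A toy illustration of the footprint step above, with hypothetical Description strings (the real CBECS labels may differ) and the same story-count assumptions (4-9 floors -> 6 stories, 10 or more -> 15 stories):

import pandas as pd

toy = pd.DataFrame({
    'Description': ['One floor', 'Four to nine floors', 'Ten or more floors'],
    'FlowAmount': [100.0, 600.0, 1500.0],   # total floorspace per type
})

def division_factor(desc):
    # mirror of the nested conditional used above
    for key, factor in [('One', 1), ('Two', 2), ('Three', 3),
                        ('Four', 6), ('Ten', 15)]:
        if key in desc:
            return factor
    return float('nan')

toy['Footprint'] = toy['FlowAmount'] / toy['Description'].map(division_factor)
# Footprint is 100.0 for every row: floorspace divided by assumed stories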
Example #6
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry sites,
    marshes, swamps, sand dunes, bare rocks, deserts, tundra,
    rural residential, and other unclassified land. In this report,
    urban land is reported as a separate category."

    Mining data is calculated using a separate source = BLM PLS.
    Want to extract rural residential land area from total value of
    'Other Land'
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the activity
                     set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_crosswalk('household')
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and
    # location = 00000, replace with rural res value
    vLogDetailed.info('The only category for MLU other land use is rural land '
                      'occupation. All other land area in this category is '
                      'unassigned to sectors, resulting in unaccounted land '
                      'area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
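
A minimal sketch of the np.where substitution above, with a hypothetical household code and rural residential value:

import numpy as np
import pandas as pd

household = ['F01000']   # hypothetical household sector code
rural_res = 42.0         # assumed rural residential land area

df = pd.DataFrame({'SectorConsumedBy': ['F01000', '212310'],
                   'FlowAmount': [999.0, 10.0]})
df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                            rural_res, df['FlowAmount'])
# -> [42.0, 10.0]: household rows are overwritten, all others untouched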
Example #7
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and published
    national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
           to subset national level data
    :return: df, comparing published national level data to df subset
    """

    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) | (
            nat[fba_activity_fields[1]].isin(activity_names))].reset_index(
                drop=True)
        nat = replace_strings_with_NoneType(nat)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some
        # strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = [
            'Class', 'SourceName', 'FlowName', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Compartment',
            'Location', 'LocationSystem', 'Year'
        ]
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(FlowAmount_diff=df_m['FlowAmount_nat'] -
                           df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(
            abs(df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(drop=True)

        if len(df_m_sub) == 0:
            vLog.info(
                'No data loss greater than 1%% between national '
                'level data and %s subset', attr['allocation_from_scale'])
        else:
            vLog.info(
                'There are data differences between published national'
                ' values and %s subset, saving to validation log',
                attr['allocation_from_scale'])

            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated data '
                'subset for %s: \n {}'.format(df_m_sub.to_string()),
                activity_set)
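
Stripped of schema details, the comparison above reduces to an outer merge and a percent difference; a toy sketch:

import pandas as pd

nat = pd.DataFrame({'FlowName': ['fresh'], 'FlowAmount_nat': [100.0]})
sub = pd.DataFrame({'FlowName': ['fresh'], 'FlowAmount_sub': [98.0]})

df_m = pd.merge(nat, sub, how='outer')
df_m['FlowAmount_diff'] = df_m['FlowAmount_nat'] - df_m['FlowAmount_sub']
df_m['Percent_Diff'] = abs(df_m['FlowAmount_diff'] /
                           df_m['FlowAmount_nat']) * 100
# Percent_Diff is 2.0 here, so the row would be written to the validation log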
Example #8
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector output
    total. Not a direct comparison of the loaded FBA because FBAs are
    modified before being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. '
            'Details in Validation Log', activity_set,
            source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for '
                    '%s %s %s at %s is equal to the '
                    'FlowByActivity FlowAmount', source_name, activity_set, i,
                    j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')
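
The from-scale choice near the top of the function keeps the coarser of the two geographies; a sketch assuming fips_number_key orders scales from coarse to fine (e.g. national=0, state=1, county=2):

fips_number_key = {'national': 0, 'state': 1, 'county': 2}  # assumed mapping

def choose_from_scale(geoscale_to_use, allocation_from_scale):
    # mirrors the conditional above: the smaller (coarser) key wins
    if (fips_number_key[geoscale_to_use]
            < fips_number_key[allocation_from_scale]):
        return geoscale_to_use
    return allocation_from_scale

print(choose_from_scale('state', 'county'))  # -> 'state'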
Example #9
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
             save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df

        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the fba and fbs aggregated dfs
        df_merge = fba_agg.merge(fbs_agg,
                                 on=[
                                     'ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'
                                 ],
                                 how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector '
                'length where the flowbyactivity to flowbysector ratio '
                'differs from 1 by more than %s', len(comparison2),
                str(tolerance))

        # include df subset in the validation log
        # only print rows where the flowamount ratio is
        # less than 1 (round the flowamount ratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load '
            'to FlowBySector ratios for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to '
                'FlowBySector ratios for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
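
The heart of the check is the FBS/FBA ratio at the target sector length; a scalar sketch of the tolerance test:

fba_amount, fbs_amount, tolerance = 100.0, 97.0, 0.01
ratio = fbs_amount / fba_amount                            # 0.97
flagged = ratio < 1 - tolerance or ratio > 1 + tolerance   # True -> logged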
Example #10
def calculate_flowamount_diff_between_dfs(dfa_load, dfb_load):
    """
    Calculate the differences in FlowAmounts between two dfs
    :param dfa_load: df, initial df
    :param dfb_load: df, modified df
    :return: df, comparing changes in flowamounts between 2 dfs
    """

    # subset the dataframes, only keeping data for easy
    # comparison of flowamounts
    drop_cols = [
        'Year', 'MeasureofSpread', 'Spread', 'DistributionType', 'Min', 'Max',
        'DataReliability', 'DataCollection'
    ]
    # drop cols and rename, ignore error if a df does not
    # contain a column to drop
    dfa = dfa_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Original'})
    dfb = dfb_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Modified'})
    # create list for the modified dfs created in the for loop
    df_list = []
    for dfx in [dfa, dfb]:
        # assign new column of geoscale by which to aggregate
        dfx2 = dfx.assign(geoscale=np.where(
            dfx['Location'].apply(lambda x: x.endswith('000')),
            'state', 'county'))
        dfx2 = dfx2.assign(geoscale=np.where(
            dfx2['Location'] == '00000', 'national', dfx2['geoscale']))
        # ensure all nan/nones filled/match
        dfx2 = replace_strings_with_NoneType(dfx2)
        df_list.append(dfx2)
    # merge the two dataframes
    df = df_list[0].merge(df_list[1], how='outer')

    # determine if any new data is negative
    dfn = df[df['FlowAmount_Modified'] < 0].reset_index(drop=True)
    if len(dfn) > 0:
        vLog.info('There are negative FlowAmounts in new dataframe, '
                  'see Validation Log')
        vLogDetailed.info('Negative FlowAmounts in new dataframe: '
                          '\n {}'.format(dfn.to_string()))

    # Because code will sometimes change terminology, aggregate
    # data by context and flowable to compare df differences
    # subset df
    dfs = df[[
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowAmount_Original', 'FlowAmount_Modified', 'Unit', 'geoscale'
    ]]
    agg_cols = [
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'Unit', 'geoscale'
    ]
    dfagg = dfs.groupby(agg_cols, dropna=False, as_index=False).agg({
        'FlowAmount_Original': 'sum',
        'FlowAmount_Modified': 'sum'
    })
    # column calculating difference
    dfagg['FlowAmount_Difference'] = \
        dfagg['FlowAmount_Modified'] - dfagg['FlowAmount_Original']
    dfagg['Percent_Difference'] = (dfagg['FlowAmount_Difference'] /
                                   dfagg['FlowAmount_Original']) * 100
    # drop rows where difference = 0
    dfagg2 = dfagg[dfagg['FlowAmount_Difference'] != 0].reset_index(drop=True)
    if len(dfagg2) == 0:
        vLogDetailed.info('No FlowAmount differences')
    else:
        # subset df and aggregate, also print out the total
        # aggregate diff at the geoscale
        dfagg3 = replace_strings_with_NoneType(dfagg).drop(columns=[
            'ActivityProducedBy', 'ActivityConsumedBy',
            'FlowAmount_Difference', 'Percent_Difference'
        ])
        dfagg4 = dfagg3.groupby(['Flowable', 'Context', 'Unit', 'geoscale'],
                                dropna=False,
                                as_index=False).agg({
                                    'FlowAmount_Original': 'sum',
                                    'FlowAmount_Modified': 'sum'
                                })
        # column calculating difference
        dfagg4['FlowAmount_Difference'] = \
            dfagg4['FlowAmount_Modified'] - dfagg4['FlowAmount_Original']
        dfagg4['Percent_Difference'] = (dfagg4['FlowAmount_Difference'] /
                                        dfagg4['FlowAmount_Original']) * 100
        # drop rows where difference = 0
        dfagg5 = dfagg4[dfagg4['FlowAmount_Difference'] != 0].reset_index(
            drop=True)
        vLogDetailed.info('Total FlowAmount differences between dataframes: '
                          '\n {}'.format(dfagg5.to_string(index=False)))

        # save detail output in log file
        vLogDetailed.info('Total FlowAmount differences by Activity Columns: '
                          '\n {}'.format(dfagg2.to_string(index=False)))
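
A minimal sketch of the geoscale assignment used in the loop above, applied to toy FIPS strings:

import numpy as np
import pandas as pd

df = pd.DataFrame({'Location': ['00000', '06000', '06037']})
df['geoscale'] = np.where(
    df['Location'].apply(lambda x: x.endswith('000')), 'state', 'county')
df['geoscale'] = np.where(df['Location'] == '00000', 'national',
                          df['geoscale'])
# -> ['national', 'state', 'county']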
Example #11
def check_allocation_ratios(flow_alloc_df_load, activity_set, config, attr):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :param attr: dictionary, activity set info
    :return: print out information regarding allocation ratios,
             save csv of results to local directory
    """
    # if in the attr dictionary, merge columns are identified,
    # the merge columns need to be accounted for in the grouping/checking of
    # allocation ratios
    if 'allocation_merge_columns' in attr:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ] + attr['allocation_merge_columns']
        groupcols = ['FBA_Activity', 'Location', 'SectorLength'
                     ] + attr['allocation_merge_columns']
    else:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ]
        groupcols = ['FBA_Activity', 'Location', 'SectorLength']

    # create column of sector lengths
    flow_alloc_df =\
        flow_alloc_df_load.assign(
            SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[subset_cols]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(
            groupcols, dropna=False, as_index=False).agg(
            {"FlowAmountRatio": "sum"})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep data where the flowamountratio differs from 1
    # by more than the tolerance
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info(
            'There are %s instances at a sector length of %s '
            'where the allocation ratio for a location differs '
            'from 1 by more than %s. See Validation Log',
            len(flow_alloc_df5), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info(
        'Save the summary table of flow allocation ratios for each '
        'sector length for %s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s '
                          'all round to 1', activity_set)

    else:
        vLogDetailed.info(
            'Flow allocation ratios for %s: '
            '\n {}'.format(flow_alloc_df5.to_string()), activity_set)
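
The check amounts to summing FlowAmountRatio per activity/location at the target sector length and flagging sums outside 1 +/- tolerance; a toy sketch:

import pandas as pd

toy = pd.DataFrame({
    'FBA_Activity': ['Irrigation'] * 3,
    'Location': ['06000'] * 3,
    'SectorLength': [6, 6, 6],
    'FlowAmountRatio': [0.5, 0.3, 0.15],   # sums to 0.95
})
sums = toy.groupby(['FBA_Activity', 'Location', 'SectorLength'],
                   as_index=False).agg({'FlowAmountRatio': 'sum'})
tolerance = 0.01
flagged = sums[(sums['FlowAmountRatio'] < 1 - tolerance) |
               (sums['FlowAmountRatio'] > 1 + tolerance)]
# one flagged row: ratios for this activity/location sum to 0.95, not ~1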
Example #12
def allocate_usda_ers_mlu_land_in_urban_areas(df, attr, fbs_list):
    """
    This function is used to allocate the USDA_ERS_MLU activity 'land in
    urban areas' to NAICS 2012 sectors. Allocation is dependent on
    assumptions defined in 'literature_values.py' as well as results from
    allocating 'EIA_CBECS_Land' and 'EIA_MECS_Land' to land based sectors.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal
    Land Footprints: An Exploration Across the Urban–Rural Continuum in the
    United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024

    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior
                     to the activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    # define the sector column on which to base calculations
    sector_col = 'SectorConsumedBy'

    vLogDetailed.info('Assuming total land use from MECS and CBECS included '
                      'in urban land area, so subtracting out calculated '
                      'MECS and CBECS land from MLU urban land area')
    # read in the cbecs and mecs df from df_list
    for df_i in fbs_list:
        if (df_i['MetaSources'] == 'EIA_CBECS_Land').all():
            cbecs = df_i
        elif (df_i['MetaSources'] == 'EIA_MECS_Land').all():
            mecs = df_i

    # load the federal highway administration fees dictionary
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict,
        orient='index').rename(columns={'NAICS_2012_Code': sector_col})

    # calculate total residential area from the American Housing Survey
    residential_land_area = get_area_of_urban_land_occupied_by_houses_2013()
    df_residential = df[df[sector_col] == 'F01000']
    df_residential = df_residential.assign(FlowAmount=residential_land_area)

    # make an assumption about the percent of urban area that is open space
    openspace_multiplier = get_open_space_fraction_of_urban_area()
    df_openspace = df[df[sector_col] == '712190']
    df_openspace = df_openspace.assign(FlowAmount=df_openspace['FlowAmount'] *
                                       openspace_multiplier)

    # sum all uses of urban area that are NOT transportation
    # first concat dfs for residential, openspace, commercial,
    # and manufacturing land use
    df_non_urban_transport_area = pd.concat(
        [df_residential, df_openspace, cbecs, mecs],
        sort=False,
        ignore_index=True)
    df_non_urban_transport_area = \
        df_non_urban_transport_area[['Location', 'Unit', 'FlowAmount']]
    non_urban_transport_area_sum = df_non_urban_transport_area.groupby(
        ['Location', 'Unit'], as_index=False).agg({
            'FlowAmount': 'sum'
        }).rename(columns={'FlowAmount': 'NonTransport'})
    # compare units
    compare_df_units(df, df_non_urban_transport_area)
    # calculate total urban transportation by subtracting
    # calculated areas from total urban land
    df_transport = df.merge(non_urban_transport_area_sum, how='left')
    df_transport = df_transport.assign(FlowAmount=df_transport['FlowAmount'] -
                                       df_transport['NonTransport'])
    df_transport.drop(columns=['NonTransport'], inplace=True)

    # make an assumption about the percent of urban transport
    # area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df_transport[df_transport[sector_col] == '488119']
    df_airport = df_airport.assign(FlowAmount=df_airport['FlowAmount'] *
                                   airport_multiplier)

    # make an assumption about the percent of urban transport
    # area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df_transport[df_transport[sector_col] == '482112']
    df_railroad = df_railroad.assign(FlowAmount=df_railroad['FlowAmount'] *
                                     railroad_multiplier)

    # further allocate the remaining urban transportation area using
    # Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = air_rail_area.groupby(
        ['Location', 'Unit'], as_index=False).agg({
            'FlowAmount': 'sum'
        }).rename(columns={'FlowAmount': 'AirRail'})

    df_highway = df_transport.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(FlowAmount=df_highway['FlowAmount'] -
                                   df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(FlowAmount=df_highway2['FlowAmount'] *
                                     df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat all df subsets
    allocated_urban_areas_df = pd.concat(
        [df_residential, df_openspace, df_airport, df_railroad, df_highway2],
        ignore_index=True,
        sort=False).reset_index(drop=True)

    return allocated_urban_areas_df
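
Per location, the allocation above is a chain of subtractions and shares; a worked numeric sketch with assumed values (the real multipliers come from literature_values.py):

total_urban = 1000.0           # MLU 'land in urban areas' for one location
residential = 350.0            # American Housing Survey estimate
openspace = total_urban * 0.1  # assumed open-space fraction of urban area
cbecs, mecs = 150.0, 100.0     # commercial and manufacturing land totals

transport = total_urban - (residential + openspace + cbecs + mecs)  # 300.0
airports = transport * 0.05    # assumed airport share of transport area
railroads = transport * 0.05   # assumed railroad share
highway = transport - airports - railroads  # remainder, split by FHA fees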
Example #13
def calculate_net_public_supply(df_load):
    """
    USGS provides info on the quantity of public supply withdrawals that
    are delivered to domestic use. The USGS PS withdrawals are not necessarily
    greater than/equal to the Domestic deliveries because water can be
    withdrawn in one county and delivered in another (water can also cross
    state lines). Therefore, negative NET public supply values can and do
    occur, and PS water should only be used at a national level.

    Domestic deliveries are subtracted from public supply. An assumption is
    made that PS deliveries to domestic is fresh water. The national level
    data can then be allocated to end users using the BEA Use tables.
    :param df_load: USGS df
    :return: df with net public supply values
    """

    # subset into 2 dfs, one that contains PS data and one that does not
    df1 = df_load[(df_load[fba_activity_fields[0]] == 'Public Supply') |
                  (df_load[fba_activity_fields[1]] == 'Public Supply')]
    df2 = df_load[(df_load[fba_activity_fields[0]] != 'Public Supply')
                  & (df_load[fba_activity_fields[1]] != 'Public Supply')]

    # drop all deliveries to thermo and industrial
    # (not enough states report the data to make usable)
    df1_sub = df1[~df1[fba_activity_fields[1]].isin([
        'Industrial', 'Thermoelectric Power',
        'Thermoelectric Power Closed-loop cooling',
        'Thermoelectric Power Once-through cooling'
    ])]
    # drop duplicate info of "Public Supply deliveries to"
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("Public Supply total deliveries")]
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("deliveries from public supply")]

    # calculate data drop
    vLogDetailed.info('Dropping rows that contain "deliveries from public '
                      'supply" to avoid double counting with rows of "Public '
                      'Supply deliveries to"')
    calculate_flowamount_diff_between_dfs(df1, df1_sub)

    # drop county level values because cannot use county data
    vLogDetailed.info('Dropping county level public supply withdrawals '
                      'because will end up with negative values due to '
                      'instances of water deliveries coming from surrounding '
                      'counties')
    df1_sub = df1_sub[df1_sub['Location'].apply(
        lambda x: x[2:6] == '000')].reset_index(drop=True)

    # df of ps delivered and ps withdrawn and us total
    df_d = df1_sub[df1_sub[fba_activity_fields[0]] == 'Public Supply']
    df_w = df1_sub[df1_sub[fba_activity_fields[1]] == 'Public Supply']
    df_us = df1_sub[df1_sub['Location'] == '00000']
    # split withdrawals further into fresh water (assumption: domestic
    # deliveries are freshwater); withdrawals are assumed to be taken
    # from ground and surface in proportion to each compartment's share
    df_w1 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] != 'total')]
    df_w2 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] == 'total')]
    # compare units
    compare_df_units(df_w1, df_w2)
    df_wm = pd.merge(df_w1,
                     df_w2[['FlowAmount', 'Location', 'Unit']],
                     how='left',
                     left_on=['Location', 'Unit'],
                     right_on=['Location', 'Unit'])
    df_wm = df_wm.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "FlowTotal"
    })
    # compare units
    compare_df_units(df_wm, df_d)
    # merge the deliveries to domestic
    df_w_modified = pd.merge(df_wm,
                             df_d[['FlowAmount', 'Location']],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_w_modified = df_w_modified.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "DomesticDeliveries"
    })

    # create flowratio for ground/surface
    df_w_modified.loc[:, 'FlowRatio'] = \
        df_w_modified['FlowAmount'] / df_w_modified['FlowTotal']
    # calculate new, net total public supply withdrawals
    # will end up with negative values due to instances of water
    # deliveries coming from surrounding counties
    df_w_modified.loc[:, 'FlowAmount'] = \
        df_w_modified['FlowAmount'] - (df_w_modified['FlowRatio'] *
                                       df_w_modified['DomesticDeliveries'])

    net_ps = df_w_modified.drop(columns=["FlowTotal", "DomesticDeliveries"])

    # compare units
    compare_df_units(df_d, net_ps)
    # because assuming domestic is all fresh, drop
    # flowname/flowable/Compartment/context
    # and instead use those column data from the net_ps df
    df_d_modified = df_d.drop(
        columns=['FlowName', 'Flowable', 'Compartment', 'Context', 'FlowUUID'])
    # Also allocate to ground/surface from state ratios
    df_d_modified = pd.merge(df_d_modified,
                             net_ps[[
                                 'FlowName', 'Flowable', 'Compartment',
                                 'Context', 'FlowUUID', 'Location', 'FlowRatio'
                             ]],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_d_modified.loc[:, 'FlowAmount'] = \
        df_d_modified['FlowAmount'] * df_d_modified['FlowRatio']
    df_d_modified = df_d_modified.drop(columns=["FlowRatio"])

    net_ps = net_ps.drop(columns=["FlowRatio"])

    # concat dfs back (non-public supply, public supply
    # deliveries, net ps withdrawals)
    modified_ps = pd.concat([df2, df_d_modified, net_ps, df_us],
                            ignore_index=True)

    return modified_ps
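
Per location, the net calculation reduces to the following arithmetic; a sketch with assumed values for one ground-water row:

fresh_ground = 60.0          # fresh withdrawals, ground compartment
fresh_total = 100.0          # fresh withdrawals, 'total' compartment
domestic_deliveries = 20.0   # public supply deliveries to Domestic

flow_ratio = fresh_ground / fresh_total                        # 0.6
net_ground = fresh_ground - flow_ratio * domestic_deliveries   # 48.0
# deliveries are likewise split to ground/surface using the same ratio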
Example #14
def equally_allocate_parent_to_child_naics(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at
    target sector level.
    Equally allocate parent NAICS to child NAICS where child NAICS missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]] == '')]
        df_x2 = \
            df.loc[(df[fbs_activity_fields[0]] == '') &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        if len(df_x) > 0:
            # create df of i + 1 length
            df_y1 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
                | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y2 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
                & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

            # create temp sector columns in df y, that are i digits in length
            df_y.loc[:, 'spb_tmp'] = \
                df_y[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            df_y.loc[:, 'scb_tmp'] = \
                df_y[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # don't modify household sector lengths or gov't transport
            df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

            # merge the two dfs
            if 'Context' in df_y.columns:
                merge_cols = [
                    'Class', 'Context', 'FlowType', 'Flowable', 'Location',
                    'LocationSystem', 'Unit', 'Year'
                ]
            else:
                merge_cols = [
                    'Class', 'FlowType', 'Location', 'LocationSystem', 'Unit',
                    'Year'
                ]

            df_m = pd.merge(df_x,
                            df_y[merge_cols + ['spb_tmp', 'scb_tmp']],
                            how='left',
                            left_on=merge_cols +
                            ['SectorProducedBy', 'SectorConsumedBy'],
                            right_on=merge_cols + ['spb_tmp', 'scb_tmp'])

            # extract the rows that are not disaggregated to more
            # specific naics
            rl = df_m[(df_m['scb_tmp'].isnull())
                      & (df_m['spb_tmp'].isnull())].reset_index(drop=True)
            # clean df
            rl = replace_strings_with_NoneType(rl)
            rl_list = rl[['SectorProducedBy', 'SectorConsumedBy']]\
                .drop_duplicates().values.tolist()

            # match sectors with target sector length sectors
            # import cw and subset to current sector length
            # and target sector length
            cw_load = load_crosswalk('sector_length')
            nlength = list(sector_level_key.keys())[list(
                sector_level_key.values()).index(i)]
            cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
            # add column with counts
            cw['sector_count'] = \
                cw.groupby(nlength)[nlength].transform('count')

            # merge df & conditionally replace sector produced/consumed columns
            rl_m = pd.merge(rl,
                            cw,
                            how='left',
                            left_on=[fbs_activity_fields[0]],
                            right_on=[nlength])
            rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                     fbs_activity_fields[0]] = rl_m[target_sector_level]
            rl_m = rl_m.drop(columns=[nlength, target_sector_level])

            rl_m2 = pd.merge(rl_m,
                             cw,
                             how='left',
                             left_on=[fbs_activity_fields[1]],
                             right_on=[nlength])
            rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                      fbs_activity_fields[1]] = rl_m2[target_sector_level]
            rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

            # create one sector count column
            rl_m2['sector_count_x'] = \
                rl_m2['sector_count_x'].fillna(rl_m2['sector_count_y'])
            rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
            rl_m3 = rl_m3.drop(columns=['sector_count_y'])

            # calculate new flow amounts, based on sector count,
            # allocating equally to the new sector length codes
            rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
            rl_m3 = rl_m3.drop(columns=['sector_count', 'spb_tmp', 'scb_tmp'])

            # append to df
            if len(rl) != 0:
                vLogDetailed.warning(
                    'Data found at %s digit NAICS not '
                    'represented in current data subset: '
                    '{}'.format(' '.join(map(str, rl_list))), str(i))
                rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to '
            'each %s associated with the sectors previously '
            'dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
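
A toy sketch of the equal-allocation step above: a parent NAICS with no children in the df is expanded to its target-length children via the crosswalk, and its FlowAmount is divided by the child count (hypothetical crosswalk values):

import pandas as pd

# hypothetical crosswalk: a 2-digit parent with three 6-digit children
cw = pd.DataFrame({'NAICS_2': ['11'] * 3,
                   'NAICS_6': ['111110', '111120', '111130']})
cw['sector_count'] = cw.groupby('NAICS_2')['NAICS_2'].transform('count')

lost = pd.DataFrame({'SectorConsumedBy': ['11'], 'FlowAmount': [90.0]})
lost_m = lost.merge(cw, left_on='SectorConsumedBy', right_on='NAICS_2')
lost_m['SectorConsumedBy'] = lost_m['NAICS_6']
lost_m['FlowAmount'] = lost_m['FlowAmount'] / lost_m['sector_count']
# three rows of 30.0 each, one per child NAICS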