Example #1
def apply_tons_per_employee_per_year_to_states(fbs):
    """
    Calculates tons per employee per year based on BLS_QCEW employees
    by sector and applies that quantity to employees in all states
    """
    bls = load_fba_w_standardized_units(datasource='BLS_QCEW',
                                        year=fbs['Year'].unique()[0],
                                        flowclass='Employment',
                                        geographic_level='state')
    bls = bls[bls['FlowName'] == 'Number of employees']
    # clean df
    bls = clean_bls_qcew_fba(bls)
    bls = add_sectors_to_flowbyactivity(bls)

    # Subset BLS dataset
    sector_list = list(filter(None, fbs['SectorProducedBy'].unique()))
    bls = get_fba_allocation_subset(bls, 'BLS_QCEW', sector_list)
    bls = bls.rename(columns={'FlowAmount': 'Employees'})
    bls = bls[['Employees', 'Location', 'Year', 'SectorProducedBy']]

    # Calculate tons per employee per year per material and sector in CA
    bls_CA = bls[bls['Location'] == '06000']  # California
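    # inner merge aligns the fbs and CA employment dfs on their shared
    # columns (e.g. 'Year' and 'SectorProducedBy'), keeping matched sectors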
    tpepy = fbs.merge(bls_CA, how='inner')
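    # element-wise division that writes 0 (from the `out` array) wherever
    # Employees is 0, avoiding divide-by-zero inf/NaN values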
    tpepy['TPEPY'] = np.divide(tpepy['FlowAmount'],
                               tpepy['Employees'],
                               out=np.zeros_like(tpepy['Employees']),
                               where=tpepy['Employees'] != 0)
    tpepy = tpepy.drop(columns=['Employees', 'FlowAmount', 'Location'])

    # Apply TPEPY back to all employees in all states
    national_waste = tpepy.merge(bls, how='outer')
    national_waste['FlowAmount'] = \
        national_waste['Employees'] * national_waste['TPEPY']

    return national_waste
Example #2
def scale_blackhurst_results_to_usgs_values(df_load, attr,
                                            download_FBA_if_missing):
    """
    Scale the initial estimates for Blackhurst-based mining estimates to
    USGS values. Oil-based sectors are allocated a larger percentage of the
    difference between initial water withdrawal estimates and published USGS
    values.

    This method is based on the Water Satellite Table created by Yang and
    Ingwersen, 2017
    :param df_load: df, fba dataframe to be modified
    :param attr: dictionary, attribute data from method yaml for activity set
    :param download_FBA_if_missing: bool, indicate if missing FBAs should be
        downloaded from Data Commons
    :return: scaled fba results
    """
    # determine national level published withdrawal data for usgs mining
    # in FBS method year
    pv_load = load_fba_w_standardized_units(
        datasource="USGS_NWIS_WU",
        year=str(attr['helper_source_year']),
        flowclass='Water',
        download_FBA_if_missing=download_FBA_if_missing)

    pv_sub = pv_load[(pv_load['ActivityConsumedBy'] == 'Mining')
                     & (pv_load['Compartment'] == 'total') &
                     (pv_load['FlowName'] == 'total')].reset_index(drop=True)
    # rename the published value flow name and merge with Blackhurst data
    pv_sub = pv_sub.rename(columns={'FlowAmount': 'pv'})
    df = df_load.merge(pv_sub[['Location', 'pv']], how='left')
    # calculate the difference between published value and allocated value
    # for each naics length
    df = df.assign(nLen=df['SectorConsumedBy'].apply(lambda x: len(x)))
    # calculate initial FlowAmount accounted for
    df = df.assign(av=df.groupby('nLen')['FlowAmount'].transform('sum'))
    # calc difference
    df = df.assign(vd=df['pv'] - df['av'])

    # subset df to scale into oil and non-oil sectors
    df['sector_label'] = np.where(
        df['SectorConsumedBy'].apply(lambda x: x[0:5] == '21111'), 'oil',
        'nonoil')
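    # oil sectors absorb a larger share (2/3) of the difference than
    # non-oil sectors (1/3), per the docstring above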
    df['ratio'] = np.where(df['sector_label'] == 'oil', 2 / 3, 1 / 3)
    df['label_sum'] = df.groupby(['Location', 'nLen', 'sector_label'
                                  ])['FlowAmount'].transform('sum')

    # calculate revised water withdrawal allocation
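    # revised = flow + (flow / label_sum) * (ratio * vd): each sector picks up
    # its group's share of the unallocated difference in proportion to its
    # contribution to that group's total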
    df_scaled = df.copy()
    df_scaled.loc[:, 'FlowAmount'] = \
        df_scaled['FlowAmount'] + \
        (df_scaled['FlowAmount'] / df_scaled['label_sum']) * \
        (df_scaled['ratio'] * df_scaled['vd'])
    df_scaled = df_scaled.drop(columns=[
        'sector_label', 'ratio', 'nLen', 'label_sum', 'pv', 'av', 'vd'
    ])

    return df_scaled
Example #3
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
                   'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """

    log.info("Loading allocation flowbyactivity %s for year %s", fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename,
                                        year=df_year,
                                        flowclass=flowclass)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level", geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivity source, if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
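    # note: subset values read from the method yaml may arrive as the
    # string 'None' rather than NoneType, hence the string comparisons below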
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]

    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]
    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(fba, attr=attr)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba allocation data, if one exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(fba_sourcename,
                                          kwargs['clean_fba_w_sec'])(fba_wsec, attr=attr,
                                                                     method=method,
                                                                     sourcename=fba_sourcename)

    return fba_wsec
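
# illustrative call (hypothetical source name and subset values):
# fba_wsec = load_map_clean_fba(method, attr, fba_sourcename='USDA_IWMS',
#                               df_year='2015', flowclass='Land',
#                               geoscale_from='state', geoscale_to='state',
#                               flowname_subset=['CROP TOTALS'])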
Example #4
def iwms_aggregation(df_load, **kwargs):
    """
    Before multiplying the USDA CoA Cropland data by IWMS data,
    first aggregate the two hay values from IWMS
    :param df_load: df, USDA CoA Cropland data in FBA format with sector columns
    :param kwargs: dictionary, includes 'attr' - attribute data from the
        method yaml for the activity set
    :return: df, FBA format with the two IWMS hay values aggregated
    """

    # load the acreage information for iwms
    land_load = load_fba_w_standardized_units(
        "USDA_IWMS",
        year=kwargs['attr']['helper_source_year'],
        flowclass="Land",
        geographic_level="state")

    # subset to hay and haylage
    land = land_load[land_load['ActivityConsumedBy'].isin(
        ['HAY & HAYLAGE, (EXCL ALFALFA)', 'HAY & HAYLAGE, ALFALFA'])]
    land_sub = land[['ActivityConsumedBy', 'FlowAmount',
                     'Location']].reset_index(drop=True)
    land_sub = land_sub.rename(columns={'FlowAmount': 'HelperFlow'})

    # merge the two dfs
    df = pd.merge(df_load, land_sub, how='right')
    df['HelperFlow'] = df['HelperFlow'].fillna(1)
    # drop rows where flow is 0
    df = df[df['FlowAmount'] != 0]
    # reset hay sectors and rename
    df['SectorConsumedBy'] = np.where(
        df['SectorConsumedBy'].isin(['111940A', '111940B']), '11194',
        df['SectorConsumedBy'])
    df['ActivityConsumedBy'] = np.where(df['SectorConsumedBy'] == '11194',
                                        'HAY & HAYLAGE',
                                        df['ActivityConsumedBy'])

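    # collapse the two hay activities per location into one value, weighting
    # each by its IWMS acreage (HelperFlow)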
    wt_flow = df.groupby(df['Location']).apply(lambda x: np.average(
        x['FlowAmount'], weights=x['HelperFlow'])).reset_index()
    wt_flow = wt_flow.rename(columns={wt_flow.columns[1]: 'NewFlow'})

    df2 = df.merge(wt_flow)
    # reset flowamount, drop duplicates, drop columns
    df2 = df2.assign(FlowAmount=df2['NewFlow']).drop(
        columns=['HelperFlow', 'NewFlow'])
    df3 = df2.drop_duplicates()

    # drop data from original, add in modified data
    df_o = df_load[~df_load['SectorConsumedBy'].isin(['111940A', '111940B'])]
    df4 = pd.concat([df_o, df3], ignore_index=True)

    return df4
Example #5
def scale_blackhurst_results_to_usgs_values(df_to_scale, attr):
    """
    Scale the initial estimates for Blackhurst-based mining estimates to USGS values.
    Oil-based sectors are allocated a larger percentage of the difference between initial
    water withdrawal estimates and published USGS values.

    This method is based on the Water Satellite Table created by Yang and Ingwersen, 2017
    :param df_to_scale: df, fba dataframe to be modified
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: scaled fba results
    """

    # determine national level published withdrawal data for usgs mining in FBS method year
    pv_load = load_fba_w_standardized_units(datasource="USGS_NWIS_WU",
                                            year=str(
                                                attr['helper_source_year']),
                                            flowclass='Water')

    pv_sub = pv_load[(pv_load['Location'] == str(US_FIPS)) & (
        pv_load['ActivityConsumedBy'] == 'Mining')].reset_index(drop=True)
    # usgs unit is Mgal, blackhurst unit is gal
    pv = pv_sub['FlowAmount'].loc[0] * 1000000

    # sum quantity of water withdrawals already allocated to sectors
    av = df_to_scale['FlowAmount'].sum()

    # calculate the difference between published value and allocated value
    vd = pv - av

    # subset df to scale into oil and non-oil sectors
    df_to_scale['sector_label'] = np.where(
        df_to_scale['SectorConsumedBy'].apply(lambda x: x[0:5] == '21111'),
        'oil', 'nonoil')
    df_to_scale['ratio'] = np.where(df_to_scale['sector_label'] == 'oil',
                                    2 / 3, 1 / 3)
    df_to_scale['label_sum'] = df_to_scale.groupby(
        ['Location', 'sector_label'])['FlowAmount'].transform('sum')
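    # label_sum is the flow already allocated within each oil/non-oil group
    # per location; it is the denominator for the proportional scaling below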
    df_to_scale.loc[:, 'value_difference'] = vd.astype(float)

    # calculate revised water withdrawal allocation
    df_scaled = df_to_scale.copy()
    df_scaled.loc[:, 'FlowAmount'] = df_scaled['FlowAmount'] + \
                                     (df_scaled['FlowAmount'] / df_scaled['label_sum']) * \
                                     (df_scaled['ratio'] * df_scaled['value_difference'])
    df_scaled = df_scaled.drop(
        columns=['sector_label', 'ratio', 'label_sum', 'value_difference'])

    return df_scaled
Example #6
def convert_blackhurst_data_to_kg_per_year(df, **kwargs):
    """
    Load BEA Make After Redefinition data to convert Blackhurst IO
    dataframe units to kg per year
    :param df: df, FBA format
    :param kwargs: kwargs includes "attr" - dictionary, attribute
    data from method yaml for activity set
    :return: transformed fba df
    """

    # load the bea make table
    bmt = load_fba_w_standardized_units(
        datasource='BEA_Make_AR',
        year=kwargs['attr']['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])
    # drop rows with flowamount = 0
    bmt = bmt[bmt['FlowAmount'] != 0]

    # check on units of dfs before merge
    compare_df_units(df, bmt)
    bh_df_revised = pd.merge(
        df,
        bmt[['FlowAmount', 'ActivityProducedBy', 'Location']],
        left_on=['ActivityConsumedBy', 'Location'],
        right_on=['ActivityProducedBy', 'Location'])

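    # multiply the Blackhurst intensity (FlowAmount_x) by the BEA make-table
    # dollars (FlowAmount_y) to get an annual flow amount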
    bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) *
                                          (bh_df_revised['FlowAmount_y']))
    bh_df_revised.loc[:, 'Unit'] = 'kg'
    # drop columns
    bh_df_revised = bh_df_revised.drop(
        columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y'])
    bh_df_revised = bh_df_revised.rename(
        columns={"ActivityProducedBy_x": "ActivityProducedBy"})

    return bh_df_revised
Example #7
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
                          df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3])
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year,
                                          flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # for loop through naics lengths to determine naics 4 and 5 digits to disaggregate
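    # each pass splits county-level sectors of length i into their i+1 digit
    # children using state-level ratios, matched on the 2-digit state FIPS
    # stored in Location_tmp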
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if a Location/Sector_tmp pair appears more than once, drop all its
        # rows (keep=False) to avoid double counting
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(lambda x:
                                                   len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 sector_column, 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x:
                                                           x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
Example #8
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links
    to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
                          modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland data
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) ==
                         '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year, flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) ==
                        '112']
        # drop rows with '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
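        # allocate_by_sector returns a FlowAmountRatio giving each 6-digit
        # '112' sector's share of state-level land in farms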
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=[
            'Location_tmp', sector_column +
            '_x', 'Location_y', 'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
Example #9
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian and US GDP data to convert 3-digit NAICS Canadian water use
    to US water use
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df, FBA format, flowamounts converted
    """

    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(datasource='StatCan_GDP',
                                        year=attr['allocation_source_year'],
                                        flowclass='Money')

    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df,
                    gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD
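    # dividing CanDollar by the exchange rate expresses Canadian GDP in USD,
    # so the resulting intensity is water use (Mgal) per USD of output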
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] /
                                                      exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP (gross output) data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money')

    # load bea crosswalk
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)
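    # keep only crosswalk rows that map to 3-digit NAICS, matching the
    # resolution of the Canadian water use data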

    # merge
    us_gdp = pd.merge(us_gdp_load,
                      cw,
                      how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use
    df_m2 = pd.merge(df_m,
                     us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left',
                     left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
Example #10
def convert_blackhurst_data_to_kg_per_employee(df_wsec, attr, method,
                                               **kwargs):
    """
    Load BLS employment data and use it to transform the original units to
    kg per employee
    :param df_wsec: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param kwargs: dictionary, includes 'download_FBA_if_missing' - bool
    :return: df, transformed fba dataframe with sector columns
    """

    # load 2002 employment data
    bls = load_fba_w_standardized_units(
        datasource='BLS_QCEW',
        year='2002',
        flowclass='Employment',
        geographic_level='national',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])

    # clean df
    bls = clean_bls_qcew_fba(bls, attr=attr)

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={
        'SectorProducedBy': 'Sector',
        'FlowAmount': 'HelperFlow'
    })

    # check units before merge
    compare_df_units(df_wsec, bls_wsec)
    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Location', 'Sector', 'HelperFlow']],
                  how='left',
                  left_on=['Location', 'SectorConsumedBy'],
                  right_on=['Location', 'Sector'])
    # drop any rows where sector is None
    df = df[~df['Sector'].isnull()]
    # fill helperflow values with 0
    df['HelperFlow'] = df['HelperFlow'].fillna(0)

    # calculate proportional ratios
    df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector')

    df_wratio = df_wratio.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'HelperFlow': 'Employees'
    })

    # drop rows where helperflow = 0
    df_wratio = df_wratio[df_wratio['Employees'] != 0]

    # calculate kg/employee in 2002
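    # the national flow is shared out by each sector's employee ratio, then
    # divided by that sector's employee count to yield a per-employee amount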
    df_wratio.loc[:, 'FlowAmount'] = \
        (df_wratio['FlowAmount'] * df_wratio['EmployeeRatio']) / \
        df_wratio['Employees']
    df_wratio.loc[:, 'Unit'] = 'kg/p'

    # drop cols
    df_wratio = df_wratio.drop(
        columns=['Sector', 'Employees', 'EmployeeRatio'])

    return df_wratio