Example #1
def convert_blackhurst_data_to_kg_per_year(df, **kwargs):
    """
    Load BEA Make After Redefinition data to convert Blackhurst IO
    dataframe units to gallon per year
    :param df: df, FBA format
    :param kwargs: kwargs includes "attr" - dictionary, attribute
    data from method yaml for activity set
    :return: transformed fba df
    """

    # load the bea make table
    bmt = load_fba_w_standardized_units(
        datasource='BEA_Make_AR',
        year=kwargs['attr']['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])
    # drop rows with flowamount = 0
    bmt = bmt[bmt['FlowAmount'] != 0]

    # check on units of dfs before merge
    compare_df_units(df, bmt)
    bh_df_revised = pd.merge(
        df,
        bmt[['FlowAmount', 'ActivityProducedBy', 'Location']],
        left_on=['ActivityConsumedBy', 'Location'],
        right_on=['ActivityProducedBy', 'Location'])

    bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) *
                                          (bh_df_revised['FlowAmount_y']))
    bh_df_revised.loc[:, 'Unit'] = 'kg'
    # drop columns
    bh_df_revised = bh_df_revised.drop(
        columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y'])
    bh_df_revised = bh_df_revised.rename(
        columns={"ActivityProducedBy_x": "ActivityProducedBy"})

    return bh_df_revised
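The heart of this example is a merge-and-multiply conversion: intensity values (kg/USD) are matched to dollar outputs by activity and location, then multiplied. A minimal, self-contained sketch of that pattern; the frames and all numbers below are made up, not the real Blackhurst or BEA data:

import pandas as pd

# toy stand-ins for the Blackhurst FBA and the BEA Make table
bh = pd.DataFrame({
    'ActivityConsumedBy': ['1111A0', '1111B0'],
    'Location': ['00000', '00000'],
    'FlowAmount': [2.5, 4.0],      # hypothetical kg/USD intensities
    'Unit': ['kg/USD', 'kg/USD']})
make = pd.DataFrame({
    'ActivityProducedBy': ['1111A0', '1111B0'],
    'Location': ['00000', '00000'],
    'FlowAmount': [1e6, 2e6]})     # hypothetical annual output, USD

merged = bh.merge(make,
                  left_on=['ActivityConsumedBy', 'Location'],
                  right_on=['ActivityProducedBy', 'Location'],
                  suffixes=('_x', '_y'))
# kg/USD * USD/year = kg/year
merged['FlowAmount'] = merged['FlowAmount_x'] * merged['FlowAmount_y']
merged['Unit'] = 'kg'
result = merged.drop(columns=['FlowAmount_x', 'FlowAmount_y',
                              'ActivityProducedBy'])
print(result)  # 2.5e6 and 8.0e6 kg per year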
Example #2
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
                          df modifications (SectorProducedBy or SectorConsumedBy)
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df, CoA cropland data disaggregated
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3])
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year,
                                          flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # loop through NAICS lengths to determine the NAICS 4 and 5 digit sectors to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(lambda x:
                                                   len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 sector_column, 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x:
                                                           x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
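The disaggregation loop above hinges on child-to-parent sector ratios computed at the state level and applied to county rows via a truncated state FIPS key. A toy sketch of that ratio pattern, with hypothetical NAICS codes and amounts:

import pandas as pd

# hypothetical state level flows for two NAICS5 children of '1111'
state = pd.DataFrame({'Sector': ['11111', '11112'],
                      'Location': ['17000', '17000'],
                      'FlowAmount': [300.0, 100.0]})
state['Sector_tmp'] = state['Sector'].str[:4]
# each child's share of its parent total (0.75 and 0.25 here)
state['FlowAmountRatio'] = state['FlowAmount'] / \
    state.groupby(['Location', 'Sector_tmp'])['FlowAmount'].transform('sum')
state['Location_tmp'] = state['Location'].str[:2]

# hypothetical county row carrying only the 4 digit parent
county = pd.DataFrame({'Sector': ['1111'], 'Location': ['17031'],
                       'FlowAmount': [40.0]})
county['Location_tmp'] = county['Location'].str[:2]

out = county.merge(
    state[['Sector', 'Sector_tmp', 'Location_tmp', 'FlowAmountRatio']],
    left_on=['Sector', 'Location_tmp'],
    right_on=['Sector_tmp', 'Location_tmp'], how='left')
out['FlowAmount'] = out['FlowAmount'] * out['FlowAmountRatio']
print(out[['Sector_y', 'Location', 'FlowAmount']])  # 30.0 and 10.0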
Example #3
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links
    to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
                          modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland data
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) ==
                         '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year, flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) ==
                        '112']
        # drop rows with "&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=[
            'Location_tmp', sector_column +
            '_x', 'Location_y', 'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
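The pastureland split follows the same state-to-county pattern: county rows get a state FIPS key, merge against state level 6 digit ratios, and the merge suffixes are cleaned up afterwards. A minimal sketch with hypothetical counties, sectors, and ratios:

import pandas as pd

county = pd.DataFrame({'SectorConsumedBy': ['112', '112'],
                       'Location': ['17031', '17043'],
                       'FlowAmount': [50.0, 20.0]})
# temp state FIPS key for the merge
county['Location_tmp'] = county['Location'].str[:2]

# hypothetical state level ratios for two 6 digit pastureland NAICS
state_ratios = pd.DataFrame({'SectorConsumedBy': ['112111', '112120'],
                             'Location': ['17', '17'],
                             'FlowAmountRatio': [0.6, 0.4]})

df = county.merge(state_ratios, how='left',
                  left_on='Location_tmp', right_on='Location')
df['FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
# drop the merge leftovers and restore the original column names
df = df.drop(columns=['Location_tmp', 'SectorConsumedBy_x', 'Location_y',
                      'FlowAmountRatio'])
df = df.rename(columns={'SectorConsumedBy_y': 'SectorConsumedBy',
                        'Location_x': 'Location'})
print(df)  # each county row split across the two 6 digit NAICS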
Example #4
def dataset_allocation_method(flow_subset_mapped, attr, names, method, k, v,
                              aset, aset_names, download_FBA_if_missing):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: df, FBA subset mapped using the federal
        elementary flow list
    :param attr: dictionary, attribute data from method yaml for activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param aset_names: list, activity set names
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons
    :return: df, FBA flows allocated to sectors
    """

    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'allocation_flow' in attr:
        fba_dict['flowname_subset'] = attr['allocation_flow']
    if 'allocation_compartment' in attr:
        fba_dict['compartment_subset'] = attr['allocation_compartment']
    if 'clean_allocation_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_allocation_fba']
    if 'clean_allocation_fba_w_sec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_allocation_fba_w_sec']

    # load the allocation FBA
    fba_allocation_wsec = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['allocation_source'],
                           df_year=attr['allocation_source_year'],
                           flowclass=attr['allocation_source_class'],
                           geoscale_from=attr['allocation_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # subset fba datasets to only keep the sectors associated
    # with activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = \
        get_fba_allocation_subset(fba_allocation_wsec, k, names,
                                  flowSubsetMapped=flow_subset_mapped,
                                  allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = \
            allocation_helper(fba_allocation_subset, attr, method, v,
                              download_FBA_if_missing=download_FBA_if_missing)

    # create flow allocation ratios for each activity
    flow_alloc_list = []
    if 'Context' in fba_allocation_subset.columns:
        group_cols = fba_mapped_wsec_default_grouping_fields
    else:
        group_cols = fba_wsec_default_grouping_fields
    group_cols = [
        e for e in group_cols
        if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
    ]
    n_allocated = []
    for n in names:
        log.debug("Creating allocation ratios for %s", n)
        # if n has already been called, drop all rows of data
        # containing n to avoid double counting when there are two
        # activities in each ACB and APB columns
        fba_allocation_subset = fba_allocation_subset[~(
            (fba_allocation_subset[fba_activity_fields[0]].isin(n_allocated)) |
            (fba_allocation_subset[fba_activity_fields[1]].isin(n_allocated))
        )].reset_index(drop=True)
        fba_allocation_subset_2 = \
            get_fba_allocation_subset(fba_allocation_subset, k, [n],
                                      flowSubsetMapped=flow_subset_mapped,
                                      allocMethod=attr['allocation_method'],
                                      activity_set_names=aset_names)
        if len(fba_allocation_subset_2) == 0:
            log.info("No data found to allocate %s", n)
        else:
            flow_alloc = \
                allocate_by_sector(fba_allocation_subset_2, attr,
                                   attr['allocation_method'], group_cols,
                                   flowSubsetMapped=flow_subset_mapped)
            flow_alloc = flow_alloc.assign(FBA_Activity=n)
            n_allocated.append(n)
            flow_alloc_list.append(flow_alloc)
    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method, attr)

    # create list of sectors in the flow allocation df,
    # drop any rows of data in the flow df that aren't in list
    sector_list = flow_allocation['Sector'].unique().tolist()

    # subset fba allocation table to the values in the activity
    # list, based on overlapping sectors
    flow_subset_mapped = flow_subset_mapped.loc[
        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation "
             "dataframes use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    for i, j in activity_fields.items():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        # create list of columns to merge on
        if 'allocation_merge_columns' in attr:
            fa_cols = \
                ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
            l_cols = \
                ['Location', j[1]["flowbysector"], j[0]["flowbyactivity"]] + \
                attr['allocation_merge_columns']
            r_cols = ['Location', 'Sector', 'FBA_Activity'] + \
                     attr['allocation_merge_columns']
        else:
            fa_cols = ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']
            l_cols = ['Location', j[1]["flowbysector"], j[0]["flowbyactivity"]]
            r_cols = ['Location', 'Sector', 'FBA_Activity']
        flow_subset_mapped = \
            flow_subset_mapped.merge(flow_allocation[fa_cols], left_on=l_cols,
                                     right_on=r_cols, how='left')

    # merge the flowamount columns
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] =\
        flow_subset_mapped['FlowAmountRatio_x'].fillna(
            flow_subset_mapped['FlowAmountRatio_y'])
    # fill null rows with 0 because no allocation info
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)

    # drop rows where there is no allocation data
    fbs = flow_subset_mapped.dropna(subset=['Sector_x', 'Sector_y'],
                                    how='all').reset_index()

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop columns
    log.info("Cleaning up new flow by sector")
    fbs = fbs.drop(columns=[
        'Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
        'FlowAmountRatio', 'FBA_Activity_x', 'FBA_Activity_y'
    ])
    return fbs
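The two-pass merge in the activity loop above attaches allocation ratios once by SectorProducedBy and once by SectorConsumedBy, then coalesces the resulting FlowAmountRatio_x/FlowAmountRatio_y columns. A stripped-down sketch of that coalescing pattern, with toy data:

import pandas as pd

# hypothetical mapped FBA: one row keyed by a produced sector,
# one by a consumed sector
fba = pd.DataFrame({'SectorProducedBy': ['1111', None],
                    'SectorConsumedBy': [None, '2211'],
                    'Location': ['00000', '00000'],
                    'FlowAmount': [10.0, 20.0]})
alloc = pd.DataFrame({'Sector': ['1111', '2211'],
                      'Location': ['00000', '00000'],
                      'FlowAmountRatio': [0.25, 0.5]})

# merge once per activity field; the second merge creates _x/_y suffixes
for col in ['SectorProducedBy', 'SectorConsumedBy']:
    fba = fba.merge(alloc, how='left', left_on=[col, 'Location'],
                    right_on=['Sector', 'Location'])

# coalesce the two ratio columns, fill unallocated rows with 0
fba['FlowAmountRatio'] = \
    fba['FlowAmountRatio_x'].fillna(fba['FlowAmountRatio_y']).fillna(0)
fba['FlowAmount'] = fba['FlowAmount'] * fba['FlowAmountRatio']
print(fba[['Location', 'FlowAmount']])  # 2.5 and 10.0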
Example #5
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Allocate activity flows using a secondary (helper) data source
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector column is not all None, that is the column to merge on
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')
        # without a single sector column the merges below would fail
        raise ValueError('No unambiguous sector column to merge on')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:

        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')
        # load bea codes that sub for naics
        bea = return_bea_codes_used_as_naics()
        # replace sector column and helperflow value if the sector column to
        # merge is in the bea list to prevent dropped data
        modified_fba_allocation['Sector'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation[sector_col_to_merge],
                     modified_fba_allocation['Sector'])
        modified_fba_allocation['HelperFlow'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation['FlowAmount'],
                     modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0,
        # so after multiplying, don't have incorrect value associated with
        # new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                           modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation
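For the 'multiplication' branch, missing or zero state level helper flows fall back to national values before multiplying. A minimal sketch of that fallback, with made-up sectors and amounts:

import numpy as np
import pandas as pd

df = pd.DataFrame({'Sector': ['1111', '1111', '2211'],
                   'Location': ['17000', '06000', '17000'],
                   'FlowAmount': [10.0, 5.0, 8.0],
                   'HelperFlow': [2.0, np.nan, 0.0]})
# hypothetical national (US_FIPS) replacement values per sector
national = pd.DataFrame({'Sector': ['1111', '2211'],
                         'ReplacementValue': [3.0, 4.0]})

df = df.merge(national, how='left')
# NaN helper flows fall back to the national value...
df['HelperFlow'] = df['HelperFlow'].fillna(df['ReplacementValue'])
# ...and so do zeros
df['HelperFlow'] = np.where(df['HelperFlow'] == 0,
                            df['ReplacementValue'], df['HelperFlow'])
df['FlowAmount'] = df['FlowAmount'] * df['HelperFlow']
print(df[['Sector', 'Location', 'FlowAmount']])  # 20.0, 15.0, 32.0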
Example #6
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3 digit canadian water use to us water
    use:
    - canadian gdp
    - us gdp
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df, FBA format, flowamounts converted
    """

    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(datasource='StatCan_GDP',
                                        year=attr['allocation_source_year'],
                                        flowclass='Money')

    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df,
                    gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] /
                                                      exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money')

    # load bea crosswalk
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load,
                      cw,
                      how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use
    df_m2 = pd.merge(df_m,
                     us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left',
                     left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
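Numerically, the conversion is two steps: divide Canadian water use by Canadian GDP converted to USD to get an intensity (Mgal/USD), then multiply by US GDP. A toy calculation with invented figures, assuming the exchange rate is expressed as CAD per USD:

# hypothetical values for one 3 digit NAICS
can_water_mgal = 120.0   # Canadian water use, Mgal
can_gdp_cad = 400.0      # Canadian GDP, CAD
exchange_rate = 1.3      # assumed CAD per USD
us_gdp_usd = 900.0       # US gross output, USD

# step 1: water intensity in Mgal per USD of Canadian output
intensity = can_water_mgal / (can_gdp_cad / exchange_rate)
# step 2: scale the intensity by US output
us_water_mgal = intensity * us_gdp_usd
print(round(us_water_mgal, 1))  # 351.0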
Example #7
def convert_blackhurst_data_to_kg_per_employee(df_wsec, attr, method,
                                               **kwargs):
    """
    Load BLS employment data and use to transform original units to
    gallons per employee
    :param df_wsec: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :return: df, transformed fba dataframe with sector columns
    """

    # load 2002 employment data
    bls = load_fba_w_standardized_units(
        datasource='BLS_QCEW',
        year='2002',
        flowclass='Employment',
        geographic_level='national',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])

    # clean df
    bls = clean_bls_qcew_fba(bls, attr=attr)

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={
        'SectorProducedBy': 'Sector',
        'FlowAmount': 'HelperFlow'
    })

    # check units before merge
    compare_df_units(df_wsec, bls_wsec)
    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Location', 'Sector', 'HelperFlow']],
                  how='left',
                  left_on=['Location', 'SectorConsumedBy'],
                  right_on=['Location', 'Sector'])
    # drop any rows where sector is None
    df = df[~df['Sector'].isnull()]
    # fill helperflow values with 0
    df['HelperFlow'] = df['HelperFlow'].fillna(0)

    # calculate proportional ratios
    df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector')

    df_wratio = df_wratio.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'HelperFlow': 'Employees'
    })

    # drop rows where helperflow = 0
    df_wratio = df_wratio[df_wratio['Employees'] != 0]

    # calculate gal/employee in 2002
    df_wratio.loc[:, 'FlowAmount'] = \
        (df_wratio['FlowAmount'] * df_wratio['EmployeeRatio']) / \
        df_wratio['Employees']
    df_wratio.loc[:, 'Unit'] = 'kg/p'

    # drop cols
    df_wratio = df_wratio.drop(
        columns=['Sector', 'Employees', 'EmployeeRatio'])

    return df_wratio
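The per-employee conversion splits each flow across sectors by employee share, then divides by the employee count. A minimal sketch with hypothetical numbers:

import pandas as pd

# one source flow of 100, split across two sectors at one location
df = pd.DataFrame({'Location': ['00000', '00000'],
                   'Sector': ['2122', '2123'],
                   'FlowAmount': [100.0, 100.0],
                   'Employees': [60.0, 40.0]})
# each sector's share of employment at the location (0.6 and 0.4)
df['EmployeeRatio'] = df['Employees'] / \
    df.groupby('Location')['Employees'].transform('sum')
# allocated flow divided by headcount -> per-employee intensity
df['FlowAmount'] = (df['FlowAmount'] * df['EmployeeRatio']) / df['Employees']
df['Unit'] = 'kg/p'
print(df[['Sector', 'FlowAmount']])  # 1.0 kg per employee in each sector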
Example #8
def allocate_usda_ers_mlu_land_in_rural_transportation_areas(
        df, attr, fbs_list):
    """
    This function is used to allocate the USDA_ERS_MLU activity
    'land in urban areas' to NAICS 2012 sectors. Allocation
    is dependent on assumptions defined in 'literature_values.py'.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal Land Footprints:
    An Exploration Across the Urban–Rural Continuum in the United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024

    :param df: df, USDA ERA MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior
                     to the activity set that calls on this fxn
    :return: df, allocated USDS ERS MLU Land, FBS format
    """

    # define sector column to base calculations
    sector_col = 'SectorConsumedBy'

    # load the federal highway administration fees dictionary
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict,
        orient='index').rename(columns={'NAICS_2012_Code': sector_col})

    # make an assumption about the percent of rural transport area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df[df[sector_col] == '488119']
    df_airport = df_airport.assign(FlowAmount=df_airport['FlowAmount'] *
                                   airport_multiplier)

    # make an assumption about the percent of rural transport area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df[df[sector_col] == '482112']
    df_railroad = df_railroad.assign(FlowAmount=df_railroad['FlowAmount'] *
                                     railroad_multiplier)

    # further allocate the remaining rural transportation area
    # using Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = air_rail_area.groupby(['Location', 'Unit'], as_index=False)\
        .agg({'FlowAmount': 'sum'}).rename(columns={'FlowAmount': 'AirRail'})

    # compare units
    compare_df_units(df, air_rail_area)
    df_highway = df.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(FlowAmount=df_highway['FlowAmount'] -
                                   df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(FlowAmount=df_highway2['FlowAmount'] *
                                     df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat airport, railroad, highway
    allocated_rural_trans = pd.concat([df_airport, df_railroad, df_highway2],
                                      sort=False,
                                      ignore_index=True)

    return allocated_rural_trans
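The allocation order matters here: airport and railroad areas are carved out first, and only the remainder is spread across highway-related sectors by FHA fee share. A toy sketch of that remainder-then-shares step; the sector codes and share values are invented, though 'ShareOfFees' mirrors the column name used above:

import pandas as pd

total = pd.DataFrame({'Location': ['00000'], 'FlowAmount': [100.0]})
air_rail = pd.DataFrame({'Location': ['00000'], 'AirRail': [15.0]})
# hypothetical FHA fee shares across two transport sectors
fha_shares = pd.DataFrame({'SectorConsumedBy': ['484', '485'],
                           'ShareOfFees': [0.7, 0.3]})

# subtract the airport/railroad carve-out from the total
highway = total.merge(air_rail, how='left')
highway['FlowAmount'] = highway['FlowAmount'] - highway['AirRail']
highway = highway.drop(columns='AirRail')
# spread the remaining 85.0 across sectors by fee share
highway = highway.merge(fha_shares, how='cross')
highway['FlowAmount'] = highway['FlowAmount'] * highway['ShareOfFees']
print(highway[['SectorConsumedBy', 'FlowAmount']])  # 59.5 and 25.5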
Example #9
def allocate_usda_ers_mlu_land_in_urban_areas(df, attr, fbs_list):
    """
    This function is used to allocate the USDA_ERS_MLU activity 'land in urban areas'
    to NAICS 2012 sectors. Allocation is dependent on assumptions defined in
    'literature_values.py' as well as results from allocating
    'EIA_CBECS_Land' and 'EIA_MECS_Land' to land based sectors.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal Land Footprints:
    An Exploration Across the Urban–Rural Continuum in the United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024

    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior
                     to the activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    # define sector column to base calculations
    sector_col = 'SectorConsumedBy'

    vLogDetailed.info('Assuming total land use from MECS and CBECS included '
                      'in urban land area, so subtracting out calculated '
                      'MECS and CBECS land from MLU urban land area')
    # read in the cbecs and mecs df from df_list
    for df_i in fbs_list:
        if (df_i['MetaSources'] == 'EIA_CBECS_Land').all():
            cbecs = df_i
        elif (df_i['MetaSources'] == 'EIA_MECS_Land').all():
            mecs = df_i

    # load the federal highway administration fees dictionary
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict,
        orient='index').rename(columns={'NAICS_2012_Code': sector_col})

    # calculate total residential area from the American Housing Survey
    residential_land_area = get_area_of_urban_land_occupied_by_houses_2013()
    df_residential = df[df[sector_col] == 'F01000']
    df_residential = df_residential.assign(FlowAmount=residential_land_area)

    # make an assumption about the percent of urban area that is open space
    openspace_multiplier = get_open_space_fraction_of_urban_area()
    df_openspace = df[df[sector_col] == '712190']
    df_openspace = df_openspace.assign(FlowAmount=df_openspace['FlowAmount'] *
                                       openspace_multiplier)

    # sum all uses of urban area that are NOT transportation
    # first concat dfs for residential, openspace, commercial, and manufacturing land use
    df_non_urban_transport_area = pd.concat(
        [df_residential, df_openspace, cbecs, mecs],
        sort=False,
        ignore_index=True)
    df_non_urban_transport_area = df_non_urban_transport_area[[
        'Location', 'Unit', 'FlowAmount'
    ]]
    non_urban_transport_area_sum =\
        df_non_urban_transport_area.groupby(
            ['Location', 'Unit'], as_index=False).agg(
            {'FlowAmount': 'sum'}).rename(columns={'FlowAmount': 'NonTransport'})
    # compare units
    compare_df_units(df, df_non_urban_transport_area)
    # calculate total urban transportation by subtracting calculated areas from total urban land
    df_transport = df.merge(non_urban_transport_area_sum, how='left')
    df_transport = df_transport.assign(FlowAmount=df_transport['FlowAmount'] -
                                       df_transport['NonTransport'])
    df_transport.drop(columns=['NonTransport'], inplace=True)

    # make an assumption about the percent of urban transport area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df_transport[df_transport[sector_col] == '488119']
    df_airport = df_airport.assign(FlowAmount=df_airport['FlowAmount'] *
                                   airport_multiplier)

    # make an assumption about the percent of urban transport area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df_transport[df_transport[sector_col] == '482112']
    df_railroad = df_railroad.assign(FlowAmount=df_railroad['FlowAmount'] *
                                     railroad_multiplier)

    # further allocate the remaining urban transportation area using
    # Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = air_rail_area.groupby(['Location', 'Unit'], as_index=False)\
        .agg({'FlowAmount': 'sum'}).rename(columns={'FlowAmount': 'AirRail'})

    df_highway = df_transport.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(FlowAmount=df_highway['FlowAmount'] -
                                   df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(FlowAmount=df_highway2['FlowAmount'] *
                                     df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat all df subsets
    allocated_urban_areas_df = pd.concat(
        [df_residential, df_openspace, df_airport, df_railroad, df_highway2],
        ignore_index=True,
        sort=False).reset_index(drop=True)

    return allocated_urban_areas_df
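The urban case adds one step before the transport split: component land uses (residential, open space, CBECS, MECS) are summed by location and subtracted from total urban land to leave the transportation remainder. A minimal sketch of that subtraction, with hypothetical areas:

import pandas as pd

# hypothetical non-transport component land uses at one location
components = pd.DataFrame({'Location': ['00000'] * 3,
                           'Unit': ['km2'] * 3,
                           'FlowAmount': [30.0, 10.0, 5.0]})
total_urban = pd.DataFrame({'Location': ['00000'], 'Unit': ['km2'],
                            'FlowAmount': [100.0]})

# sum the components by location/unit, then subtract from the total
non_transport = components.groupby(['Location', 'Unit'], as_index=False) \
    .agg(NonTransport=('FlowAmount', 'sum'))
transport = total_urban.merge(non_transport, how='left')
transport['FlowAmount'] = transport['FlowAmount'] - transport['NonTransport']
print(transport.drop(columns='NonTransport'))  # 55.0 km2 of transport land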
Example #10
def check_golf_and_crop_irrigation_totals(df_load):
    """
    Check that golf + crop values equal published irrigation totals.
    If not, assign water to crop irrigation.
    :param df_load: df, USGS water use
    :return: df, FBA with reassigned irrigation water to crop and golf
    """

    # drop national data
    df = df_load[df_load['Location'] != '00000']

    # subset into golf, crop, and total irrigation (and non irrigation)
    df_i = df[(df[fba_activity_fields[0]] == 'Irrigation') |
              (df[fba_activity_fields[1]] == 'Irrigation')]
    df_g = df[(df[fba_activity_fields[0]] == 'Irrigation Golf Courses') |
              (df[fba_activity_fields[1]] == 'Irrigation Golf Courses')]
    df_c = df[(df[fba_activity_fields[0]] == 'Irrigation Crop') |
              (df[fba_activity_fields[1]] == 'Irrigation Crop')]

    # unit check
    compare_df_units(df_i, df_g)
    # merge the golf and total irrigation into crop df and
    # modify crop FlowAmounts if necessary
    df_m = pd.merge(df_i,
                    df_g[[
                        'FlowName', 'FlowAmount', 'ActivityProducedBy',
                        'ActivityConsumedBy', 'Compartment', 'Location', 'Year'
                    ]],
                    how='outer',
                    right_on=['FlowName', 'Compartment', 'Location', 'Year'],
                    left_on=['FlowName', 'Compartment', 'Location', 'Year'])
    df_m = df_m.rename(
        columns={
            "FlowAmount_x": "FlowAmount",
            "ActivityProducedBy_x": "ActivityProducedBy",
            "ActivityConsumedBy_x": "ActivityConsumedBy",
            "FlowAmount_y": "Golf_Amount",
            "ActivityProducedBy_y": "Golf_APB",
            "ActivityConsumedBy_y": "Golf_ACB",
        })
    compare_df_units(df_m, df_c)
    df_m2 = pd.merge(df_m,
                     df_c[[
                         'FlowName', 'FlowAmount', 'ActivityProducedBy',
                         'ActivityConsumedBy', 'Compartment', 'Location',
                         'Year'
                     ]],
                     how='outer',
                     right_on=['FlowName', 'Compartment', 'Location', 'Year'],
                     left_on=['FlowName', 'Compartment', 'Location', 'Year'])
    df_m2 = df_m2.rename(
        columns={
            "FlowAmount_x": "FlowAmount",
            "ActivityProducedBy_x": "ActivityProducedBy",
            "ActivityConsumedBy_x": "ActivityConsumedBy",
            "FlowAmount_y": "Crop_Amount",
            "ActivityProducedBy_y": "Crop_APB",
            "ActivityConsumedBy_y": "Crop_ACB"
        })
    # fill na and sum crop and golf
    # df_m2 = df_m2.fillna(0)
    df_m2['subset_sum'] = df_m2['Crop_Amount'] + df_m2['Golf_Amount']
    df_m2['Diff'] = df_m2['FlowAmount'] - df_m2['subset_sum']

    df_m3 = df_m2[df_m2['Diff'] >= 0.000001].reset_index(drop=True)

    # rename Irrigation to Irrigation Crop and append rows to df
    df_m3.loc[df_m3['ActivityProducedBy'] == 'Irrigation',
              'ActivityProducedBy'] = 'Irrigation Crop'
    df_m3.loc[df_m3['ActivityConsumedBy'] == 'Irrigation',
              'ActivityConsumedBy'] = 'Irrigation Crop'
    df_m3 = df_m3.drop(columns=[
        'Golf_Amount', 'Golf_APB', 'Golf_ACB', 'Crop_Amount', 'Crop_APB',
        'Crop_ACB', 'subset_sum', 'Diff'
    ])

    if len(df_m3) != 0:
        # df.append() was removed in pandas 2.0; pd.concat is equivalent
        df_w_missing_crop = pd.concat([df_load, df_m3], sort=True,
                                      ignore_index=True)
        return df_w_missing_crop
    else:
        return df_load
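The check itself is an outer merge of golf and crop amounts onto the published irrigation total, followed by relabeling rows where the total exceeds golf + crop. A toy sketch mirroring that relabel-and-append step; all values are invented:

import pandas as pd

total = pd.DataFrame({'Location': ['17000'],
                      'ActivityConsumedBy': ['Irrigation'],
                      'FlowAmount': [10.0]})
golf = pd.DataFrame({'Location': ['17000'], 'Golf_Amount': [2.0]})
crop = pd.DataFrame({'Location': ['17000'], 'Crop_Amount': [7.0]})

# bring golf and crop amounts alongside the published total
m = total.merge(golf, how='outer').merge(crop, how='outer')
m['Diff'] = m['FlowAmount'] - (m['Golf_Amount'] + m['Crop_Amount'])
# rows where the total exceeds the subset sum get relabeled and appended
extra = m[m['Diff'] >= 0.000001].copy()
extra.loc[extra['ActivityConsumedBy'] == 'Irrigation',
          'ActivityConsumedBy'] = 'Irrigation Crop'
extra = extra.drop(columns=['Golf_Amount', 'Crop_Amount', 'Diff'])
result = pd.concat([total, extra], ignore_index=True)
print(result)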
Example #11
def calculate_net_public_supply(df_load):
    """
    USGS Provides info on the quantity of public supply withdrawals that
    are delivered to domestic use. The USGS PS withdrawals are not necessarily
    greater than/equal to the Domestic deliveries because water can be
    withdrawn in one county and delivered in another (water can also cross
    state lines). Therefore, can/do end up with NEGATIVE net public supply
    values and PS water should only be used at a national level

    Domestic deliveries are subtracted from public supply. An assumption is
    made that PS deliveries to domestic is fresh water. The national level
    data can then be allocated to end users using the BEA Use tables.
    :param df_load: USGS df
    :return: df with net public supply values
    """

    # subset into 2 dfs, one that contains PS data and one that does not
    df1 = df_load[(df_load[fba_activity_fields[0]] == 'Public Supply') |
                  (df_load[fba_activity_fields[1]] == 'Public Supply')]
    df2 = df_load[(df_load[fba_activity_fields[0]] != 'Public Supply')
                  & (df_load[fba_activity_fields[1]] != 'Public Supply')]

    # drop all deliveries to thermo and industrial
    # (not enough states report the data to make usable)
    df1_sub = df1[~df1[fba_activity_fields[1]].isin([
        'Industrial', 'Thermoelectric Power',
        'Thermoelectric Power Closed-loop cooling',
        'Thermoelectric Power Once-through cooling'
    ])]
    # drop duplicate info of "Public Supply deliveries to"
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("Public Supply total deliveries")]
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("deliveries from public supply")]

    # calculate data drop
    vLogDetailed.info('Dropping rows that contain "deliveries from public '
                      'supply" to avoid double counting with rows of "Public '
                      'Supply deliveries to"')
    calculate_flowamount_diff_between_dfs(df1, df1_sub)

    # drop county level values because cannot use county data
    vLogDetailed.info('Dropping county level public supply withdrawals '
                      'because will end up with negative values due to '
                      'instances of water deliveries coming from surrounding '
                      'counties')
    df1_sub = df1_sub[df1_sub['Location'].apply(
        lambda x: x[2:6] == '000')].reset_index(drop=True)

    # df of ps delivered and ps withdrawn and us total
    df_d = df1_sub[df1_sub[fba_activity_fields[0]] == 'Public Supply']
    df_w = df1_sub[df1_sub[fba_activity_fields[1]] == 'Public Supply']
    df_us = df1_sub[df1_sub['Location'] == '00000']
    # split withdrawals further into fresh water (assumption: domestic
    # deliveries are freshwater); withdrawals are assumed to come from
    # ground and surface sources in proportion to each source's share
    df_w1 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] != 'total')]
    df_w2 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] == 'total')]
    # compare units
    compare_df_units(df_w1, df_w2)
    df_wm = pd.merge(df_w1,
                     df_w2[['FlowAmount', 'Location', 'Unit']],
                     how='left',
                     left_on=['Location', 'Unit'],
                     right_on=['Location', 'Unit'])
    df_wm = df_wm.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "FlowTotal"
    })
    # compare units
    compare_df_units(df_wm, df_d)
    # merge the deliveries to domestic
    df_w_modified = pd.merge(df_wm,
                             df_d[['FlowAmount', 'Location']],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_w_modified = df_w_modified.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "DomesticDeliveries"
    })

    # create flowratio for ground/surface
    df_w_modified.loc[:, 'FlowRatio'] = \
        df_w_modified['FlowAmount'] / df_w_modified['FlowTotal']
    # calculate new, net total public supply withdrawals
    # will end up with negative values due to instances of water
    # deliveries coming from surrounding counties
    df_w_modified.loc[:, 'FlowAmount'] = \
        df_w_modified['FlowAmount'] - (df_w_modified['FlowRatio'] *
                                       df_w_modified['DomesticDeliveries'])

    net_ps = df_w_modified.drop(columns=["FlowTotal", "DomesticDeliveries"])

    # compare units
    compare_df_units(df_d, net_ps)
    # because assuming domestic is all fresh, drop
    # flowname/flowable/Compartment/context
    # and instead use those column data from the net_ps df
    df_d_modified = df_d.drop(
        columns=['FlowName', 'Flowable', 'Compartment', 'Context', 'FlowUUID'])
    # Also allocate to ground/surface from state ratios
    df_d_modified = pd.merge(df_d_modified,
                             net_ps[[
                                 'FlowName', 'Flowable', 'Compartment',
                                 'Context', 'FlowUUID', 'Location', 'FlowRatio'
                             ]],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_d_modified.loc[:, 'FlowAmount'] = \
        df_d_modified['FlowAmount'] * df_d_modified['FlowRatio']
    df_d_modified = df_d_modified.drop(columns=["FlowRatio"])

    net_ps = net_ps.drop(columns=["FlowRatio"])

    # concat dfs back (non-public supply, public supply
    # deliveries, net ps withdrawals)
    modified_ps = pd.concat([df2, df_d_modified, net_ps, df_us],
                            ignore_index=True)

    return modified_ps
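The netting step allocates domestic deliveries across ground and surface withdrawals in proportion to each source's share of the total, then subtracts. A minimal sketch with hypothetical amounts:

import pandas as pd

# hypothetical fresh PS withdrawals by compartment for one state
withdrawals = pd.DataFrame({'Location': ['17000', '17000'],
                            'Compartment': ['ground', 'surface'],
                            'FlowAmount': [30.0, 70.0]})
# each compartment's share of total withdrawals (0.3 and 0.7)
withdrawals['FlowRatio'] = withdrawals['FlowAmount'] / \
    withdrawals.groupby('Location')['FlowAmount'].transform('sum')

deliveries = pd.DataFrame({'Location': ['17000'],
                           'DomesticDeliveries': [20.0]})

# subtract each compartment's share of domestic deliveries
net = withdrawals.merge(deliveries, how='left')
net['FlowAmount'] = net['FlowAmount'] - \
    net['FlowRatio'] * net['DomesticDeliveries']
print(net[['Compartment', 'FlowAmount']])  # 24.0 ground, 56.0 surface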