示例#1
0
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Rescale the flow amounts of a df with sectors to gallons per employee,
    using 2002 BLS QCEW employment data.

    :param df_wsec: df with sectors; the 'Sector' and 'FlowAmount' columns
        are read and the 'Unit' column is overwritten
    :param attr: dictionary, passed through to clean_bls_qcew_fba
    :param method: dictionary, 'target_sector_source' is read to assign
        sectors to the employment data
    :return: df, df_wsec left-merged with employment data, with
        FlowAmount = FlowAmount * EmployeeRatio / Employees and
        Unit = 'gal/employee'
    """

    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import (
        clean_df, fba_fill_na_dict, agg_by_geoscale,
        fba_default_grouping_fields, sector_ratios,
        proportional_allocation_by_location_and_sector, filter_by_geoscale)
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    # load the 2002 employment data, then clean it
    employment = flowsa.getFlowByActivity(flowclass=['Employment'],
                                          datasource='BLS_QCEW',
                                          years=[2002])
    employment = clean_bls_qcew_fba(
        clean_df(employment, flow_by_activity_fields, fba_fill_na_dict),
        attr)

    # keep the rows returned by the 'national' geoscale filter
    employment = filter_by_geoscale(employment, 'national')

    # assign sectors and keep only rows that received a producing sector
    employment = add_sectors_to_flowbyactivity(
        employment, sectorsourcename=method['target_sector_source'])
    employment = employment[employment['SectorProducedBy'].notnull()]
    employment = employment.rename(columns={'SectorProducedBy': 'Sector'})

    # restrict the employment data to the sectors present in df_wsec
    target_sectors = df_wsec['Sector'].unique().tolist()
    employment = employment.loc[employment['Sector'].isin(target_sectors)]

    # proportional ratios by location and sector
    employment = proportional_allocation_by_location_and_sector(
        employment, 'Sector')
    employment = employment.rename(columns={
        'FlowAmount': 'Employees',
        'FlowAmountRatio': 'EmployeeRatio'
    })

    # attach employment counts and ratios to the flow data
    employment_cols = ['Sector', 'EmployeeRatio', 'Employees']
    df = pd.merge(df_wsec,
                  employment[employment_cols],
                  how='left',
                  left_on='Sector',
                  right_on='Sector')
    # sectors without employment data get a ratio of 0
    df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0)

    # gal/employee in 2002
    weighted_flow = df['FlowAmount'] * df['EmployeeRatio']
    df.loc[:, 'FlowAmount'] = weighted_flow / df['Employees']
    df.loc[:, 'Unit'] = 'gal/employee'

    # the helper columns are no longer needed
    return df.drop(columns=['Employees', 'EmployeeRatio'])
示例#2
0
def allocation_helper(df_w_sector, method, attr, v):
    """
    Adjust the flow amounts of an allocation df using a second ("helper")
    FlowByActivity dataset.

    The helper data is loaded, cleaned, aggregated or filtered to a geoscale,
    assigned sectors and merged onto df_w_sector. FlowAmount is then multiplied
    by the helper flow (when attr['helper_method'] contains 'multiplication')
    or by the helper's proportional ratio (when attr['helper_method'] is
    'proportional').

    :param df_w_sector: allocation df with sectors; 'Sector', 'Location',
        'FlowAmount' and 'Unit' are among the columns used. The df passed in
        is not modified.
    :param method: dictionary, 'target_sector_source' is read
    :param attr: dictionary of activity set attributes; keys read are
        'helper_source', 'helper_source_class', 'helper_source_year',
        'helper_from_scale', 'helper_method', 'allocation_from_scale' and,
        optionally, 'clean_helper_fba' and 'clean_helper_fba_wsec'
    :param v: dictionary, 'geoscale_to_use' is read
    :return: df, df_w_sector with modified FlowAmount, rows with a FlowAmount
        of 0 dropped and the index reset
    """

    from flowsa.Blackhurst_IO import scale_blackhurst_results_to_usgs_values
    from flowsa.BLS_QCEW import clean_bls_qcew_fba, bls_clean_allocation_fba_w_sec
    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    if 'clean_helper_fba' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA')
        # todo: generalize - the cleaning function is hard coded for BLS_QCEW
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = clean_bls_qcew_fba(helper_allocation, attr)
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # the target scale is the smaller fips_number_key value of the allocation
    # data scale and the geoscale of the source
    to_scale = min(fips_number_key[attr['allocation_from_scale']], fips_number_key[v['geoscale_to_use']])
    if fips_number_key[attr['helper_from_scale']] > to_scale:
        # reverse lookup: first geoscale name whose number is the target scale
        to_scale_name = next(name for name, number in fips_number_key.items() if number == to_scale)
        helper_allocation = agg_by_geoscale(helper_allocation,
                                            attr['helper_from_scale'],
                                            to_scale_name,
                                            fba_default_grouping_fields)
    else:
        helper_allocation = filter_by_geoscale(helper_allocation, attr['helper_from_scale'])

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(helper_allocation,
                                                      sectorsourcename=method['target_sector_source'])

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # clean up helper fba with sec
    if 'clean_helper_fba_wsec' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA with sectors')
        # todo: generalize - the cleaning function is hard coded for BLS_QCEW
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = bls_clean_allocation_fba_w_sec(helper_allocation, attr, method)
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    if attr['helper_method'] == 'proportional':
        # only the sectors present in the flow allocation df are relevant for the ratios
        sector_list = df_w_sector['Sector'].unique().tolist()
        helper_allocation = helper_allocation.loc[helper_allocation['Sector'].isin(sector_list)]
        # calculate proportional ratios
        helper_allocation = proportional_allocation_by_location_and_sector(helper_allocation, 'Sector')

    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})
    helper_columns = helper_allocation.columns.values.tolist()
    merge_columns = [e for e in ['Location', 'Sector', 'HelperFlow', 'FlowAmountRatio']
                     if e in helper_columns]

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    # (merge() without 'on' joins on every column name the two dfs share)
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        # link the dfs through the first two characters of Location; assign()
        # returns new dfs, so the caller's df_w_sector is left untouched
        helper_allocation = helper_allocation.assign(
            Location_tmp=helper_allocation['Location'].apply(lambda x: x[0:2]))
        allocation_df = df_w_sector.assign(
            Location_tmp=df_w_sector['Location'].apply(lambda x: x[0:2]))
        # 'Location' must not be a join key here: the full Location of a county
        # row never equals the Location of a state row, so no row would match
        merge_columns = [e for e in merge_columns if e != 'Location'] + ['Location_tmp']
        modified_fba_allocation = allocation_df.merge(helper_allocation[merge_columns], how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])
    else:
        modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left')

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # todo: modify so if missing data, replaced with value from one geoscale up instead of national
        # todo: modify year after merge if necessary
        # if missing values (na or 0), replace with national level values
        replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index(
            drop=True)
        replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0,
                                                                modified_fba_allocation['ReplacementValue'],
                                                                modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0, so after multiplying, don't have incorrect value associated
        # with new unit
        modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'ReplacementValue'])

    elif attr['helper_method'] == 'proportional':
        # rows without a ratio get 0 and are dropped below
        modified_fba_allocation['FlowAmountRatio'] = modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'FlowAmountRatio'])

    # drop rows of 0
    modified_fba_allocation = modified_fba_allocation[modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # todo: change units
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling " + attr['helper_source'] + ' to FBA values')
        # todo: generalize - the scaling function is hard coded for BLS_QCEW
        if attr['helper_source'] == 'BLS_QCEW':
            modified_fba_allocation = scale_blackhurst_results_to_usgs_values(modified_fba_allocation, attr)

    return modified_fba_allocation
示例#3
0
def disaggregate_cropland(fba_w_sector, attr):
    """
    Disaggregate cropland sectors of NAICS length 4 and 5 to the next NAICS
    length, using ratios created from harvested cropland by NAICS. The two
    datasets are linked on the first two characters of Location.
    :param fba_w_sector: df with sectors; 'Sector', 'Location' and
        'FlowAmount' are among the columns used
    :param attr: dictionary, 'allocation_source_year' is read
    :return: df, the pasture rows (sectors starting with '112') followed by
        the crop rows, including the newly created disaggregated rows
    """

    import flowsa
    from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, add_missing_flow_by_fields
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # split the incoming data into pasture ('112...') and everything else
    sector_prefix = fba_w_sector['Sector'].apply(lambda x: str(x[0:3]))
    pasture = fba_w_sector.loc[sector_prefix == '112'].reset_index(drop=True)
    crop = fba_w_sector.loc[sector_prefix != '112'].reset_index(drop=True)

    # only sectors of 4 or more digits are kept
    has_four_digits = crop['Sector'].apply(lambda x: len(x) > 3)
    crop = crop[has_four_digits].reset_index(drop=True)
    # temporary location key: first two characters of Location
    crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2]))

    # harvested cropland by naics, used to build the ratios
    naics = flowsa.getFlowByActivity(
        flowclass=['Land'],
        years=[attr['allocation_source_year']],
        datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    naics = clean_df(naics, flow_by_activity_fields, fba_fill_na_dict)
    is_harvested = naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'
    naics = naics[is_harvested].reset_index(drop=True)
    # assign sectors and add the missing flow by sector fields
    naics = add_sectors_to_flowbyactivity(naics,
                                          sectorsourcename='NAICS_2012_Code',
                                          levelofSectoragg='agg')
    naics = add_missing_flow_by_fields(naics, flow_by_sector_fields)

    # fill in missing naics levels, first by aggregating, then by
    # disaggregating
    naics = sector_aggregation(naics, fbs_default_grouping_fields)
    naics = sector_disaggregation(naics)
    # rows without a consuming sector or with a FlowAmount of 0 are not used
    has_data = naics['SectorConsumedBy'].notnull() & (naics['FlowAmount'] != 0)
    naics = naics[has_data]
    # create ratios
    naics = sector_ratios(naics)
    # same temporary location key as on the crop df
    naics.loc[:, 'Location_tmp'] = naics['Location'].apply(
        lambda x: str(x[0:2]))

    # disaggregate naics of length 4, then naics of length 5
    for n in (4, 5):
        # rows whose sector has length n or n + 1, keyed by the n-digit prefix
        in_range = crop['Sector'].apply(lambda s: len(s) in (n, n + 1))
        candidates = crop.loc[in_range]
        candidates = candidates.assign(
            Sector_tmp=candidates['Sector'].apply(lambda s: s[0:n]))
        # a repeated location/prefix pair removes every row of that pair
        unmatched = candidates.drop_duplicates(
            subset=['Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        unmatched = unmatched.drop(columns=["Sector_tmp"])
        # of those, the rows of length n are the ones to disaggregate
        parents = unmatched.loc[unmatched['Sector'].apply(
            lambda s: len(s) == n)]

        # ratios for the sectors of length n + 1, keyed by their n-digit prefix
        is_child = naics['SectorConsumedBy'].apply(lambda s: len(s) == n + 1)
        children = naics.loc[is_child].reset_index(drop=True)
        children.loc[:, 'Sector_tmp'] = children['SectorConsumedBy'].apply(
            lambda s: s[0:n])

        # attach the ratios to the parent rows by sector prefix and location
        ratio_cols = [
            'SectorConsumedBy', 'FlowAmountRatio', 'Sector_tmp',
            'Location_tmp'
        ]
        new_rows = pd.merge(parents,
                            children[ratio_cols],
                            how='left',
                            left_on=['Sector', 'Location_tmp'],
                            right_on=['Sector_tmp', 'Location_tmp'])
        # flow amount of each new naics = parent flow amount * ratio
        new_rows.loc[:, 'FlowAmount'] = (new_rows['FlowAmount'] *
                                         new_rows['FlowAmountRatio'])
        # rows of 0 and na are not carried forward
        usable = (new_rows['FlowAmount'] != 0) & new_rows['FlowAmount'].notna()
        new_rows = new_rows[usable].reset_index(drop=True)
        # the merged-in naics replaces the parent sector
        new_rows = new_rows.drop(
            columns=['Sector', 'FlowAmountRatio', 'Sector_tmp'])
        new_rows = new_rows.rename(columns={"SectorConsumedBy": "Sector"})
        # append to the crop df so the next pass can disaggregate them further
        crop = pd.concat([crop, new_rows], sort=True).reset_index(drop=True)

    # the temporary location key is no longer needed
    crop = crop.drop(columns=['Location_tmp'])

    # recombine pasture and crop data
    return pd.concat([pasture, crop], sort=True).reset_index(drop=True)
示例#4
0
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like, check that there is no data loss
                    if load_source_catalog(
                    )[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            fbs_subset = fbs[(
                                (fbs[fba_activity_fields[0]] == n) &
                                (fbs[fba_activity_fields[1]] == n)) |
                                             (fbs[fba_activity_fields[0]] == n)
                                             |
                                             (fbs[fba_activity_fields[1]] == n
                                              )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)

                # if allocation method for an activity set requires a specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info(
                        'Calling on function specified in method yaml to allocate '
                        + ', '.join(map(str, names)) + ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                                      flow_subset_mapped, attr, fbs_list)

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize unites
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec,
                        k,
                        names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation help for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [
                        e for e in group_cols
                        if e not in ('ActivityProducedBy',
                                     'ActivityConsumedBy')
                    ]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset,
                            k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector length, if so,
                # allocate values equally to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull())
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo,
                                                       fbs_sector_subset, aset,
                                                       k, method_name, attr,
                                                       method, mapping_files)

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
示例#5
0
def main(method_name):
    """
    Creates a flowbysector dataset and stores it.

    For each source listed under 'source_names' in the method yaml:
      * FBA-format sources are processed one activity set at a time: the activities are subset and
        brought to a common geoscale, sectors are attached, flows are mapped, and the flow amounts are
        either assigned directly to sectors or scaled by ratios built from an allocation dataset. The
        result is aggregated geographically and across sector levels, then subset to the target sector
        level.
      * sources in any other format are appended as-is.
    All pieces are concatenated, aggregated, cleaned, sorted and passed to store_flowbysector.

    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: None, the flowbysector dataframe is saved via store_flowbysector rather than returned
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            # (the yaml value is the name of a function looked up in this module)
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                flow_subset_list = []
                for n in names:
                    # subset usgs data by activity
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n + " at the " +
                             v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(flow_subset, v['geoscale_to_use'], activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                        # geoscale, and sum to specified geoscale
                        log.info("Checking if flowbyactivity data exists for " + n + " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(flow_subset,
                                                                                               v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate usgs activity to allocation geoscale
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to geoscale want to use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if usgs is more aggregated than allocation table, filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # Add sectors to df activity, depending on level of specified sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                                     sectorsourcename=method['target_sector_source'],
                                                                     levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_wsec.copy()

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                              datasource=attr['allocation_source'],
                                                              years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " + attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                                   sectorsourcename=method['target_sector_source'],
                                                                   levelofSectoragg=attr['allocation_sector_aggregation'])

                    # subset fba datsets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " + attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Futher disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_subset = getattr(sys.modules[__name__],
                                                        attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset, once per sector column; the shared
                    # 'Sector'/'FlowAmountRatio' columns come out of the two merges with _x/_y suffixes
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                            'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list, method['target_sector_level'])

                # subset df to rows where at least one sector column is at the target sector level.
                # A single combined mask is used on purpose: rows where BOTH sector columns match also
                # satisfy the "either" condition, so concatenating a "both" subset with an "either"
                # subset would duplicate those rows and double count them in the final aggregation.
                first_col_in_list = fbs[fbs_activity_fields[0]].isin(sector_list)
                second_col_in_list = fbs[fbs_activity_fields[1]].isin(sector_list)
                fbs_sector_subset = fbs.loc[first_col_in_list | second_col_in_list].reset_index(drop=True)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # convert 'nan' strings back to None before the final clean
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
示例#6
0
def allocation_helper(df_w_sector, method, attr):
    """
    Used when two df required to create allocation ratio. Loads a second ("helper") FlowByActivity
    dataset, attaches its flow amounts to the allocation df by sector (and location, depending on the
    geoscales of the two dfs) and uses them to modify the allocation df's FlowAmount.

    Helper values that are missing or 0 are replaced with the helper value at the US_FIPS location for
    the same sector. Rows whose resulting FlowAmount is 0 are dropped.

    :param df_w_sector: allocation df with a 'Sector' column (plus 'Location' and 'FlowAmount');
        it is not modified by this function
    :param method: the method dictionary; 'target_sector_source' is read from it
    :param attr: activity set attributes; reads 'helper_source_class', 'helper_source',
        'helper_source_year', 'helper_sector_aggregation', 'helper_from_scale', 'helper_method' and,
        when the helper is at the state scale, 'allocation_from_scale'.
        'helper_method' is currently only acted on for 'multiplication'
    :return: the allocation df with modified flow amounts
    :raises ValueError: if the combination of helper and allocation geoscales is not supported
    """

    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(helper_allocation,
                                                      sectorsourcename=method[
                                                          'target_sector_source'],
                                                      levelofSectoragg=attr[
                                                          'helper_sector_aggregation'])
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    # (no merge keys are passed, so pandas joins on the columns the two frames share)
    if attr['helper_from_scale'] == 'national':
        modified_fba_allocation = df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                                                    how='left')
    elif (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'state'):
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location', 'HelperFlow']], how='left')
    elif (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        # link county rows to state helper rows through the first two characters of 'Location'.
        # assign() returns new frames, so the caller's df_w_sector is not left with a stray
        # 'Location_tmp' column
        helper_allocation = helper_allocation.assign(
            Location_tmp=helper_allocation['Location'].apply(lambda x: str(x[0:2])))
        df_w_sector_tmp = df_w_sector.assign(
            Location_tmp=df_w_sector['Location'].apply(lambda x: str(x[0:2])))
        modified_fba_allocation = df_w_sector_tmp.merge(
            helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']],
            how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])
    else:
        # fail with a clear message instead of an unbound-variable error further down
        raise ValueError(
            "Unsupported geoscale combination for allocation helper: helper_from_scale={!r}, "
            "allocation_from_scale={!r}".format(attr['helper_from_scale'],
                                                attr.get('allocation_from_scale')))

    # todo: modify so if missing data, replaced with value from one geoscale up instead of national
    # if missing values (na or 0), replace with national level values
    replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index(
        drop=True)
    replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
    modified_fba_allocation = modified_fba_allocation.merge(
        replacement_values[['Sector', 'ReplacementValue']], how='left')
    modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
        modified_fba_allocation['ReplacementValue'])
    modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0,
                                                            modified_fba_allocation['ReplacementValue'],
                                                            modified_fba_allocation['HelperFlow'])
    # modify flow amounts using helper data
    if attr['helper_method'] == 'multiplication':
        # replace non-existent helper flow values with a 0, so after multiplying, don't have incorrect value associated
        # with new unit
        modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
            value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation[
                                                           'HelperFlow']
    # drop columns
    modified_fba_allocation = modified_fba_allocation.drop(
        columns=["HelperFlow", 'ReplacementValue'])

    # drop rows of 0 to speed up allocation
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    #todo: modify the unit

    return modified_fba_allocation
示例#7
0
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column):
    """
    Disaggregate cropland rows that are only available at 4 (or 5) digit NAICS by applying ratios
    built from the harvested cropland data in 'USDA_CoA_Cropland_NAICS'. Ratios are matched on the
    first two characters of 'Location', so county rows use the ratios of their state.

    Rows whose sector starts with '112' (pastureland) are set aside untouched and re-attached at the
    end. The original cropland rows are kept; the disaggregated rows are added to them.

    :param fba_w_sector: the dataframe after it has been linked to sectors
    :param attr: not used in this function
    :param method: the method dictionary; 'target_sector_source' is read from it
    :param years_list: years passed to flowsa.getFlowByActivity when loading the harvested cropland data
    :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy)
    :return: dataframe of pastureland rows, cropland rows and the newly disaggregated cropland rows
    """

    import flowsa
    from flowsa.flowbyfunctions import (
        sector_aggregation, fbs_default_grouping_fields, clean_df, fba_fill_na_dict,
        fbs_fill_na_dict, add_missing_flow_by_fields, sector_disaggregation, sector_ratios,
        replace_strings_with_NoneType, replace_NoneType_with_empty_cells, harmonize_units)
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # temporarily swap NoneTypes for empty cells so the string slicing below works on every row
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # pastureland rows are identified by the '112' sector prefix
    is_pasture = fba_w_sector[sector_column].apply(lambda code: code[0:3]) == '112'
    # sectors shorter than 4 digits are not disaggregated here
    has_4_plus_digits = fba_w_sector[sector_column].apply(lambda code: len(code) > 3)

    # cropland rows with at least 4 digit sectors
    crop = fba_w_sector.loc[~is_pasture & has_4_plus_digits].reset_index(drop=True)
    # temporary location column: first two characters of 'Location'
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda loc: loc[0:2]))

    # load the harvested cropland by naics
    harvested = flowsa.getFlowByActivity(flowclass=['Land'],
                                         years=years_list,
                                         datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    harvested = clean_df(harvested, flow_by_activity_fields, fba_fill_na_dict)
    harvested = harmonize_units(harvested)
    # keep only the harvested cropland flow
    harvested = harvested[harvested['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    harvested = harvested[~harvested['ActivityConsumedBy'].str.contains('&')].reset_index(drop=True)
    # attach sectors, then add the missing flow-by-sector fields
    harvested = add_sectors_to_flowbyactivity(harvested, sectorsourcename=method['target_sector_source'])
    harvested = clean_df(harvested, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    grouping = fbs_default_grouping_fields
    harvested_all_levels = sector_aggregation(harvested, grouping)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    harvested_all_levels = sector_disaggregation(harvested_all_levels, grouping)
    # rows with a FlowAmount of 0 are left out of the ratios
    harvested_all_levels = harvested_all_levels.loc[harvested_all_levels['FlowAmount'] != 0]
    # create ratios, then add the same temporary location column used on the crop df
    ratios = sector_ratios(harvested_all_levels, sector_column)
    ratios = ratios.assign(Location_tmp=ratios['Location'].apply(lambda loc: loc[0:2]))
    ratios = replace_NoneType_with_empty_cells(ratios)

    # first pass splits 4 digit sectors into 5 digits, second pass splits 5 digits into 6 digits;
    # rows created in the first pass are part of `crop` by the time the second pass runs
    for parent_len in (4, 5):
        child_len = parent_len + 1

        # rows whose sector is parent_len or child_len digits long, tagged with their parent code
        candidates = crop.loc[crop[sector_column].apply(lambda code: len(code) in (parent_len, child_len))]
        candidates = candidates.assign(
            Sector_tmp=candidates[sector_column].apply(lambda code: code[0:parent_len]))
        # discard every row of any location/parent-code pair that appears more than once
        unique_rows = candidates.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                                 keep=False).reset_index(drop=True)
        # of what is left, the parent-length rows are the ones to disaggregate
        parents = unique_rows.loc[unique_rows[sector_column].apply(lambda code: len(code) == parent_len)]
        parents = parents.drop(columns=["Sector_tmp"])

        # ratios for sectors one digit longer, tagged with their parent code
        child_ratios = ratios.loc[
            ratios[sector_column].apply(lambda code: len(code) == child_len)].reset_index(drop=True)
        child_ratios = child_ratios.assign(
            Sector_tmp=child_ratios[sector_column].apply(lambda code: code[0:parent_len]))

        # pair each parent row with the ratios of its children at the same temporary location;
        # sector_column exists on both sides, so it comes back suffixed with _x (parent) and _y (child)
        children = parents.merge(
            child_ratios[[sector_column, 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp']],
            how='left',
            left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        children.loc[:, 'FlowAmount'] = children['FlowAmount'] * children['FlowAmountRatio']
        # drop rows of 0 and na
        keep = (children['FlowAmount'] != 0) & ~children['FlowAmount'].isna()
        children = children[keep].reset_index(drop=True)
        # keep the child sector code under the original column name and drop the helper columns
        children = children.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp']
        ).rename(columns={sector_column + '_y': sector_column})
        children = replace_NoneType_with_empty_cells(children)

        # add new rows of data to crop df
        crop = pd.concat([crop, children], sort=True).reset_index(drop=True)

    # the temporary location column is no longer needed
    crop = crop.drop(columns=['Location_tmp'])

    # re-attach the pastureland rows that were set aside at the start
    pasture = fba_w_sector.loc[is_pasture].reset_index(drop=True)
    combined = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    return replace_strings_with_NoneType(combined)
示例#8
0
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column):
    """
    Spread pastureland rows (sector prefix '112') across more detailed sectors.

    Rows of fba_w_sector whose sector starts with '112' are merged, by the first
    two characters of 'Location', with proportional ratios built from the
    'FARM OPERATIONS' flows of the 'USDA_CoA_Cropland_NAICS' dataset. Each merged
    row's FlowAmount is multiplied by the matching FlowAmountRatio and the
    result replaces the original '112' rows.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :param attr: not used inside this function
    :param method: dictionary; only 'target_sector_source' is read here
    :param years_list: years passed to flowsa.getFlowByActivity for the allocation data
    :param sector_column: The sector column on which to make df modifications
                          (SectorProducedBy or SectorConsumedBy)
    :return: The dataframe with pastureland rows disaggregated. If no '112' rows
             exist, the df is returned right after NoneTypes were replaced with
             empty cells.
    """

    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \
        fba_mapped_default_grouping_fields, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    def first_three(code):
        # leading three characters of a sector/activity code
        return code[0:3]

    def first_two(fips):
        # leading two characters of a location code
        return fips[0:2]

    # temporarily swap NoneTypes for empty cells so string slicing works
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # isolate the pastureland rows
    is_pasture = fba_w_sector[sector_column].apply(first_three) == '112'
    pasture = fba_w_sector.loc[is_pasture].reset_index(drop=True)
    if len(pasture) == 0:
        # nothing to disaggregate
        return fba_w_sector

    # temporary column holding the first two characters of Location
    pasture = pasture.assign(Location_tmp=pasture['Location'].apply(first_two))
    source_name = pd.unique(pasture['SourceName'])[0]

    # load the allocation dataset
    allocation_source = 'USDA_CoA_Cropland_NAICS'
    farms = flowsa.getFlowByActivity(flowclass=['Land'], years=years_list, datasource=allocation_source)
    farms = clean_df(farms, flow_by_activity_fields, fba_fill_na_dict)
    farms = harmonize_units(farms)
    # keep only the 'FARM OPERATIONS' flows
    farms = farms[farms['FlowName'] == 'FARM OPERATIONS']
    # keep only activities beginning with '112'
    farms = farms.loc[farms['ActivityConsumedBy'].apply(first_three) == '112']
    # discard combined activities (those containing '&')
    farms = farms[~farms['ActivityConsumedBy'].str.contains('&')]
    # attach sector columns
    farms = add_sectors_to_flowbyactivity(farms, sectorsourcename=method['target_sector_source'])
    # build proportional ratios, grouping on everything but the activity columns
    activity_cols = ('ActivityProducedBy', 'ActivityConsumedBy')
    group_cols = [col for col in fba_mapped_default_grouping_fields if col not in activity_cols]
    farms = allocate_by_sector(farms, source_name, allocation_source, 'proportional', group_cols)
    farms = replace_NoneType_with_empty_cells(farms)
    # discard the two-digit sector '11'
    farms = farms[farms[sector_column] != '11']
    # truncate Location to its first two characters so it matches Location_tmp
    farms = farms.assign(Location=farms['Location'].apply(first_two))

    # join pastureland rows to the ratios
    ratios = farms[[sector_column, 'Location', 'FlowAmountRatio']]
    merged = pasture.merge(ratios, how='left', left_on="Location_tmp", right_on="Location")
    # scale each flow by its ratio
    merged.loc[:, 'FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']
    # keep the new sector and the original location
    merged = merged.drop(columns=['Location_tmp', sector_column + '_x', 'Location_y', 'FlowAmountRatio'])
    merged = merged.rename(columns={sector_column + '_y': sector_column,
                                    "Location_x": 'Location'})

    # swap the original '112' rows for the disaggregated ones
    fba_w_sector = fba_w_sector[~is_pasture].reset_index(drop=True)
    fba_w_sector = pd.concat([fba_w_sector, merged], sort=True).reset_index(drop=True)

    # restore NoneTypes in place of empty cells
    return replace_strings_with_NoneType(fba_w_sector)
示例#9
0
def main(method_name):
    """
    Creates a flowbysector dataset and stores it.

    For every source listed under the method's 'flowbyactivity_sources', and
    for every activity set of that source, the flowbyactivity data is subset,
    brought to the required geographic scale, linked to sectors and either used
    directly or multiplied by allocation ratios built from an allocation
    dataset. The per-activity-set results are concatenated, aggregated, sorted
    and passed to store_flowbysector.
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: None; the function has no return statement, the result is handed
             to store_flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # k is the datasource name, v the settings for that datasource
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        # (the function is looked up by name in this module; the string 'None' disables the step)
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            # NOTE(review): attr['names'] is wrapped in a list, so it is treated as a single
            # activity name - confirm the yaml never supplies a list here
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity (match on either activity field)
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to the geoscale we want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # location column pad zeros if necessary: any value shorter than 5 characters
            # gets three '0' characters appended on the right
            flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5
                                                                    else x
                                                                    )

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset_wsec" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_wsec_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings (the string 'None' means "no filter")
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                # (the return value of the check is not used here)
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivity source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any rows of data in the flow df that
                # aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                # first merge matches ratios on SectorProducedBy (suffix _x after the second merge)
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # second merge matches ratios on SectorConsumedBy (suffix _y)
                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced/consumed by in flow allocation df is null
                # NOTE(review): reset_index() without drop=True keeps the old index as an 'index'
                # column - confirm this is intended or cleaned up downstream
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns: prefer the produced-by ratio, fall back to the
                # consumed-by ratio, and use 0 when neither matched
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            # the starting scale is whichever of the two scales has the smaller fips_number_key value
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            # NOTE(review): the result is not used afterwards in this function
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df to rows whose produced-by or consumed-by sector is in the list
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
示例#10
0
def iteratively_determine_flows_requiring_disaggregation(
        df_load, attr, method):
    """
    Flag the flows that need a secondary source for disaggregation and reduce
    their FlowAmounts by the amounts already reported at more detailed levels.

    The MECS Land data provides FlowAmounts for NAICS3-6. We use BLS QCEW employment data to determine land use for
    different industries. To accurately estimate land use per industry, existing FlowAmounts for a particular NAICS
    level (NAICS6) for example, should be subtracted from the possible FlowAmounts for other NAICS6 that share the first
    5 digits. For Example, there is data for '311', '3112', and '311221' in the 2014 dataset. FlowAmounts for allocation
    by employment for NAICS6 are based on the provided '3112' FlowAmounts. However, since there is data at one NAICS6
    (311221), the FlowAmount for that NAICS6 should be subtracted from other NAICS6 to accurately depict the remaining
    'FlowAmount' that requires a secondary source (Employment data) for allocation.
    :param df_load: df with 'FlowAmount', 'Location', 'ActivityConsumedBy' and the sector columns
                    ('SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
                    'ConsumedBySectorType', 'SectorSourceName')
    :param attr: not used inside this function
    :param method: dictionary; only 'target_sector_source' is read here
    :return: A dataframe with a column 'disaggregate_flag', if '1', row requires secondary source to calculate
             FlowAmount, if '0' FlowAmount does not require modifications
    """

    from flowsa.flowbyfunctions import replace_strings_with_NoneType
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', 'SectorConsumedBy']]
    # add a column of the sector dropping last digit
    dfo = dfo.assign(
        SectorMatch=dfo['SectorConsumedBy'].apply(lambda x: x[:len(x) - 1]))
    # sum flowamounts based on sector match col
    dfo2 = dfo.groupby(['Location', 'SectorMatch'], as_index=False)['FlowAmount'] \
        .sum().rename(columns={'FlowAmount': 'SubtractFlow'})
    # length of the sectors that were summed (parent length + 1)
    dfo2 = dfo2.assign(
        SectorLengthMatch=dfo2['SectorMatch'].apply(lambda x: len(x) + 1))

    # new df
    # in the original df, drop sector columns re-add sectors, this time with sectors = 'aggregated'
    dfn = df_load.drop(columns=[
        'SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
        'ConsumedBySectorType', 'SectorSourceName'
    ])
    dfn = add_sectors_to_flowbyactivity(
        dfn,
        sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add column of sector length
    dfn = dfn.assign(
        SectorLength=dfn['SectorConsumedBy'].apply(len))
    # add column noting that these columns require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)
    # create lists of sectors to drop
    list_original = df_load['ActivityConsumedBy'].drop_duplicates().tolist()
    # drop values in original df
    dfn2 = dfn[~dfn['SectorConsumedBy'].isin(list_original)].reset_index(
        drop=True)
    # sort the df by 'ActivityConsumedBy' and drop duplicated rows of SectorConsumedBy, keeping the last entry
    # (where ActivityConsumedBy has greater sector length)
    dfn2 = dfn2.sort_values(['ActivityConsumedBy', 'SectorConsumedBy'])
    dfn3 = dfn2.drop_duplicates('SectorConsumedBy',
                                keep='last').reset_index(drop=True)
    # add columns on which to match: the sector prefix of length n when the source activity is no
    # longer than n characters, otherwise the integer 0 (which never equals a string SectorMatch)
    dfn3 = dfn3.assign(NAICS3=dfn3.apply(lambda x: x['SectorConsumedBy'][
        0:3] if len(x['ActivityConsumedBy']) <= 3 else 0,
                                         axis=1))
    dfn3 = dfn3.assign(NAICS4=dfn3.apply(lambda x: x['SectorConsumedBy'][
        0:4] if len(x['ActivityConsumedBy']) <= 4 else 0,
                                         axis=1))
    dfn3 = dfn3.assign(NAICS5=dfn3.apply(lambda x: x['SectorConsumedBy'][
        0:5] if len(x['ActivityConsumedBy']) <= 5 else 0,
                                         axis=1))

    # merge the two dfs and create new flowamounts for allocation
    # first merge the new df with the subset original df where activity = sector match
    df = pd.merge(dfn3,
                  dfo2[['Location', 'SectorMatch', 'SubtractFlow']],
                  how='left',
                  left_on=['Location', 'ActivityConsumedBy'],
                  right_on=['Location', 'SectorMatch'
                            ]).rename(columns={
                                'SubtractFlow': 'SubtractFlow1'
                            }).drop(columns='SectorMatch')

    # then merge new df with subset original df a second time, this time where sector - length 1 = sector match

    def match_flows(row):
        """Add 'SubtractFlow2' to the row: the first matching SubtractFlow in dfo2, or 0 if none."""
        # conditions
        # same location
        condition1 = dfo2['Location'] == row['Location']
        # summed sectors are no longer than the row's sector
        condition2 = dfo2['SectorLengthMatch'] <= row['SectorLength']
        # sector match != activity consumed by (that case is covered by SubtractFlow1)
        condition3 = dfo2['SectorMatch'] != row['ActivityConsumedBy']
        # sector match equals one of the row's NAICS3/4/5 prefixes
        condition4 = ((row['NAICS3'] == dfo2['SectorMatch']) |
                      (row['NAICS4'] == dfo2['SectorMatch']) |
                      (row['NAICS5'] == dfo2['SectorMatch']))
        curr_df = dfo2[condition1 & condition2 & condition3 & condition4]

        try:
            row['SubtractFlow2'] = curr_df['SubtractFlow'].iloc[0]
        except IndexError:
            # no rows satisfied the conditions, so there is nothing to subtract;
            # any other exception is a real error and is allowed to propagate
            row['SubtractFlow2'] = 0

        return row

    df2 = df.apply(match_flows, axis=1)

    # calculate new flow amounts
    df2['SubtractFlow1'] = df2['SubtractFlow1'].fillna(0)
    df2['FlowAmount'] = df2['FlowAmount'] - df2['SubtractFlow1'] - df2[
        'SubtractFlow2']
    # drop columns
    df3 = df2.drop(columns=[
        'SectorLength', 'NAICS3', 'NAICS4', 'NAICS5', 'SubtractFlow1',
        'SubtractFlow2'
    ])

    # merge the original df with modified
    # add column to original df for disaggregate_flag
    df_load = df_load.assign(disaggregate_flag=0)

    # concat the two dfs and sort
    df_c = pd.concat([df_load, df3],
                     ignore_index=True).sort_values(['SectorConsumedBy'
                                                     ]).reset_index(drop=True)

    df_c = replace_strings_with_NoneType(df_c)

    return df_c
示例#11
0
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Rescale the FlowAmounts of df_wsec to a per-employee basis using 2002
    national BLS QCEW employment data.

    The employment FlowAmount is joined to df_wsec on Location and
    SectorConsumedBy, proportional ratios are computed, and each FlowAmount
    becomes FlowAmount * ratio / employees. Rows without a matching sector or
    with zero employees are dropped, and 'Unit' is set to 'gal/employee'.
    :param df_wsec: df with at least 'Location', 'SectorConsumedBy' and 'FlowAmount'
    :param attr: passed through to clean_bls_qcew_fba
    :param method: dictionary; only 'target_sector_source' is read here
    :return: df_wsec rows with converted FlowAmount and Unit
    """

    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict,  proportional_allocation_by_location_and_activity, \
        filter_by_geoscale, harmonize_units
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    # load 2002 employment data and keep the national rows
    employment = flowsa.getFlowByActivity(flowclass=['Employment'],
                                          datasource='BLS_QCEW',
                                          years=[2002])
    employment = filter_by_geoscale(employment, 'national')

    # standard cleaning steps
    employment = clean_df(employment, flow_by_activity_fields, fba_fill_na_dict)
    employment = harmonize_units(employment)
    employment = clean_bls_qcew_fba(employment, attr=attr)

    # attach sectors and keep only rows that received one
    # (original note: sector = None does not occur with mining)
    employment = add_sectors_to_flowbyactivity(
        employment, sectorsourcename=method['target_sector_source'])
    has_sector = employment['SectorProducedBy'].notnull()
    employment = employment[has_sector].rename(columns={
        'SectorProducedBy': 'Sector',
        'FlowAmount': 'HelperFlow'
    })

    # join the employment counts onto the incoming flows
    helper_cols = employment[['Location', 'Sector', 'HelperFlow']]
    merged = pd.merge(df_wsec,
                      helper_cols,
                      how='left',
                      left_on=['Location', 'SectorConsumedBy'],
                      right_on=['Location', 'Sector'])
    # rows that found no employment sector are discarded
    merged = merged[merged['Sector'].notnull()]
    # missing employment counts become 0
    merged['HelperFlow'] = merged['HelperFlow'].fillna(0)

    # proportional ratios by location and activity
    ratios = proportional_allocation_by_location_and_activity(merged, 'Sector')
    ratios = ratios.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'HelperFlow': 'Employees'
    })

    # zero employees would mean dividing by zero below, so drop those rows
    ratios = ratios[ratios['Employees'] != 0]

    # gal/employee in 2002
    per_employee = ratios['FlowAmount'] * ratios['EmployeeRatio'] / ratios['Employees']
    ratios.loc[:, 'FlowAmount'] = per_employee
    ratios.loc[:, 'Unit'] = 'gal/employee'

    # remove the helper columns before returning
    return ratios.drop(columns=['Sector', 'Employees', 'EmployeeRatio'])