Example #1
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial sites in rural areas, cemeteries,
    golf courses, mining areas, quarry sites, marshes, swamps, sand dunes, bare rocks, deserts, tundra,
    rural residential, and other unclassified land. In this report, urban land is reported as a separate category."

    Mining data is calculated using a separate source = BLM PLS.
    Want to extract rural residential land area from total value of 'Other Land'
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the activity
                     set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    import numpy as np
    from flowsa.values_from_literature import get_area_of_rural_land_occupied_by_houses_2013
    from flowsa.common import load_household_sector_codes

    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_household_sector_codes()
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and location = 00000, replace with rural res value
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
Example #2
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry sites,
    marshes, swamps, sand dunes, bare rocks, deserts, tundra, rural residential,
    and other unclassified land. In this report, urban land is reported as a separate category."

    Mining data is calculated using a separate source = BLM PLS.
    Want to extract rural residential land area from total value of 'Other Land'
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the activity
                     set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """

    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_household_sector_codes()
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and
    # location = 00000, replace with rural res value
    vLogDetailed.info(
        'The only category for MLU other land use is rural land '
        'occupation. All other land area in this category is unassigned to '
        'sectors, resulting in unaccounted land area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
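
# A minimal sketch (not part of the function above) of the np.where
# replacement it performs: rows whose SectorConsumedBy is a household code
# get the rural residential area; all other FlowAmount values are untouched.
# The household code and area value below are assumed for illustration.
import numpy as np
import pandas as pd

household = ['F01000']
rural_res = 365800.0
df = pd.DataFrame({'SectorConsumedBy': ['F01000', '111110'],
                   'FlowAmount': [0.0, 123.4]})
df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                            rural_res, df['FlowAmount'])
# df['FlowAmount'] -> [365800.0, 123.4]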
Example #3
def add_non_naics_sectors(sector_list, sector_level):
    """
    Add non-NAICS sectors (e.g. household codes) used alongside NAICS
    :param sector_list: list, NAICS sectors
    :param sector_level: str, NAICS level, e.g. 'NAICS_6'
    :return: list, sectors with non-NAICS codes appended
    """

    # load non-NAICS sectors used with NAICS
    household = load_household_sector_codes()
    household = household.loc[household['NAICS_Level_to_Use_For'] == sector_level]
    # add household sector to sector list
    sector_list.extend(household['Code'].tolist())
    # add "None" to sector list so don't lose rows when filtering df to match sector length
    # sector_list.extend(["None"])

    return sector_list
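
# Hypothetical usage, assuming load_household_sector_codes() returns a df
# with 'Code' and 'NAICS_Level_to_Use_For' columns as elsewhere in these
# examples; the sector list and level below are illustrative only:
sectors = add_non_naics_sectors(['111110', '111120'], 'NAICS_6')
# sectors now also contains any household codes flagged for NAICS_6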
Example #4
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the crosswalks.
    :param df: df, activity-to-sector crosswalk with a 'Sector' column
    :param sectorsourcename: str, sector source name, e.g. 'NAICS_2012_Code'
    :return: df, crosswalk with each sector expanded to its child sectors
    """

    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]
    # drop duplicates
    sectors = sectors.drop_duplicates().dropna()
    # add non-naics to sector list
    household = load_household_sector_codes()
    household = pd.DataFrame(household['Code'].drop_duplicates())
    household.columns = [sectorsourcename]
    sectors = sectors.append(
        household, sort=False).drop_duplicates().reset_index(drop=True)
    # drop rows that contain hyphenated sectors
    sectors = sectors[~sectors[sectorsourcename].str.
                      contains("-")].reset_index(drop=True)
    # Ensure 'None' not added to sectors
    sectors = sectors[sectors[sectorsourcename] != "None"]

    # create list of sectors that exist in the original df; each is matched
    # below to the crosswalk codes it is a prefix of (its child sectors)
    existing_sectors = df[['Sector']]
    existing_sectors = existing_sectors.drop_duplicates()

    naics_df = pd.DataFrame([])
    for i in existing_sectors['Sector']:
        dig = len(str(i))
        n = sectors.loc[
            sectors[sectorsourcename].apply(lambda x: x[0:dig]) == i]
        if len(n) != 0:
            n = n.assign(Sector=i)
            naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[[
        'ActivitySourceName', 'Activity', 'Sector', 'SectorType'
    ]]

    return naics_expanded
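
# A toy illustration (not flowsa code) of the prefix matching used above:
# a sector in the original df matches every master-crosswalk code that
# begins with the same digits.
import pandas as pd

sectors = pd.DataFrame(
    {'NAICS_2012_Code': ['1111', '111110', '111120', '112111']})
i = '1111'
dig = len(str(i))
children = sectors.loc[
    sectors['NAICS_2012_Code'].apply(lambda x: x[0:dig]) == i]
# children -> '1111', '111110', '111120'; '112111' is excluded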
Example #5
# concat df list and drop duplicates
missing_naics_df = pd.concat(
    missing_naics_df_list, ignore_index=True,
    sort=False).drop_duplicates().reset_index(drop=True)
missing_naics_df = missing_naics_df[
    missing_naics_df['NAICS_2012_Code'] != 'None']
# sort df
missing_naics_df = missing_naics_df.sort_values(
    ['NAICS_2012_Code', 'NAICS_2007_Code'])
missing_naics_df = missing_naics_df.reset_index(drop=True)

# add missing naics to master naics crosswalk
total_naics = naics.append(missing_naics_df, ignore_index=True)

# append household codes
household = load_household_sector_codes()
h = household['Code'].drop_duplicates().tolist()
for i in h:
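    # only append a household code if it is not already in the crosswalk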
    if (total_naics['NAICS_2012_Code'] != i).all():
        total_naics = total_naics.append(
            {
                'NAICS_2007_Code': np.nan,
                'NAICS_2012_Code': i,
                'NAICS_2017_Code': np.nan
            },
            ignore_index=True)

# sort df
total_naics = total_naics.sort_values(['NAICS_2012_Code',
                                       'NAICS_2007_Code']).drop_duplicates()
total_naics = total_naics[~total_naics['NAICS_2012_Code'].isin(
    ['None'])].reset_index(drop=True)
Example #6
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: str, name of the flowbysector method yaml
    :return: None, the flowbysector df is saved as a parquet file
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert units (todo: think about unit conversion here)
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to geoscale want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # pad location column with zeros if necessary (e.g. a 2-digit
            # state FIPS becomes a 5-digit code)
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, filter the allocation df to rows at its native geoscale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df; rows in
                # the flow df with sectors not in this list are dropped
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset the flow df to the sectors that overlap with the
                # allocation df
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced by and sector consumed
                # by from the flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # consolidate the two flowamountratio columns into one
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
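
# Hypothetical invocation (the method name below is illustrative; it must
# match a flowbysector method yaml available to flowsa):
if __name__ == '__main__':
    main(method_name='Water_national_2015_m1')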