示例#1
0
def getFlowByActivity(datasource,
                      year,
                      flowclass=None,
                      geographic_level=None,
                      download_FBA_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
                             Optional. E.g. 'national', 'state', 'county'.
    :param download_FBA_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format, or None if the
        FBA could not be loaded, downloaded, or generated
    """
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of FBA
    fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work, try to download a remote version of FBA
    if fba is None and download_FBA_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBA
    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
    # If none of the above worked, log an error and return None early:
    # subsetting below would otherwise raise a TypeError on a None fba
    if fba is None:
        log.error('getFlowByActivity failed, FBA not found')
        return None
    log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
示例#2
0
def check_activities_sector_like(sourcename_load):
    """
    Check if the activities in a df are sector-like,
    if cannot find the sourcename in the source catalog, drop extensions on
    the source name
    :param sourcename_load: str, the source name as loaded
    :return: bool, value of 'sector-like_activities' in the source catalog,
        or False if the source cannot be found in the catalog
    """
    sourcename = return_true_source_catalog_name(sourcename_load)

    try:
        sectorLike = load_yaml_dict(
            'source_catalog')[sourcename]['sector-like_activities']
    except KeyError:
        # Pass datapath as a lazy %-arg rather than mixing f-string and
        # %-style interpolation in one log call
        log.error('%s or %s not found in %ssource_catalog.yaml',
                  sourcename_load, sourcename, datapath)
        # Previously sectorLike was left unbound here, which raised an
        # UnboundLocalError at the return; fall back to False instead
        sectorLike = False

    return sectorLike
示例#3
0
def getFlowBySector(methodname,
                    download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
                    download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Load stored FlowBySector output, generating it first if it does not
    already exist.
    :param methodname: string, Name of an available method for the given class
    :param download_FBAs_if_missing: bool, if True will attempt to load FBAS
        used in generating the FBS from remote server prior to generating if
        file not found locally
    :param download_FBS_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    fbs_meta = set_fb_meta(methodname, "FlowBySector")

    # Attempt 1: a locally stored copy
    fbs = load_preprocessed_output(fbs_meta, paths)

    # Attempt 2: download from the remote server, when permitted
    if fbs is None and download_FBS_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        # fetch the FBS parquet (and its log file) then reload
        download_from_remote(fbs_meta, paths,
                             subdirectory_dict={'.log': 'Log'})
        fbs = load_preprocessed_output(fbs_meta, paths)

    # Attempt 3: generate the FBS locally, optionally pulling any
    # required FBAs from Data Commons
    if fbs is None:
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        flowsa.flowbysector.main(
            method=methodname,
            download_FBAs_if_missing=download_FBAs_if_missing)
        fbs = load_preprocessed_output(fbs_meta, paths)

    # Report the final outcome
    if fbs is None:
        log.error('getFlowBySector failed, FBS not found')
    else:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
示例#4
0
def filter_by_geoscale(df, geoscale):
    """
    Filter flowbyactivity by FIPS at the given scale
    :param df: Either flowbyactivity or flowbysector
    :param geoscale: string, either 'national', 'state', or 'county'
    :return: filtered flowbyactivity or flowbysector; None when no rows
        match the requested scale
    """
    # FIPS codes belonging to the requested geographic scale
    fips_list = create_geoscale_list(df, geoscale)

    filtered = df[df['Location'].isin(fips_list)].reset_index(drop=True)

    if len(filtered) == 0:
        # nothing at this scale: log and return None (the original
        # returned None implicitly by falling off the end)
        log.error("No flows found in the flow dataset at the %s scale",
                  geoscale)
        return None
    return filtered
示例#5
0
def load_yaml_dict(filename, flowbytype=None):
    """
    Load the information in a yaml file, from source_catalog, or FBA,
    or FBS files
    :param filename: str, name of the yaml file (without extension)
    :param flowbytype: str, 'FBA' or 'FBS'; ignored when filename is
        'source_catalog'
    :return: dictionary containing all information in yaml
    :raises KeyError: if flowbytype is neither 'FBA' nor 'FBS'
    :raises IOError: if the yaml file cannot be opened
    """
    if filename == 'source_catalog':
        folder = datapath
    else:
        if flowbytype == 'FBA':
            folder = sourceconfigpath
        elif flowbytype == 'FBS':
            folder = flowbysectormethodpath
        else:
            raise KeyError('Must specify either \'FBA\' or \'FBS\'')
    yaml_path = folder + filename + '.yaml'

    try:
        with open(yaml_path, 'r') as f:
            config = yaml.safe_load(f)
    except IOError:
        log.error('%s method file not found', flowbytype)
        # re-raise: previously execution continued past the except and
        # hit a NameError on the undefined 'config' below
        raise

    # Allow for .yaml files to recursively inherit other .yaml files. Keys in
    # children will overwrite the same key from a parent.
    inherits = config.get('inherits_from')
    while inherits:
        yaml_path = folder + inherits + '.yaml'
        with open(yaml_path, 'r') as f:
            parent = yaml.safe_load(f)

        # Check for common keys and log a warning if any are found
        common_keys = [k for k in config if k in parent]
        if common_keys:
            log.warning(f'Keys {common_keys} from parent file {yaml_path} '
                        f'were overwritten by child file.')

        # Update inheritance information before updating the parent dict
        inherits = parent.get('inherits_from')
        parent.update(config)
        config = parent

    return config
示例#6
0
def load_api_key(api_source):
    """
    Loads an API Key from "API_Keys.env" file using the
    'api_name' defined in the FBA source config file. The '.env' file contains
    the users personal API keys. The user must register with this
    API and get the key and manually add to "API_Keys.env"

    See wiki for how to get an api:
    https://github.com/USEPA/flowsa/wiki/Using-FLOWSA#api-keys

    :param api_source: str, name of source, like 'BEA' or 'Census'
    :return: the users API key as a string, or None if not registered
    """
    # pull the user's keys into the environment, then look the source up
    load_dotenv(f'{MODULEPATH}API_Keys.env', verbose=True)
    api_key = os.getenv(api_source)
    if api_key is None:
        log.error(f"Key file {api_source} not found. See github wiki for help "
                  "https://github.com/USEPA/flowsa/wiki/Using-FLOWSA#api-keys")
    return api_key
示例#7
0
def collapse_activity_fields(df):
    """
    The 'activityconsumedby' and 'activityproducedby' columns from the
    allocation dataset do not always align with
    the dataframe being allocated. Generalize the allocation activity column.
    :param df: df, FBA used to allocate another FBA
    :return: df, single Activity column
    """
    df = replace_strings_with_NoneType(df)

    # unique values in each of the paired activity columns
    consumed_vals = df['ActivityConsumedBy'].drop_duplicates().values.tolist()
    produced_vals = df['ActivityProducedBy'].drop_duplicates().values.tolist()
    consumed_all_none = all(v is None for v in consumed_vals)
    produced_all_none = all(v is None for v in produced_vals)

    # whichever activity column is entirely empty is dropped (with its
    # sector twin); the remaining pair is renamed to the generic names
    if consumed_all_none:
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif produced_all_none:
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')

    # drop other columns
    return df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])
示例#8
0
def getFIPS(state=None, county=None, year='2015'):
    """
    Pass a state or state and county name to get the FIPS.

    :param state: str. A US State Name or Puerto Rico, any case accepted
    :param county: str. A US county
    :param year: str. '2010', '2013', '2015', default year is 2015
    :return: str. A five digit FIPS code
    """
    FIPS_df = read_stored_FIPS(year)

    # default code
    code = None

    if county is None:
        if state is not None:
            state = clean_str_and_capitalize(state)
            code = FIPS_df.loc[(FIPS_df["State"] == state)
                               & (FIPS_df["County"].isna()), "FIPS"]
        else:
            log.error("To get state FIPS, state name must be passed in "
                      "'state' param")
    else:
        if state is None:
            log.error("To get county FIPS, state name must be passed in "
                      "'state' param")
        else:
            state = clean_str_and_capitalize(state)
            county = clean_str_and_capitalize(county)
            code = FIPS_df.loc[(FIPS_df["State"] == state)
                               & (FIPS_df["County"] == county), "FIPS"]
    # code is still None when required params were missing; previously
    # calling .empty on None raised an AttributeError here
    if code is None or code.empty:
        log.error("No FIPS code found")
    else:
        code = code.values[0]

    return code
示例#9
0
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    # (these optional keys configure how the helper FBA is subset/cleaned)
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on: whichever of the two
    # sector columns is NOT entirely None is used as the merge key
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        # NOTE(review): sector_col_to_merge is left unbound on this path,
        # so the merges below would raise a NameError — presumably this
        # branch is not expected to be reached; confirm with callers
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        # state-level helper onto county-level data: merge on the
        # two-digit state prefix of the FIPS Location code
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        # national helper onto sub-national data: merge on sector only
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        # same geographic scale: merge on both Location and sector
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')
        # load bea codes that sub for naics
        bea = return_bea_codes_used_as_naics()
        # replace sector column and helperflow value if the sector column to
        # merge is in the bea list to prevent dropped data
        modified_fba_allocation['Sector'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation[sector_col_to_merge],
                     modified_fba_allocation['Sector'])
        modified_fba_allocation['HelperFlow'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation['FlowAmount'],
                     modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0,
        # so after multiplying, don't have incorrect value associated with
        # new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        # scale FlowAmount by each row's share of the helper flow within
        # its location/activity group
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                           modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # after multiplying by a per-employee helper, the flow is a plain
    # quantity again; relabel the unit accordingly
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        # dynamically resolve and call the datasource-specific scaling
        # function named in the method yaml
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation
示例#10
0
def blm_pls_call(*, resp, year, **_):
    """
    Convert response for calling url to pandas dataframe, begin parsing
    df into FBA format
    :param resp: Response, response from url call
    :param year: year
    :return: pandas dataframe of original source data
    """
    df = pd.DataFrame()
    sub_headers = {}

    skip = False
    last_row_header = ""
    next_line = False
    copy = False
    location_str = []
    flow_value = []
    flow_name = []
    number_of_sub_headers = 0

    duplicate_headers = [
        "Pre-Reform Act Future Interest Leases", "Reform Act Leases",
        "Reform Act Future Interest Leases"
    ]

    if year == "2007":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [99],
                "Acquired Lands": [99]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [100, 109, 110]
            },
            "Reform Act Leases": {
                "Public Domain": [101, 110],
                "Acquired Lands": [101, 102]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [111]
            },
            "Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [103],
                "Acquired Lands": [112]
            },
            "Competitive General Services Administration (GSA) "
            "Oil & Gas Leases": {
                "Public Domain": [103]
            },
            "Competitive Protective Leases": {
                "Public Domain & Acquired Lands": [103]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [104]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [104]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [106],
                "Acquired Lands": [106, 107]
            },
            "Pre-Reform Act Simultaneous Leases": {
                "Acquired Lands": [108, 109]
            },
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain & Acquired Lands": [109]
            },
            "Geothermal Leases": {
                "Public Domain & Acquired Lands": [112]
            },
            "Private Leases": {
                "Acquired Lands": [114]
            },
            "Exchange Leases": {
                "Public Domain": [114]
            },
            "Renewal Leases": {
                "Public Domain": [114]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [115]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [115]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation "
            "Act of 1934": {
                "Acquired Lands": [115]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [115]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [122],
                "Competitive Pre-Federal Coal Leasing Amendment Act "
                "(FCLAA) Leases": [122],
                "Competitive Regional Emergency/Bypass Leases": [122],
                "Competitive Regional Leases": [123],
                "Exchange Leases": [123],
                "Preference Right Leases": [123]
            },
            "Coal Licenses": {
                "Exploration Licenses": [124],
                "Licenses to Mine": [124]
            },
            "Logical Mining Units": {
                "None": [124]
            },
            "Combined Hydrocarbon Leases": {
                "None": [126]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [126],
                "Phosphate Fringe Acreage Noncompetitive Leases": [126],
                "Phosphate Preference Right Leases": [126]
            },
            "Phosphate Use Permits": {
                "None": [127]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [127],
                "Sodium Fringe Acreage Noncompetitive Leases": [127],
                "Sodium Preference Right Leases": [127]
            },
            "Sodium Use Permit": {
                "None": [127]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [128],
                "Potassium Fringe Acreage Noncompetitive Leases": [128],
                "Potassium Preference Right Leases": [128]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [128],
                "Gilsonite Fringe Acreage Noncompetitive Lease": [129],
                "Gilsonite Preference Right Leases": [129]
            },
            "Oil Shale Leases": {
                "Oil Shale R, D&D Leases": [129]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [130]
            },
            "Asphalt Competitive Leases": {
                "None": [130]
            }
        }
        competitive_page_numbers = [100, 101, 102]
        no_header_page_numbers = [123, 129]
    elif year == "2011":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [111],
                "Acquired Lands": [111, 112]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [113, 122]
            },
            "Reform Act Leases": {
                "Public Domain": [113, 123],
                "Acquired Lands": [123, 124]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [114]
            },
            "Competitive General Services Administration (GSA) "
            "Oil and Gas Leases": {
                "Public Domain": [116]
            },
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [116]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [116]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [116]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [119],
                "Acquired Lands": [119]
            },
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Acquired Lands": [120, 121]
            },
            "Summary:  Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [122]
            },
            "Reform Act Future Interest Leases": {
                "Acquired Lands": [125]
            },
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [125]
            },
            "Private Leases": {
                "Acquired Lands": [126]
            },
            "Exchange Leases": {
                "Public Domain": [126]
            },
            "Renewal Leases": {
                "Public Domain": [126, 127]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [127]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [127, 128]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage "
            "Corporation Act of 1934": {
                "Acquired Lands": [128]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [128]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [135],
                "Competitive Pre-Federal Coal Leasing Amendment "
                "Act (FCLAA) Leases": [135],
                "Competitive Regional Emergency/Bypass Leases": [135],
                "Competitive Regional Leases": [136],
                "Exchange Leases": [136],
                "Preference Right Leases": [136]
            },
            "Coal Licenses": {
                "Exploration Licenses": [137],
                "Licenses To Mine": [137]
            },
            "Logical Mining Units": {
                "None": [137]
            },
            "Combined Hydrocarbon Leases": {
                "None": [139]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [139],
                "Phosphate Fringe Acreage Noncompetitive Leases": [139],
                "Phosphate Preference Right Leases": [139]
            },
            "Phosphate Use Permits": {
                "None": [139]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [140],
                "Sodium Fringe Acreage Noncompetitive Leases": [140],
                "Sodium Preference Right Leases": [140]
            },
            "Sodium Use Permit": {
                "None": [140]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [141],
                "Potassium Fringe Acreage Noncompetitive Leases": [141],
                "Potassium Preference Right Leases": [141]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [142],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [142],
                "Gilsonite Preference Right Leases": [142]
            },
            "Oil Shale RD&D Leases": {
                "None": [142]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [143]
            }
        }
        competitive_page_numbers = [113, 114]
        no_header_page_numbers = [136]
    elif year == "2012":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [108],
                "Acquired Lands": [108, 109]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [110, 119]
            },
            "Reform Act Leases": {
                "Public Domain": [110, 120],
                "Acquired Lands": [110]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [111]
            },
            "Competitive General Services Administration (GSA) "
            "Oil and Gas Leases": {
                "Public Domain": [113]
            },
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [113]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [113]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [113]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [116],
                "Acquired Lands": [116]
            },
            "Pre-Reform Act Simultaneous Leases": {
                "Public Domain": [117]
            },
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Public Domain": [118],
                "Acquired Lands": [118]
            },
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [119]
            },
            "Reform Act Future Interest Leases": {
                "Acquired Lands": [122]
            },
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [122]
            },
            "Private Leases": {
                "Acquired Lands": [124]
            },
            "Exchange Leases": {
                "Public Domain": [124]
            },
            "Renewal Leases": {
                "Public Domain": [124, 125]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [125]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [125, 126]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation "
            "Act of 1934": {
                "Acquired Lands": [126]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [126]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [133],
                "Competitive Pre-Federal Coal Leasing Amendment Act "
                "(FCLAA) Leases": [133],
                "Competitive Regional Emergency/Bypass Leases": [133],
                "Competitive Regional Leases": [134],
                "Exchange Leases": [134],
                "Preference Right Leases": [134]
            },
            "Coal Licenses": {
                "Exploration Licenses": [135],
                "Licenses To Mine": [135]
            },
            "Logical Mining Units": {
                "None": [135]
            },
            "Combined Hydrocarbon Leases": {
                "None": [137]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [137],
                "Phosphate Fringe Acreage Noncompetitive Leases": [137],
                "Phosphate Preference Right Leases": [137]
            },
            "Phosphate Use Permits": {
                "None": [137]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [138],
                "Sodium Fringe Acreage Noncompetitive Leases": [138],
                "Sodium Preference Right Leases": [138]
            },
            "Sodium Use Permit": {
                "None": [138]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [139],
                "Potassium Fringe Acreage Noncompetitive Leases": [139],
                "Potassium Preference Right Leases": [139]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [140],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [140],
                "Gilsonite Preference Right Leases": [140]
            },
            "Oil Shale RD&D Leases": {
                "None": [140]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [141]
            }
        }
        competitive_page_numbers = [110, 111]
        no_header_page_numbers = [134]
    else:
        # provide reasoning for failure of parsing data
        log.error('Missing code specifying sub-headers, '
                  'add code to blm_pls_call()')

    for header in sub_headers:
        for sub_header in sub_headers[header]:
            pg = sub_headers[header][sub_header]
            pdf_pages = []
            for page_number in pg:
                found_header = False

                pdf_page = tabula.read_pdf(io.BytesIO(resp.content),
                                           pages=page_number,
                                           stream=True,
                                           guess=False)[0]

                if pdf_page.shape[1] == 1:
                    pdf_page.columns = ["one"]
                else:
                    pdf_page.columns = ["one", "two"]

                pdf_page.dropna(subset=["one"], inplace=True)
                # add col of page number
                pdf_page['page_no'] = page_number
                pdf_pages.append(pdf_page)

            for page in pdf_pages:
                for index, row in page.iterrows():
                    if " /" in row["one"]:
                        split_header = row["one"].split(" /")
                        split_row = split_header[0].strip()
                    else:
                        split_row = row["one"]
                    # if page_number in no_header_page_numbers:
                    if row['page_no'] in no_header_page_numbers:
                        # if pages in no_header_page_numbers:
                        found_header = True
                    if split_row == header:
                        found_header = True
                        last_row_header = header
                    if split_row == sub_header and last_row_header == header:
                        copy = True
                    elif sub_header == "None" and last_row_header == header:
                        copy = True

                    if copy and split_row != sub_header and \
                            split_row != header and found_header:
                        if "FISCAL" in row["one"] or row["one"].isdigit():
                            skip = True

                        if not skip:
                            if sub_header == "None":
                                sub_header = ""
                            lists = split(row, header, sub_header, next_line)
                            if header in duplicate_headers:
                                # if page_number in competitive_page_numbers:
                                if row['page_no'] in competitive_page_numbers:
                                    flow_name.append(f"Competitive {lists[1]}")
                                else:
                                    flow_name.append(f"Noncompetitive "
                                                     f"{lists[1]}")
                            else:
                                flow_name.append(lists[1])
                            location_str.append(lists[0])
                            flow_value.append(lists[2])
                            if next_line:
                                copy = False
                                next_line = False
                                header = "Nothing"
                            if "Total" in row["one"]:
                                row_one_str = ""
                                if any(i.isdigit() for i in row["one"]):
                                    #   row split based on space
                                    row_one_split = row["one"].split(" ")
                                    for r in row_one_split:
                                        if not any(d.isdigit() for d in r):
                                            row_one_str = row_one_str + " " + r
                                else:
                                    row_one_str = row["one"]

                                if pdf_page.shape[1] == 1 and \
                                        row["one"] == "Total":
                                    next_line = True
                                elif row_one_str.strip() == "Total" or \
                                        "Leases" in row["one"] or "None" in \
                                        row["one"]:
                                    number_of_sub_headers = \
                                        number_of_sub_headers + 1
                                    copy = False
                                    found_header = False
                                else:
                                    next_line = True
                        if sub_header + "—continued" in row["one"]:
                            skip = False

    df["LocationStr"] = location_str
    df["ActivityConsumedBy"] = flow_name
    df["FlowAmount"] = flow_value

    return df
示例#11
0
def allocate_by_sector(df_w_sectors, attr, allocation_method, group_cols,
                       **kwargs):
    """
    Create an allocation ratio for df
    :param df_w_sectors: df with column of sectors
    :param attr: dictionary, attributes of activity set
    :param allocation_method: currently written for 'proportional'
         and 'proportional-flagged'
    :param group_cols: columns on which to base aggregation and disaggregation
    :param kwargs: optional; 'flowSubsetMapped' (df containing a
        'disaggregate_flag' column) is required when allocation_method is
        'proportional-flagged'
    :return: df with FlowAmountRatio for each sector
    """
    # Defensive defaults: log.error() does not halt execution, so these names
    # must be bound even when the 'proportional-flagged' branch is skipped or
    # its required kwarg is missing. Previously this raised NameError when
    # df_w_sectors was empty under a non-flagged method.
    df_w_sectors_nonflagged = pd.DataFrame()
    flagged_names = []

    # first determine if there is a special case with how
    # the allocation ratios are created
    if allocation_method == 'proportional-flagged':
        # if the allocation method is flagged, subset sectors that are
        # flagged/notflagged, where nonflagged sectors have flowamountratio=1
        if kwargs != {}:
            if 'flowSubsetMapped' in kwargs:
                fsm = kwargs['flowSubsetMapped']
                flagged = fsm[fsm['disaggregate_flag'] == 1]
                # choose whichever sector column is actually populated
                if flagged['SectorProducedBy'].isna().all():
                    sector_col = 'SectorConsumedBy'
                else:
                    sector_col = 'SectorProducedBy'
                flagged_names = flagged[sector_col].tolist()

                nonflagged = fsm[fsm['disaggregate_flag'] == 0]
                nonflagged_names = nonflagged[sector_col].tolist()

                # subset the original df so rows of data that run through the
                # proportional allocation process are
                # sectors included in the flagged list
                df_w_sectors_nonflagged = df_w_sectors.loc[(
                    df_w_sectors[fbs_activity_fields[0]].isin(nonflagged_names)
                ) | (df_w_sectors[fbs_activity_fields[1]].
                     isin(nonflagged_names))].reset_index(drop=True)
                df_w_sectors_nonflagged = \
                    df_w_sectors_nonflagged.assign(FlowAmountRatio=1)

                df_w_sectors = \
                    df_w_sectors.loc[(df_w_sectors[fbs_activity_fields[0]]
                                      .isin(flagged_names)) |
                                     (df_w_sectors[fbs_activity_fields[1]]
                                      .isin(flagged_names)
                                      )].reset_index(drop=True)
            else:
                log.error('The proportional-flagged allocation '
                          'method requires a column "disaggregate_flag" '
                          'in the flow_subset_mapped df')

    # run sector aggregation fxn to determine total flowamount
    # for each level of sector
    if len(df_w_sectors) == 0:
        # nothing left to allocate proportionally; return the (possibly
        # empty) nonflagged rows, which already carry FlowAmountRatio=1
        return df_w_sectors_nonflagged
    else:
        df1 = sector_aggregation(df_w_sectors, group_cols)
        # run sector disaggregation to capture one-to-one
        # naics4/5/6 relationships
        df2 = sector_disaggregation(df1)

        # if statements for method of allocation
        # either 'proportional' or 'proportional-flagged'
        allocation_df = []
        if allocation_method in ('proportional', 'proportional-flagged'):
            allocation_df = proportional_allocation(df2, attr)
        else:
            log.error('Must create function for specified '
                      'method of allocation')

        if allocation_method == 'proportional-flagged':
            # drop rows where values are not in flagged names
            allocation_df =\
                allocation_df.loc[(allocation_df[fbs_activity_fields[0]]
                                   .isin(flagged_names)) |
                                  (allocation_df[fbs_activity_fields[1]]
                                   .isin(flagged_names)
                                   )].reset_index(drop=True)
            # concat the flagged and nonflagged dfs
            allocation_df = \
                pd.concat([allocation_df, df_w_sectors_nonflagged],
                          ignore_index=True).sort_values(['SectorProducedBy',
                                                          'SectorConsumedBy'])

        return allocation_df
示例#12
0
def return_activity_from_scale(df, provided_from_scale):
    """
    Determine the 'from scale' used for aggregation/df
    subsetting for each activity combo in a df
    :param df: flowbyactivity df
    :param provided_from_scale: str, The scale to use specified in method yaml
        ('national', 'state', or 'county')
    :return: df, FBA with column indicating the "from" geoscale to
        use for each row
    """

    # determine the unique combinations of activityproduced/consumedby
    unique_activities = unique_activity_names(df)
    # filter by geoscale
    fips = create_geoscale_list(df, provided_from_scale)
    # determine unique activities after subsetting by geoscale
    unique_activities_sub = \
        unique_activity_names(df[df['Location'].isin(fips)])

    # return df of the difference between unique_activities
    # and unique_activities2
    df_missing = dataframe_difference(unique_activities,
                                      unique_activities_sub,
                                      which='left_only')
    # return df of the similarities between unique_activities
    # and unique_activities2
    df_existing = dataframe_difference(unique_activities,
                                       unique_activities_sub,
                                       which='both')
    df_existing = df_existing.drop(columns='_merge')
    df_existing['activity_from_scale'] = provided_from_scale

    if len(df_missing) > 0:
        # for loop through geoscales until find data for each activity combo.
        # Default to an empty list so the loop is safely skipped when no
        # finer geoscale exists: log.error() does not halt execution, and
        # 'geoscales' was previously unbound in the 'county' case (NameError)
        geoscales = []
        if provided_from_scale == 'national':
            geoscales = ['state', 'county']
        elif provided_from_scale == 'state':
            geoscales = ['county']
        elif provided_from_scale == 'county':
            # no finer scale exists to fall back on
            log.error('Missing county level data')

        for i in geoscales:
            # filter by geoscale
            fips_i = create_geoscale_list(df, i)
            df_i = df[df['Location'].isin(fips_i)]

            # determine unique activities after subsetting by geoscale
            unique_activities_i = unique_activity_names(df_i)

            # return df of the difference between unique_activities subset and
            # unique_activities for geoscale
            df_missing_i = dataframe_difference(unique_activities_sub,
                                                unique_activities_i,
                                                which='right_only')
            df_missing_i = df_missing_i.drop(columns='_merge')
            df_missing_i['activity_from_scale'] = i
            # return df of the similarities between unique_activities
            # and unique_activities2
            df_existing_i = dataframe_difference(unique_activities_sub,
                                                 unique_activities_i,
                                                 which='both')

            # append unique activities and df with defined activity_from_scale
            # (pd.concat replaces DataFrame.append, removed in pandas 2.0;
            # indices are preserved as append's default did)
            unique_activities_sub = pd.concat(
                [unique_activities_sub,
                 df_missing_i[[fba_activity_fields[0],
                               fba_activity_fields[1]]]])
            df_existing = pd.concat([df_existing, df_missing_i])
            df_missing = dataframe_difference(
                df_missing[[fba_activity_fields[0], fba_activity_fields[1]]],
                df_existing_i[[fba_activity_fields[0],
                               fba_activity_fields[1]]],
                which=None)

    return df_existing
示例#13
0
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: optional; 'flowSubsetMapped' (df with sector columns,
        required when 'allocMethod' is 'proportional-flagged'),
        'allocMethod' (str), 'activity_set_names' (df with 'name' and
        optionally 'allocation_subset'/'allocation_subset_col' columns)
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset an example of a special case is when the
    # allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    # defensive defaults: these were previously unbound (NameError) if the
    # corresponding kwarg combinations were not supplied
    fsm = None
    asn = None
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    if check_activities_sector_like(source) is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(sector_list)
                     ) |
                    (fba_allocation[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols and fsm is not None:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                    drop=True)
            # collect sectors from both columns under a single 'Sector' label
            # (rename on the slice avoids pandas SettingWithCopy warnings)
            part1 = fsm_sub[['SectorConsumedBy']].rename(
                columns={'SectorConsumedBy': 'Sector'})
            part2 = fsm_sub[['SectorProducedBy']].rename(
                columns={'SectorProducedBy': 'Sector'})
            modified_activitynames = \
                pd.concat([part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].
                 isin(modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].
                 isin(modified_activitynames))].reset_index(drop=True)
        else:
            if subset_by_sector_cols:
                # previously this path raised NameError on the unbound 'fsm';
                # surface the misuse explicitly and fall through to the
                # activityname-based subset
                log.error('The proportional-flagged allocation method '
                          'requires the flowSubsetMapped kwarg')
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames)
                 )].reset_index(drop=True)

    # if activity set names included in function call and activity set names
    # is not null, then subset data based on value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = \
            asn[asn['name'].isin(activitynames)].reset_index(drop=True)
        if asn_subset['allocation_subset'].isna().all():
            # no subset values defined for this activity set; nothing to do
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error('Define column and value to subset on in the activity '
                      'set csv for all rows')
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset