def getFlowByActivity(datasource, year, flowclass=None,
                      geographic_level=None,
                      download_FBA_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
        Optional. E.g. 'national', 'state', 'county'.
    :param download_FBA_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of the FBA
    fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work, try to download a remote version of the FBA
    if fba is None and download_FBA_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBA
    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
    # If none of the above worked, log an error message
    if fba is None:
        log.error('getFlowByActivity failed, FBA not found')
    # Otherwise (that is, if one of the above methods successfully loaded
    # the FBA), log it.
    else:
        log.info('Loaded %s %s from %s', datasource, str(year),
                 fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
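# Example usage (a minimal sketch; the datasource code and year are
# illustrative, and the parquet is pulled from the remote server or
# generated on the fly if it is not cached locally):
#
#   >>> import flowsa
#   >>> fba = flowsa.getFlowByActivity(datasource='USGS_NWIS_WU', year=2015,
#   ...                                flowclass='Water',
#   ...                                geographic_level='state')
#   >>> fba[['ActivityConsumedBy', 'FlowAmount', 'Unit']].head()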
def check_activities_sector_like(sourcename_load):
    """
    Check if the activities in a df are sector-like; if the sourcename
    cannot be found in the source catalog, drop extensions on the
    source name
    """
    sourcename = return_true_source_catalog_name(sourcename_load)
    try:
        sectorLike = load_yaml_dict(
            'source_catalog')[sourcename]['sector-like_activities']
    except KeyError:
        log.error('%s or %s not found in %ssource_catalog.yaml',
                  sourcename_load, sourcename, datapath)
        # fall back to False so the function always returns a defined value
        sectorLike = False
    return sectorLike
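# Example (hypothetical call; 'BLS_QCEW' is a source whose activities are
# NAICS-like, but the return value depends on the 'sector-like_activities'
# entry in the packaged source_catalog.yaml):
#
#   >>> check_activities_sector_like('BLS_QCEW')
#   True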
def getFlowBySector(methodname,
                    download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
                    download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Loads stored FlowBySector output or generates it if it doesn't exist,
    then loads
    :param methodname: string, Name of an available method for the
        given class
    :param download_FBAs_if_missing: bool, if True will attempt to load the
        FBAs used in generating the FBS from remote server prior to
        generating if file not found locally
    :param download_FBS_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    fbs_meta = set_fb_meta(methodname, "FlowBySector")
    # Try to load a local version of the FBS
    fbs = load_preprocessed_output(fbs_meta, paths)
    # If that didn't work, try to download a remote version of the FBS
    if fbs is None and download_FBS_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        # download and load the FBS parquet
        subdirectory_dict = {'.log': 'Log'}
        download_from_remote(fbs_meta, paths,
                             subdirectory_dict=subdirectory_dict)
        fbs = load_preprocessed_output(fbs_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBS
    if fbs is None:
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        # Generate the fbs, with option to download any required FBAs from
        # Data Commons
        flowsa.flowbysector.main(
            method=methodname,
            download_FBAs_if_missing=download_FBAs_if_missing)
        # Now load the fbs
        fbs = load_preprocessed_output(fbs_meta, paths)
    # If none of the above worked, log an error message
    if fbs is None:
        log.error('getFlowBySector failed, FBS not found')
    # Otherwise (that is, if one of the above methods successfully loaded
    # the FBS), log it.
    else:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
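# Example usage (a minimal sketch; the method name is illustrative and must
# match an FBS method yaml shipped with the package):
#
#   >>> import flowsa
#   >>> fbs = flowsa.getFlowBySector('Water_national_2015_m1',
#   ...                              download_FBAs_if_missing=True)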
def filter_by_geoscale(df, geoscale):
    """
    Filter a flowbyactivity or flowbysector df by FIPS at the given scale
    :param df: either a flowbyactivity or flowbysector df
    :param geoscale: string, either 'national', 'state', or 'county'
    :return: filtered flowbyactivity or flowbysector df
    """
    fips = create_geoscale_list(df, geoscale)

    df = df[df['Location'].isin(fips)].reset_index(drop=True)

    if len(df) == 0:
        log.error("No flows found in the flow dataset at the %s scale",
                  geoscale)
        # return None explicitly so the failure is visible to callers
        return None
    return df
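# Example (sketch, assuming fba is a FlowByActivity df): keep only rows
# whose Location is a state-level FIPS code:
#
#   >>> state_fba = filter_by_geoscale(fba, 'state')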
def load_yaml_dict(filename, flowbytype=None):
    """
    Load the information in a yaml file, from source_catalog, or FBA,
    or FBS files
    :return: dictionary containing all information in the yaml
    """
    if filename == 'source_catalog':
        folder = datapath
    else:
        if flowbytype == 'FBA':
            folder = sourceconfigpath
        elif flowbytype == 'FBS':
            folder = flowbysectormethodpath
        else:
            raise KeyError("Must specify either 'FBA' or 'FBS'")
    yaml_path = folder + filename + '.yaml'

    try:
        with open(yaml_path, 'r') as f:
            config = yaml.safe_load(f)
    except IOError:
        log.error('%s method file not found', flowbytype)
        # re-raise: without a config dict there is nothing to return
        raise

    # Allow for .yaml files to recursively inherit other .yaml files. Keys in
    # children will overwrite the same key from a parent.
    inherits = config.get('inherits_from')
    while inherits:
        yaml_path = folder + inherits + '.yaml'
        with open(yaml_path, 'r') as f:
            parent = yaml.safe_load(f)

        # Check for common keys and log a warning if any are found
        common_keys = [k for k in config if k in parent]
        if common_keys:
            log.warning(f'Keys {common_keys} from parent file {yaml_path} '
                        f'were overwritten by the child file.')

        # Update inheritance information before updating the parent dict
        inherits = parent.get('inherits_from')
        parent.update(config)
        config = parent

    return config
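# Example of the 'inherits_from' mechanism (hypothetical file names and
# keys). Given a parent method file 'Water_common.yaml':
#
#     target_sector_level: NAICS_6
#     target_geoscale: national
#
# and a child file 'Water_national_2015_m1.yaml':
#
#     inherits_from: Water_common
#     target_geoscale: state
#
# loading the child merges the two, with the child winning on conflicts:
#
#   >>> load_yaml_dict('Water_national_2015_m1', flowbytype='FBS')
#   {'target_sector_level': 'NAICS_6', 'target_geoscale': 'state',
#    'inherits_from': 'Water_common'}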
def load_api_key(api_source):
    """
    Loads an API key from the "API_Keys.env" file using the 'api_name'
    defined in the FBA source config file. The '.env' file contains the
    user's personal API keys. The user must register with the API provider,
    obtain a key, and manually add it to "API_Keys.env".

    See the wiki for how to get an API key:
    https://github.com/USEPA/flowsa/wiki/Using-FLOWSA#api-keys
    :param api_source: str, name of source, like 'BEA' or 'Census'
    :return: the user's API key as a string
    """
    load_dotenv(f'{MODULEPATH}API_Keys.env', verbose=True)
    key = os.getenv(api_source)
    if key is None:
        log.error(f"Key for {api_source} not found. See the github wiki "
                  "for help: "
                  "https://github.com/USEPA/flowsa/wiki/Using-FLOWSA#api-keys")
    return key
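# Example (sketch): with a line such as
#
#     BEA=abc123yourkeyhere
#
# in API_Keys.env (the variable name matching the 'api_name' in the source
# config), the key is retrieved with:
#
#   >>> key = load_api_key('BEA')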
def collapse_activity_fields(df):
    """
    The 'ActivityConsumedBy' and 'ActivityProducedBy' columns from the
    allocation dataset do not always align with the dataframe being
    allocated. Generalize the allocation activity columns.
    :param df: df, FBA used to allocate another FBA
    :return: df, with a single Activity column
    """
    df = replace_strings_with_NoneType(df)

    activity_consumed_list = \
        df['ActivityConsumedBy'].drop_duplicates().values.tolist()
    activity_produced_list = \
        df['ActivityProducedBy'].drop_duplicates().values.tolist()

    # if an activity field column is all 'None', drop the column and
    # rename the remaining activity columns to generalize
    if all(v is None for v in activity_consumed_list):
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif all(v is None for v in activity_produced_list):
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')

    # drop other columns
    df = df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])

    return df
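# Worked sketch (illustrative values): a helper FBA in which only the
# consumed-by columns are populated collapses to the generic columns:
#
#   Before: ActivityProducedBy=None, ActivityConsumedBy='Irrigation',
#           SectorProducedBy=None,   SectorConsumedBy='111'
#   After:  Activity='Irrigation',   Sector='111'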
def getFIPS(state=None, county=None, year='2015'):
    """
    Pass a state, or a state and county name, to get the FIPS code.
    :param state: str. A US State Name or Puerto Rico, any case accepted
    :param county: str. A US county
    :param year: str. '2010', '2013', or '2015'; default year is '2015'
    :return: str. A five-digit FIPS code
    """
    FIPS_df = read_stored_FIPS(year)

    # default code
    code = None
    if county is None:
        if state is not None:
            state = clean_str_and_capitalize(state)
            code = FIPS_df.loc[(FIPS_df["State"] == state) &
                               (FIPS_df["County"].isna()), "FIPS"]
        else:
            log.error("To get state FIPS, state name must be passed in "
                      "'state' param")
    else:
        if state is None:
            log.error("To get county FIPS, state name must be passed in "
                      "'state' param")
        else:
            state = clean_str_and_capitalize(state)
            county = clean_str_and_capitalize(county)
            code = FIPS_df.loc[(FIPS_df["State"] == state) &
                               (FIPS_df["County"] == county), "FIPS"]
    # guard against a missing or empty result before indexing into it
    if code is None or code.empty:
        log.error("No FIPS code found")
        return None
    return code.values[0]
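# Example usage (values depend on the stored FIPS table for the given year):
#
#   >>> getFIPS(state='Delaware')
#   '10000'
#   >>> getFIPS(state='Delaware', county='Kent')
#   '10001'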
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using a secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs should be
        downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if they exist in the method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = load_map_clean_fba(
        method, attr, fba_sourcename=attr['helper_source'],
        df_year=attr['helper_source_year'],
        flowclass=attr['helper_source_class'],
        geoscale_from=attr['helper_from_scale'],
        geoscale_to=v['geoscale_to_use'],
        download_FBA_if_missing=download_FBA_if_missing,
        **fba_dict)

    # run sector disaggregation to capture any missing lower-level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'None', that is the column to
    # merge on
    if all(s is None for s in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(s is None for s in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column on which to base the '
                  'merge with the helper allocation dataset')
        # raise rather than continue with an undefined merge column
        raise ValueError('No clear sector column to merge on')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
            how='left',
            left_on=['Location_tmp', sector_col_to_merge],
            right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'HelperFlow']],
            how='left',
            left_on=[sector_col_to_merge],
            right_on=['Sector'])
    else:
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Location', 'Sector', 'HelperFlow']],
            left_on=['Location', sector_col_to_merge],
            right_on=['Location', 'Sector'],
            how='left')

    # load bea codes that sub for naics
    bea = return_bea_codes_used_as_naics()
    # replace the sector column and helperflow value if the sector column to
    # merge on is in the bea list, to prevent dropped data
    modified_fba_allocation['Sector'] = np.where(
        modified_fba_allocation[sector_col_to_merge].isin(bea),
        modified_fba_allocation[sector_col_to_merge],
        modified_fba_allocation['Sector'])
    modified_fba_allocation['HelperFlow'] = np.where(
        modified_fba_allocation[sector_col_to_merge].isin(bea),
        modified_fba_allocation['FlowAmount'],
        modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national-level values
        replacement_values = helper_allocation[
            helper_allocation['Location'] == US_FIPS].reset_index(drop=True)
        replacement_values = replacement_values.rename(
            columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(
            modified_fba_allocation['HelperFlow'] == 0,
            modified_fba_allocation['ReplacementValue'],
            modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0, so that after
        # multiplying, an incorrect value is not associated with a new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation = \
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] = \
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and the 'flagged' column
        modified_fba_allocation = modified_fba_allocation.assign(
            Denominator=modified_fba_allocation.groupby(
                ['FlowName', 'ActivityConsumedBy', 'Location',
                 'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmount=modified_fba_allocation['FlowAmount'] *
            modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                     'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = sector_aggregation(
            modified_fba_allocation, fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[
        modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = dynamically_import_fxn(
            attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)

    return modified_fba_allocation
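# Worked sketch of the 'multiplication' helper method (illustrative
# numbers): if an allocation row has FlowAmount = 2.0 gal/employee and the
# merged HelperFlow is 50 employees, the modified value is
# 2.0 * 50 = 100, and the 'gal/employee' unit is then rewritten to 'gal'
# as in the code above.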
def blm_pls_call(*, resp, year, **_):
    """
    Convert response for calling url to pandas dataframe, begin parsing
    df into FBA format
    :param resp: Response, response from url call
    :param year: str, year of data, e.g. '2012'
    :return: pandas dataframe of original source data
    """
    df = pd.DataFrame()
    sub_headers = {}
    skip = False
    last_row_header = ""
    next_line = False
    copy = False
    location_str = []
    flow_value = []
    flow_name = []
    number_of_sub_headers = 0

    duplicate_headers = [
        "Pre-Reform Act Future Interest Leases",
        "Reform Act Leases",
        "Reform Act Future Interest Leases"
    ]

    if year == "2007":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [99], "Acquired Lands": [99]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [100, 109, 110]},
            "Reform Act Leases": {
                "Public Domain": [101, 110], "Acquired Lands": [101, 102]},
            "Reform Act Leases—continued": {"Acquired Lands": [111]},
            "Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [103],
                "Acquired Lands": [112]},
            "Competitive General Services Administration (GSA) "
            "Oil & Gas Leases": {"Public Domain": [103]},
            "Competitive Protective Leases": {
                "Public Domain & Acquired Lands": [103]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [104]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [104]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [106], "Acquired Lands": [106, 107]},
            "Pre-Reform Act Simultaneous Leases": {
                "Acquired Lands": [108, 109]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain & Acquired Lands": [109]},
            "Geothermal Leases": {
                "Public Domain & Acquired Lands": [112]},
            "Private Leases": {"Acquired Lands": [114]},
            "Exchange Leases": {"Public Domain": [114]},
            "Renewal Leases": {"Public Domain": [114]},
            "Class III Reinstatement Leases": {"Public Domain": [115]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [115]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation "
            "Act of 1934": {"Acquired Lands": [115]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [115]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [122],
                "Competitive Pre-Federal Coal Leasing Amendment Act "
                "(FCLAA) Leases": [122],
                "Competitive Regional Emergency/Bypass Leases": [122],
                "Competitive Regional Leases": [123],
                "Exchange Leases": [123],
                "Preference Right Leases": [123]},
            "Coal Licenses": {
                "Exploration Licenses": [124], "Licenses to Mine": [124]},
            "Logical Mining Units": {"None": [124]},
            "Combined Hydrocarbon Leases": {"None": [126]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [126],
                "Phosphate Fringe Acreage Noncompetitive Leases": [126],
                "Phosphate Preference Right Leases": [126]},
            "Phosphate Use Permits": {"None": [127]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [127],
                "Sodium Fringe Acreage Noncompetitive Leases": [127],
                "Sodium Preference Right Leases": [127]},
            "Sodium Use Permit": {"None": [127]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [128],
                "Potassium Fringe Acreage Noncompetitive Leases": [128],
                "Potassium Preference Right Leases": [128]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [128],
                "Gilsonite Fringe Acreage Noncompetitive Lease": [129],
                "Gilsonite Preference Right Leases": [129]},
            "Oil Shale Leases": {"Oil Shale R, D&D Leases": [129]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [130]},
            "Asphalt Competitive Leases": {"None": [130]}
        }
        competitive_page_numbers = [100, 101, 102]
        no_header_page_numbers = [123, 129]
    elif year == "2011":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [111], "Acquired Lands": [111, 112]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [113, 122]},
            "Reform Act Leases": {
                "Public Domain": [113, 123], "Acquired Lands": [123, 124]},
            "Reform Act Leases—continued": {"Acquired Lands": [114]},
            "Competitive General Services Administration (GSA) "
            "Oil and Gas Leases": {"Public Domain": [116]},
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [116]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [116]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [116]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [119], "Acquired Lands": [119]},
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Acquired Lands": [120, 121]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [122]},
            "Reform Act Future Interest Leases": {"Acquired Lands": [125]},
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [125]},
            "Private Leases": {"Acquired Lands": [126]},
            "Exchange Leases": {"Public Domain": [126]},
            "Renewal Leases": {"Public Domain": [126, 127]},
            "Class III Reinstatement Leases": {"Public Domain": [127]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [127, 128]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation "
            "Act of 1934": {"Acquired Lands": [128]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [128]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [135],
                "Competitive Pre-Federal Coal Leasing Amendment Act "
                "(FCLAA) Leases": [135],
                "Competitive Regional Emergency/Bypass Leases": [135],
                "Competitive Regional Leases": [136],
                "Exchange Leases": [136],
                "Preference Right Leases": [136]},
            "Coal Licenses": {
                "Exploration Licenses": [137], "Licenses To Mine": [137]},
            "Logical Mining Units": {"None": [137]},
            "Combined Hydrocarbon Leases": {"None": [139]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [139],
                "Phosphate Fringe Acreage Noncompetitive Leases": [139],
                "Phosphate Preference Right Leases": [139]},
            "Phosphate Use Permits": {"None": [139]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [140],
                "Sodium Fringe Acreage Noncompetitive Leases": [140],
                "Sodium Preference Right Leases": [140]},
            "Sodium Use Permit": {"None": [140]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [141],
                "Potassium Fringe Acreage Noncompetitive Leases": [141],
                "Potassium Preference Right Leases": [141]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [142],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [142],
                "Gilsonite Preference Right Leases": [142]},
            "Oil Shale RD&D Leases": {"None": [142]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [143]}
        }
        competitive_page_numbers = [113, 114]
        no_header_page_numbers = [136]
    elif year == "2012":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [108], "Acquired Lands": [108, 109]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [110, 119]},
            "Reform Act Leases": {
                "Public Domain": [110, 120], "Acquired Lands": [110]},
            "Reform Act Leases—continued": {"Acquired Lands": [111]},
            "Competitive General Services Administration (GSA) "
            "Oil and Gas Leases": {"Public Domain": [113]},
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [113]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [113]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [113]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [116], "Acquired Lands": [116]},
            "Pre-Reform Act Simultaneous Leases": {"Public Domain": [117]},
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Public Domain": [118], "Acquired Lands": [118]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [119]},
            "Reform Act Future Interest Leases": {"Acquired Lands": [122]},
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [122]},
            "Private Leases": {"Acquired Lands": [124]},
            "Exchange Leases": {"Public Domain": [124]},
            "Renewal Leases": {"Public Domain": [124, 125]},
            "Class III Reinstatement Leases": {"Public Domain": [125]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [125, 126]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation "
            "Act of 1934": {"Acquired Lands": [126]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [126]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [133],
                "Competitive Pre-Federal Coal Leasing Amendment Act "
                "(FCLAA) Leases": [133],
                "Competitive Regional Emergency/Bypass Leases": [133],
                "Competitive Regional Leases": [134],
                "Exchange Leases": [134],
                "Preference Right Leases": [134]},
            "Coal Licenses": {
                "Exploration Licenses": [135], "Licenses To Mine": [135]},
            "Logical Mining Units": {"None": [135]},
            "Combined Hydrocarbon Leases": {"None": [137]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [137],
                "Phosphate Fringe Acreage Noncompetitive Leases": [137],
                "Phosphate Preference Right Leases": [137]},
            "Phosphate Use Permits": {"None": [137]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [138],
                "Sodium Fringe Acreage Noncompetitive Leases": [138],
                "Sodium Preference Right Leases": [138]},
            "Sodium Use Permit": {"None": [138]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [139],
                "Potassium Fringe Acreage Noncompetitive Leases": [139],
                "Potassium Preference Right Leases": [139]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [140],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [140],
                "Gilsonite Preference Right Leases": [140]},
            "Oil Shale RD&D Leases": {"None": [140]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [141]}
        }
        competitive_page_numbers = [110, 111]
        no_header_page_numbers = [134]
    else:
        # provide reasoning for failure of parsing data
        log.error('Missing code specifying sub-headers, '
                  'add code to blm_pls_call()')
        # return the empty df early; the page maps above are required
        # for parsing
        return df

    for header in sub_headers:
        for sub_header in sub_headers[header]:
            pg = sub_headers[header][sub_header]
            pdf_pages = []
            for page_number in pg:
                found_header = False
                pdf_page = tabula.read_pdf(io.BytesIO(resp.content),
                                           pages=page_number,
                                           stream=True,
                                           guess=False)[0]
                if pdf_page.shape[1] == 1:
                    pdf_page.columns = ["one"]
                else:
                    pdf_page.columns = ["one", "two"]
                pdf_page.dropna(subset=["one"], inplace=True)
                # add col of page number
                pdf_page['page_no'] = page_number
                pdf_pages.append(pdf_page)
            for page in pdf_pages:
                for index, row in page.iterrows():
                    if " /" in row["one"]:
                        split_header = row["one"].split(" /")
                        split_row = split_header[0].strip()
                    else:
                        split_row = row["one"]
                    if row['page_no'] in no_header_page_numbers:
                        found_header = True
                    if split_row == header:
                        found_header = True
                        last_row_header = header
                    if split_row == sub_header and last_row_header == header:
                        copy = True
                    elif sub_header == "None" and last_row_header == header:
                        copy = True
                    if copy and split_row != sub_header and \
                            split_row != header and found_header:
                        if "FISCAL" in row["one"] or row["one"].isdigit():
                            skip = True
                        if not skip:
                            if sub_header == "None":
                                sub_header = ""
                            lists = split(row, header, sub_header, next_line)
                            if header in duplicate_headers:
                                if row['page_no'] in competitive_page_numbers:
                                    flow_name.append(
                                        f"Competitive {lists[1]}")
                                else:
                                    flow_name.append(
                                        f"Noncompetitive {lists[1]}")
                            else:
                                flow_name.append(lists[1])
                            location_str.append(lists[0])
                            flow_value.append(lists[2])
                            if next_line:
                                copy = False
                                next_line = False
                                header = "Nothing"
                            if "Total" in row["one"]:
                                row_one_str = ""
                                if any(i.isdigit() for i in row["one"]):
                                    # row split based on space
                                    row_one_split = row["one"].split(" ")
                                    for r in row_one_split:
                                        if not any(d.isdigit() for d in r):
                                            row_one_str = \
                                                row_one_str + " " + r
                                else:
                                    row_one_str = row["one"]
                                if pdf_page.shape[1] == 1 and \
                                        row["one"] == "Total":
                                    next_line = True
                                elif row_one_str.strip() == "Total" or \
                                        "Leases" in row["one"] or \
                                        "None" in row["one"]:
                                    number_of_sub_headers = \
                                        number_of_sub_headers + 1
                                    copy = False
                                    found_header = False
                                else:
                                    next_line = True
                    if sub_header + "—continued" in row["one"]:
                        skip = False

    df["LocationStr"] = location_str
    df["ActivityConsumedBy"] = flow_name
    df["FlowAmount"] = flow_value
    return df
def allocate_by_sector(df_w_sectors, attr, allocation_method, group_cols,
                       **kwargs):
    """
    Create an allocation ratio for df
    :param df_w_sectors: df with column of sectors
    :param attr: dictionary, attributes of activity set
    :param allocation_method: currently written for 'proportional'
        and 'proportional-flagged'
    :param group_cols: columns on which to base aggregation
        and disaggregation
    :return: df with FlowAmountRatio for each sector
    """
    # first determine if there is a special case with how
    # the allocation ratios are created
    if allocation_method == 'proportional-flagged':
        # if the allocation method is flagged, subset sectors that are
        # flagged/notflagged, where nonflagged sectors have
        # FlowAmountRatio = 1
        if kwargs != {}:
            if 'flowSubsetMapped' in kwargs:
                fsm = kwargs['flowSubsetMapped']
                flagged = fsm[fsm['disaggregate_flag'] == 1]
                if flagged['SectorProducedBy'].isna().all():
                    sector_col = 'SectorConsumedBy'
                else:
                    sector_col = 'SectorProducedBy'
                flagged_names = flagged[sector_col].tolist()

                nonflagged = fsm[fsm['disaggregate_flag'] == 0]
                nonflagged_names = nonflagged[sector_col].tolist()

                # subset the original df so rows of data that run through
                # the proportional allocation process are sectors included
                # in the flagged list
                df_w_sectors_nonflagged = df_w_sectors.loc[
                    (df_w_sectors[fbs_activity_fields[0]]
                     .isin(nonflagged_names)) |
                    (df_w_sectors[fbs_activity_fields[1]]
                     .isin(nonflagged_names))].reset_index(drop=True)
                df_w_sectors_nonflagged = \
                    df_w_sectors_nonflagged.assign(FlowAmountRatio=1)

                df_w_sectors = df_w_sectors.loc[
                    (df_w_sectors[fbs_activity_fields[0]]
                     .isin(flagged_names)) |
                    (df_w_sectors[fbs_activity_fields[1]]
                     .isin(flagged_names))].reset_index(drop=True)
            else:
                log.error('The proportional-flagged allocation method '
                          'requires a column "disaggregate_flag" in the '
                          'flow_subset_mapped df')

    # run sector aggregation fxn to determine total flowamount
    # for each level of sector
    if len(df_w_sectors) == 0:
        return df_w_sectors_nonflagged
    else:
        df1 = sector_aggregation(df_w_sectors, group_cols)
        # run sector disaggregation to capture one-to-one
        # naics4/5/6 relationships
        df2 = sector_disaggregation(df1)

        # if statements for method of allocation,
        # either 'proportional' or 'proportional-flagged'
        allocation_df = []
        if allocation_method in ('proportional', 'proportional-flagged'):
            allocation_df = proportional_allocation(df2, attr)
        else:
            log.error('Must create function for specified '
                      'method of allocation')

        if allocation_method == 'proportional-flagged':
            # drop rows where values are not in flagged names
            allocation_df = allocation_df.loc[
                (allocation_df[fbs_activity_fields[0]]
                 .isin(flagged_names)) |
                (allocation_df[fbs_activity_fields[1]]
                 .isin(flagged_names))].reset_index(drop=True)
            # concat the flagged and nonflagged dfs
            allocation_df = pd.concat(
                [allocation_df, df_w_sectors_nonflagged],
                ignore_index=True).sort_values(
                    ['SectorProducedBy', 'SectorConsumedBy'])

        return allocation_df
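# Sketch of the proportional ratio the returned df carries (illustrative
# numbers): if sector 1111 has FlowAmount = 30 and its aggregate parent
# (sector 11) totals 120 in the same location, the proportional method
# assigns FlowAmountRatio = 30 / 120 = 0.25 to sector 1111; flagged rows get
# this treatment while nonflagged rows keep FlowAmountRatio = 1.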
def return_activity_from_scale(df, provided_from_scale):
    """
    Determine the 'from scale' used for aggregation/df subsetting for each
    activity combo in a df
    :param df: flowbyactivity df
    :param provided_from_scale: str, the scale specified in the method yaml
    :return: df, FBA with a column indicating the "from" geoscale to use
        for each row
    """
    # determine the unique combinations of activityproduced/consumedby
    unique_activities = unique_activity_names(df)
    # filter by geoscale
    fips = create_geoscale_list(df, provided_from_scale)
    # determine unique activities after subsetting by geoscale
    unique_activities_sub = \
        unique_activity_names(df[df['Location'].isin(fips)])

    # return df of the difference between unique_activities
    # and unique_activities_sub
    df_missing = dataframe_difference(
        unique_activities, unique_activities_sub, which='left_only')
    # return df of the similarities between unique_activities
    # and unique_activities_sub
    df_existing = dataframe_difference(
        unique_activities, unique_activities_sub, which='both')
    df_existing = df_existing.drop(columns='_merge')
    df_existing['activity_from_scale'] = provided_from_scale

    if len(df_missing) > 0:
        # loop through increasingly fine geoscales until data is found
        # for each activity combo
        if provided_from_scale == 'national':
            geoscales = ['state', 'county']
        elif provided_from_scale == 'state':
            geoscales = ['county']
        elif provided_from_scale == 'county':
            # no finer scale to fall back on
            log.error('Missing county level data')
            geoscales = []

        for i in geoscales:
            # filter by geoscale
            fips_i = create_geoscale_list(df, i)
            df_i = df[df['Location'].isin(fips_i)]

            # determine unique activities after subsetting by geoscale
            unique_activities_i = unique_activity_names(df_i)

            # return df of the difference between the subset of unique
            # activities and the unique activities at this geoscale
            df_missing_i = dataframe_difference(
                unique_activities_sub, unique_activities_i,
                which='right_only')
            df_missing_i = df_missing_i.drop(columns='_merge')
            df_missing_i['activity_from_scale'] = i
            # return df of the similarities between the two dfs
            df_existing_i = dataframe_difference(
                unique_activities_sub, unique_activities_i, which='both')

            # append unique activities and the df with defined
            # activity_from_scale (pd.concat replaces the deprecated
            # DataFrame.append, which was removed in pandas 2.0)
            unique_activities_sub = pd.concat(
                [unique_activities_sub,
                 df_missing_i[[fba_activity_fields[0],
                               fba_activity_fields[1]]]],
                ignore_index=True)
            df_existing = pd.concat([df_existing, df_missing_i],
                                    ignore_index=True)
            df_missing = dataframe_difference(
                df_missing[[fba_activity_fields[0],
                            fba_activity_fields[1]]],
                df_existing_i[[fba_activity_fields[0],
                               fba_activity_fields[1]]],
                which=None)

    return df_existing
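# Example (sketch): with provided_from_scale='national', an activity combo
# present only in state-level rows is returned with
# activity_from_scale='state', while combos that do have national rows
# keep activity_from_scale='national'.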
def get_fba_allocation_subset(fba_allocation, source, activitynames,
                              **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subsetting; an example of a special case is when
    # the allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    if check_activities_sector_like(source) is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]]
                 .isin(sector_list)) |
                (fba_allocation[fbs_activity_fields[1]]
                 .isin(sector_list))].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))
                ].reset_index(drop=True)
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = \
                modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]]
                 .isin(modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]]
                 .isin(modified_activitynames))].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]]
                 .isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]]
                 .isin(activitynames))].reset_index(drop=True)

    # if activity set names are included in the function call and are not
    # null, then subset data based on the specified value and column
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = \
            asn[asn['name'].isin(activitynames)].reset_index(drop=True)
        if asn_subset['allocation_subset'].isna().all():
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error('Define the column and value to subset on in the '
                      'activity set csv for all rows')
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset
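# Example usage (a minimal sketch; employment_fba is a hypothetical FBA df
# and the source/activity names are illustrative):
#
#   >>> subset = get_fba_allocation_subset(
#   ...     employment_fba, 'BLS_QCEW', ['111', '112'])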