Example #1
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowByActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :return: df, the loaded source dataframe
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = 'all'
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']],
                                            years=[v['year']],
                                            datasource=k,
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__],
                           v["FBS_datapull_fxn"])(*v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " +
                  k)
        # avoid a NameError on return when the data format is unrecognized
        flows_df = None

    return flows_df
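
# A minimal usage sketch for load_source_dataframe. The (k, v) pair below is
# hypothetical, mimicking one datasource entry from an FBS method yaml; the
# keys match those the function reads above.
example_source = 'USGS_NWIS_WU'  # illustrative datasource name
example_params = {'data_format': 'FBA', 'class': 'Water', 'year': 2015,
                  'source_fba_load_scale': 'state'}
flows = load_source_dataframe(example_source, example_params)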
Example #2
def store_flowbysector(fbs_df, parquet_name):
    """Prints the data frame into a parquet file."""
    f = fbsoutputpath + parquet_name + '.parquet'
    try:
        fbs_df.to_parquet(f)
    except Exception:
        log.error('Failed to save ' + parquet_name + ' file.')
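
# A minimal usage sketch, assuming fbsoutputpath is configured; the toy frame
# and parquet name are illustrative.
import pandas as pd

toy_fbs = pd.DataFrame({'Sector': ['111110'], 'FlowAmount': [1.0]})
store_flowbysector(toy_fbs, 'Water_national_2015_m1')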
Example #3
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    fbs = pd.DataFrame()
    # first try reading parquet from your local repo
    try:
        log.info('Loading ' + methodname + ' parquet from local repository')
        fbs = pd.read_parquet(fbsoutputpath + methodname + ".parquet")
    except (OSError, FileNotFoundError):
        # if parquet does not exist in local repo, read file from Data Commons
        try:
            log.info(
                methodname +
                ' parquet not found in local repo, loading from Data Commons')
            fbs = pd.read_parquet(
                'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowBySector/'
                + methodname + ".parquet")
        except FileNotFoundError:
            log.error("No parquet file found for datasource " + methodname +
                      " in flowsa or Data Commons")

    return fbs
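
# Usage sketch: load an FBS by method name, falling back to Data Commons when
# the parquet is absent locally. The method name is illustrative; an empty
# dataframe is returned if neither source has the file.
fbs = getFlowBySector('Water_national_2015_m1')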
Example #4
def collapse_activity_fields(df):
    """
    The 'activityconsumedby' and 'activityproducedby' columns from the
    allocation dataset do not always align with
    the dataframe being allocated. Generalize the allocation activity column.
    :param df: df, FBA used to allocate another FBA
    :return: df, single Activity column
    """

    df = replace_strings_with_NoneType(df)

    activity_consumed_list = df['ActivityConsumedBy'].drop_duplicates().values.tolist()
    activity_produced_list = df['ActivityProducedBy'].drop_duplicates().values.tolist()

    # if an activity field column is all 'none', drop the column and
    # rename the remaining activity columns to generalize
    if all(v is None for v in activity_consumed_list):
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif all(v is None for v in activity_produced_list):
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')

    # drop other columns
    df = df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])

    return df
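
# Toy illustration of the collapse, assuming replace_strings_with_NoneType is
# importable from flowsa. The frame mimics an allocation FBA whose
# consumed-by columns are entirely None, so the produced-by pair is renamed
# to Activity/Sector.
import pandas as pd

toy = pd.DataFrame({
    'ActivityProducedBy': ['Irrigation'], 'SectorProducedBy': ['111'],
    'ActivityConsumedBy': [None], 'SectorConsumedBy': [None],
    'ProducedBySectorType': ['NAICS'], 'ConsumedBySectorType': [None]})
collapsed = collapse_activity_fields(toy)  # columns include Activity, Sector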
Example #5
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow, required.
     E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource + ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet('https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                                      datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " + datasource + "and year " + str(
                    y) + " in flowsa or Data Commons")
    return fbas
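
# Usage sketch for the list-based signature: request two years of Water-class
# data for one datasource; the datasource code is illustrative.
fba = getFlowByActivity(flowclass=['Water'], years=[2010, 2015],
                        datasource='USGS_NWIS_WU')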
Example #6
def getFlowByActivity(datasource,
                      year,
                      flowclass=None,
                      geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
                             Optional. E.g. 'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from remote server
        prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    from esupy.processed_data_mgmt import download_from_remote
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of fba; generate and load if missing
    fba = load_preprocessed_output(fba_meta, paths)
    # Remote download
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
        if fba is None:
            log.error('getFlowByActivity failed, FBA not found')
        else:
            log.info('Loaded %s %s from %s', datasource, str(year),
                     fbaoutputpath)
    else:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
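
# Usage sketch for the newer signature: a scalar year plus optional filters;
# the parquet is downloaded or generated on demand when missing locally. The
# datasource code is illustrative.
fba = getFlowByActivity('USGS_NWIS_WU', 2015, flowclass='Water',
                        geographic_level='state')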
Example #7
def load_method(method_name):
    """
    Loads a flowbysector method from a YAML
    :param method_name: str, FBS method name (ex. 'Water_national_m1_2015')
    :return: dictionary, items in the FBS method yaml
    """
    sfile = flowbysectormethodpath + method_name + '.yaml'
    method = None
    try:
        with open(sfile, 'r') as f:
            method = yaml.safe_load(f)
    except IOError:
        log.error("FlowBySector method file not found.")
    return method
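
# Usage sketch, assuming a method yaml with this name exists under
# flowbysectormethodpath; 'source_names' is an assumed top-level yaml key.
method = load_method('Water_national_m1_2015')
sources = method['source_names']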
Example #8
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    fbs = pd.DataFrame()
    try:
        fbs = pd.read_parquet(fbsoutputpath + methodname + ".parquet")
    except FileNotFoundError:
        log.error("No parquet file found for datasource " + methodname +
                  " in flowsa")
    return fbs
Example #9
def load_method(method_name):
    """
    Loads a flowbysector method from a YAML
    :param method_name: str, FBS method name (ex. 'Water_national_m1_2015')
    :return: dictionary, items in the FBS method yaml
    """
    sfile = flowbysectormethodpath + method_name + '.yaml'
    method = None
    try:
        with open(sfile, 'r') as f:
            method = yaml.safe_load(f)
    except IOError:
        log.error("FlowBySector method file not found.")
    return method
Example #10
def filter_by_geoscale(df, geoscale):
    """
    Filter flowbyactivity by FIPS at the given scale
    :param df: Either flowbyactivity or flowbysector
    :param geoscale: string, either 'national', 'state', or 'county'
    :return: filtered flowbyactivity or flowbysector
    """

    fips = create_geoscale_list(df, geoscale)

    df = df[df['Location'].isin(fips)]

    if len(df) == 0:
        log.error("No flows found in the flow dataset at the " +
                  geoscale + " scale.")
        return None
    return df
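
# Usage sketch: keep only state-level FIPS rows, assuming fba is a
# FlowByActivity dataframe loaded earlier and create_geoscale_list resolves
# the FIPS codes for the requested scale.
state_fba = filter_by_geoscale(fba, 'state')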
Example #11
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow, required.
     E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: str, default 'all', which loads all geographic
     scales in the FlowByActivity; can specify 'national', 'state', 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """

    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) +
                                  ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(
                    datasource +
                    ' parquet not found in local repo, loading from Data Commons'
                )
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/'
                    + datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " +
                          datasource + "and year " + str(y) +
                          " in flowsa or Data Commons")

    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)

    return fbas
Example #12
def load_file(datafile, local_file, remote_file):
    """
    Loads a preprocessed file, preferring the local copy over the remote one
    :param datafile: str, a data file name with any preceding relative file path
    :param local_file: str, path to the local copy of the file
    :param remote_file: str, url of the remote copy of the file
    :return: a pandas dataframe of the datafile
    """
    df = pd.DataFrame()
    if os.path.exists(local_file):
        log.info('Loading ' + datafile + ' from local repository')
        df = pd.read_parquet(local_file)
    else:
        try:
            log.info(
                datafile +
                ' not found in local folder; loading from remote server...')
            df = pd.read_parquet(remote_file)
        except FileNotFoundError:
            log.error("No file found for " + datafile)
    return df
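
# Usage sketch with illustrative names: prefer the local parquet, otherwise
# fall back to the remote copy on Data Commons.
df = load_file('USGS_NWIS_WU_2015.parquet',
               local_file=fbaoutputpath + 'USGS_NWIS_WU_2015.parquet',
               remote_file='https://edap-ord-data-commons.s3.amazonaws.com/'
                           'flowsa/FlowByActivity/USGS_NWIS_WU_2015.parquet')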
Example #13
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow, required.
     E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        try:
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) +
                                  ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except FileNotFoundError:
            log.error("No parquet file found for datasource " + datasource +
                      "and year " + str(y) + " in flowsa")
    return fbas
Example #14
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowByActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :return: df, the loaded source dataframe
    """
    if v['data_format'] == 'FBA':
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])(v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " + k)
        # avoid a NameError on return when the data format is unrecognized
        flows_df = None

    return flows_df
Example #15
def getFlowBySector(methodname,
                    download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Loads stored FlowBySector output or generates it if it doesn't exist, then loads
    :param methodname: string, Name of an available method for the given class
    :param download_if_missing: bool, if True will attempt to load from remote server
        prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    from esupy.processed_data_mgmt import download_from_remote

    fbs_meta = set_fb_meta(methodname, "FlowBySector")
    fbs = load_preprocessed_output(fbs_meta, paths)

    # Remote download
    if fbs is None and download_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        # download and load the FBS parquet
        subdirectory_dict = {'.log': 'Log'}
        download_from_remote(fbs_meta,
                             paths,
                             subdirectory_dict=subdirectory_dict)
        fbs = load_preprocessed_output(fbs_meta, paths)

    # If remote download not specified and no FBS, generate the FBS
    if fbs is None:
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        # Generate the fbs
        flowsa.flowbysector.main(method=methodname)
        # Now load the fbs
        fbs = load_preprocessed_output(fbs_meta, paths)
        if fbs is None:
            log.error('getFlowBySector failed, FBS not found')
        else:
            log.info('Loaded %s from %s', methodname, fbsoutputpath)
    else:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
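
# Usage sketch: resolution order is the local output folder, then the remote
# server (when download_if_missing is True), then generation from the method
# yaml. The method name is illustrative.
fbs = getFlowBySector('Water_national_2015_m1', download_if_missing=True)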
Example #16
def blm_pls_call(**kwargs):
    """
    Convert response for calling url to pandas dataframe, begin parsing df into FBA format
    :param kwargs: potential arguments include:
                   r: response object from the url call
                   args: dictionary, arguments specified when running
                   flowbyactivity.py ('year' and 'source')
    :return: pandas dataframe of original source data
    """
    # load arguments necessary for function
    response_load = kwargs['r']
    args = kwargs['args']

    df = pd.DataFrame()
    sub_headers = {}

    skip = False
    last_row_header = ""
    next_line = False
    copy = False
    location_str = []
    flow_value = []
    flow_name = []
    number_of_sub_headers = 0

    duplicate_headers = [
        "Pre-Reform Act Future Interest Leases", "Reform Act Leases",
        "Reform Act Future Interest Leases"
    ]

    if args["year"] == "2007":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [99],
                "Acquired Lands": [99]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [100, 109, 110]
            },
            "Reform Act Leases": {
                "Public Domain": [101, 110],
                "Acquired Lands": [101, 102]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [111]
            },
            "Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [103],
                "Acquired Lands": [112]
            },
            "Competitive General Services Administration (GSA) Oil & Gas Leases":
            {
                "Public Domain": [103]
            },
            "Competitive Protective Leases": {
                "Public Domain & Acquired Lands": [103]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [104]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [104]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [106],
                "Acquired Lands": [106, 107]
            },
            "Pre-Reform Act Simultaneous Leases": {
                "Acquired Lands": [108, 109]
            },
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain & Acquired Lands": [109]
            },
            "Geothermal Leases": {
                "Public Domain & Acquired Lands": [112]
            },
            "Private Leases": {
                "Acquired Lands": [114]
            },
            "Exchange Leases": {
                "Public Domain": [114]
            },
            "Renewal Leases": {
                "Public Domain": [114]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [115]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [115]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934":
            {
                "Acquired Lands": [115]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [115]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [122],
                "Competitive Pre-Federal Coal Leasing"
                "Amendment Act (FCLAA) Leases": [122],
                "Competitive Regional Emergency/Bypass Leases": [122],
                "Competitive Regional Leases": [123],
                "Exchange Leases": [123],
                "Preference Right Leases": [123]
            },
            "Coal Licenses": {
                "Exploration Licenses": [124],
                "Licenses to Mine": [124]
            },
            "Logical Mining Units": {
                "None": [124]
            },
            "Combined Hydrocarbon Leases": {
                "None": [126]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [126],
                "Phosphate Fringe Acreage Noncompetitive Leases": [126],
                "Phosphate Preference Right Leases": [126]
            },
            "Phosphate Use Permits": {
                "None": [127]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [127],
                "Sodium Fringe Acreage Noncompetitive Leases": [127],
                "Sodium Preference Right Leases": [127]
            },
            "Sodium Use Permit": {
                "None": [127]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [128],
                "Potassium Fringe Acreage Noncompetitive Leases": [128],
                "Potassium Preference Right Leases": [128]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [128],
                "Gilsonite Fringe Acreage Noncompetitive Lease": [129],
                "Gilsonite Preference Right Leases": [129]
            },
            "Oil Shale Leases": {
                "Oil Shale R, D&D Leases": [129]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [130]
            },
            "Asphalt Competitive Leases": {
                "None": [130]
            }
        }
        competitive_page_numbers = [100, 101, 102]
        no_header_page_numbers = [123, 129]
    elif args["year"] == "2011":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [111],
                "Acquired Lands": [111, 112]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [113, 122]
            },
            "Reform Act Leases": {
                "Public Domain": [113, 123],
                "Acquired Lands": [123, 124]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [114]
            },
            "Competitive General Services Administration (GSA) Oil and Gas Leases":
            {
                "Public Domain": [116]
            },
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [116]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [116]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [116]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [119],
                "Acquired Lands": [119]
            },
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Acquired Lands": [120, 121]
            },
            "Summary:  Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [122]
            },
            "Reform Act Future Interest Leases": {
                "Acquired Lands": [125]
            },
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [125]
            },
            "Private Leases": {
                "Acquired Lands": [126]
            },
            "Exchange Leases": {
                "Public Domain": [126]
            },
            "Renewal Leases": {
                "Public Domain": [126, 127]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [127]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [127, 128]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934":
            {
                "Acquired Lands": [128]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [128]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [135],
                "Competitive Pre-Federal Coal Leasing Amendment Act (FCLAA) Leases":
                [135],
                "Competitive Regional Emergency/Bypass Leases": [135],
                "Competitive Regional Leases": [136],
                "Exchange Leases": [136],
                "Preference Right Leases": [136]
            },
            "Coal Licenses": {
                "Exploration Licenses": [137],
                "Licenses To Mine": [137]
            },
            "Logical Mining Units": {
                "None": [137]
            },
            "Combined Hydrocarbon Leases": {
                "None": [139]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [139],
                "Phosphate Fringe Acreage Noncompetitive Leases": [139],
                "Phosphate Preference Right Leases": [139]
            },
            "Phosphate Use Permits": {
                "None": [139]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [140],
                "Sodium Fringe Acreage Noncompetitive Leases": [140],
                "Sodium Preference Right Leases": [140]
            },
            "Sodium Use Permit": {
                "None": [140]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [141],
                "Potassium Fringe Acreage Noncompetitive Leases": [141],
                "Potassium Preference Right Leases": [141]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [142],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [142],
                "Gilsonite Preference Right Leases": [142]
            },
            "Oil Shale RD&D Leases": {
                "None": [142]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [143]
            }
        }
        competitive_page_numbers = [113, 114]
        no_header_page_numbers = [136]
    elif args["year"] == "2012":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [108],
                "Acquired Lands": [108, 109]
            },
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [110, 119]
            },
            "Reform Act Leases": {
                "Public Domain": [110, 120],
                "Acquired Lands": [110]
            },
            "Reform Act Leases—continued": {
                "Acquired Lands": [111]
            },
            "Competitive General Services Administration (GSA) Oil and Gas Leases":
            {
                "Public Domain": [113]
            },
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [113]
            },
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [113]
            },
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [113]
            },
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]
            },
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]
            },
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [116],
                "Acquired Lands": [116]
            },
            "Pre-Reform Act Simultaneous Leases": {
                "Public Domain": [117]
            },
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Public Domain": [118],
                "Acquired Lands": [118]
            },
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [119]
            },
            "Reform Act Future Interest Leases": {
                "Acquired Lands": [122]
            },
            "Geothermal Leases": {
                "Public Domain and Acquired Lands": [122]
            },
            "Private Leases": {
                "Acquired Lands": [124]
            },
            "Exchange Leases": {
                "Public Domain": [124]
            },
            "Renewal Leases": {
                "Public Domain": [124, 125]
            },
            "Class III Reinstatement Leases": {
                "Public Domain": [125]
            },
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [125, 126]
            },
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934":
            {
                "Acquired Lands": [126]
            },
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [126]
            },
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [133],
                "Competitive Pre-Federal Coal Leasing Amendment Act (FCLAA) Leases":
                [133],
                "Competitive Regional Emergency/Bypass Leases": [133],
                "Competitive Regional Leases": [134],
                "Exchange Leases": [134],
                "Preference Right Leases": [134]
            },
            "Coal Licenses": {
                "Exploration Licenses": [135],
                "Licenses To Mine": [135]
            },
            "Logical Mining Units": {
                "None": [135]
            },
            "Combined Hydrocarbon Leases": {
                "None": [137]
            },
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [137],
                "Phosphate Fringe Acreage Noncompetitive Leases": [137],
                "Phosphate Preference Right Leases": [137]
            },
            "Phosphate Use Permits": {
                "None": [137]
            },
            "Sodium Leases": {
                "Sodium Competitive Leases": [138],
                "Sodium Fringe Acreage Noncompetitive Leases": [138],
                "Sodium Preference Right Leases": [138]
            },
            "Sodium Use Permit": {
                "None": [138]
            },
            "Potassium Leases": {
                "Potassium Competitive Leases": [139],
                "Potassium Fringe Acreage Noncompetitive Leases": [139],
                "Potassium Preference Right Leases": [139]
            },
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [140],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [140],
                "Gilsonite Preference Right Leases": [140]
            },
            "Oil Shale RD&D Leases": {
                "None": [140]
            },
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [141]
            }
        }
        competitive_page_numbers = [110, 111]
        no_header_page_numbers = [134]
    else:
        # provide reasoning for failure of parsing data
        log.error(
            'Missing code specifying sub-headers, add code to blm_pls_call()')

    for header in sub_headers:
        for sub_header in sub_headers[header]:
            pg = sub_headers[header][sub_header]
            pdf_pages = []
            for page_number in pg:
                found_header = False

                pdf_page = tabula.read_pdf(io.BytesIO(response_load.content),
                                           pages=page_number, stream=True,
                                           guess=False)[0]

                if pdf_page.shape[1] == 1:
                    pdf_page.columns = ["one"]
                else:
                    pdf_page.columns = ["one", "two"]

                pdf_page.dropna(subset=["one"], inplace=True)
                # add col of page number
                pdf_page['page_no'] = page_number
                pdf_pages.append(pdf_page)

            for page in pdf_pages:
                for index, row in page.iterrows():
                    if " /" in row["one"]:
                        split_header = row["one"].split(" /")
                        split_row = split_header[0].strip()
                    else:
                        split_row = row["one"]
                    if row['page_no'] in no_header_page_numbers:
                        found_header = True
                    if split_row == header:
                        found_header = True
                        last_row_header = header
                    if split_row == sub_header and last_row_header == header:
                        copy = True
                    elif sub_header == "None" and last_row_header == header:
                        copy = True

                    if copy and split_row != sub_header and split_row != header and found_header:
                        if "FISCAL" in row["one"] or row["one"].isdigit():
                            skip = True

                        if not skip:
                            if sub_header == "None":
                                sub_header = ""
                            lists = split(row, header, sub_header, next_line)
                            if header in duplicate_headers:
                                if row['page_no'] in competitive_page_numbers:
                                    flow_name.append("Competitive " + lists[1])
                                else:
                                    flow_name.append("Noncompetitive " +
                                                     lists[1])
                            else:
                                flow_name.append(lists[1])
                            location_str.append(lists[0])
                            flow_value.append(lists[2])
                            if next_line:
                                copy = False
                                next_line = False
                                header = "Nothing"
                            if "Total" in row["one"]:
                                row_one_str = ""
                                if any(i.isdigit() for i in row["one"]):
                                    # row split based on space
                                    row_one_split = row["one"].split(" ")
                                    for r in row_one_split:
                                        if not any(d.isdigit() for d in r):
                                            row_one_str = row_one_str + " " + r
                                else:
                                    row_one_str = row["one"]

                                # use the page being iterated, not the last
                                # pdf_page read in the loop above
                                if page.shape[1] == 1 and \
                                        row["one"] == "Total":
                                    next_line = True
                                elif row_one_str.strip() == "Total" or "Leases" \
                                        in row["one"] or "None" in row["one"]:
                                    number_of_sub_headers = number_of_sub_headers + 1
                                    copy = False
                                    found_header = False
                                else:
                                    next_line = True

                        #   if "Total" in row["one"]:
                        #       copy = False
                        #        found_header = False
                        if sub_header + "—continued" in row["one"]:
                            skip = False

    df["LocationStr"] = location_str
    df["ActivityConsumedBy"] = flow_name
    df["FlowAmount"] = flow_value

    return df
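
# Sketch of how the call function receives its kwargs, assuming
# flowbyactivity.py fetches the PLS report and passes the url response under
# 'r' and the run arguments under 'args'. The url variable is hypothetical.
import requests

response = requests.get(pls_report_url)  # pls_report_url: hypothetical url
pls_df = blm_pls_call(r=response, args={'year': '2012', 'source': 'BLM_PLS'})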
Example #17
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the typical method of subset
    # an example of a special case is when the allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]
    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[(fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                                   (fba_allocation[fbs_activity_fields[1]].isin(sector_list))]. \
                    reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                    drop=True)
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames =\
                modified_activitynames[modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(modified_activitynames)) |
                    (fba_allocation[fbs_activity_fields[1]].isin(modified_activitynames))]. \
                    reset_index(drop=True)

        else:
            fba_allocation_subset =\
                fba_allocation.loc[(fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                                   (fba_allocation[fbs_activity_fields[1]].isin(activitynames))].\
                    reset_index(drop=True)

    # if activity set names are included in the function call and are not
    # null, then subset data based on the value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = asn[asn['name'].isin(activitynames)].reset_index(
            drop=True)
        if asn_subset['allocation_subset'].isna().all():
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error(
                'Define column and value to subset on in the activity set csv for all rows'
            )
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset
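
# Usage sketch: subset an allocation FBA to the sectors tied to one activity.
# The source code, activity name, and kwargs are illustrative.
subset = get_fba_allocation_subset(fba_allocation, 'USDA_CoA_Cropland',
                                   ['AG LAND, CROPLAND, HARVESTED'],
                                   allocMethod='proportional')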
Example #18
def allocation_helper(df_w_sector, attr, method, v):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                                           df_year=attr['helper_source_year'],
                                           flowclass=attr['helper_source_class'],
                                           geoscale_from=attr['helper_from_scale'],
                                           geoscale_to=v['geoscale_to_use'], **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('No unambiguous sector column on which to merge with the '
                  'helper allocation dataset')

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: x[0:2])
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                              how='left',
                              left_on=['Location_tmp', sector_col_to_merge],
                              right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                                                    how='left',
                                                    left_on=[sector_col_to_merge],
                                                    right_on=['Sector'])
    else:
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(helper_allocation[['Location', 'Sector', 'HelperFlow']],
                              left_on=['Location', sector_col_to_merge],
                              right_on=['Location', 'Sector'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0, so after multiplying,
        # don't have incorrect value associated with new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(modified_fba_allocation,
                                                             sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation = modified_fba_allocation.assign(
            Denominator=modified_fba_allocation.groupby(
                ['FlowName', 'ActivityConsumedBy', 'Location',
                 'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
                            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(FlowAmount=modified_fba_allocation['FlowAmount'] *
                                                      modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                                                  'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = sector_aggregation(modified_fba_allocation,
                                                     fba_mapped_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation =\
        modified_fba_allocation[modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(attr['allocation_source'],
                                   attr["scale_helper_results"])(modified_fba_allocation,
                                                                 attr)
    return modified_fba_allocation
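
# Usage sketch: attr and v mirror the FBS method yaml structures the function
# reads; only keys touched above are shown and all values are illustrative.
# df_w_sector is an FBA that already carries sector columns.
method = load_method('Water_national_m1_2015')  # hypothetical method yaml
attr = {'helper_source': 'BLS_QCEW', 'helper_source_year': 2015,
        'helper_source_class': 'Employment', 'helper_from_scale': 'national',
        'allocation_from_scale': 'state', 'helper_method': 'proportional',
        'helper_flow': ['Number of employees']}
v = {'geoscale_to_use': 'state'}
allocated = allocation_helper(df_w_sector, attr, method, v)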