Example #1
def column_names(file_name):
    """Return a dataframe mapping column labels ('attrlabl') to their
    definitions ('attrdef') from the ScienceBase XML legend associated
    with the given regional data file.
    :param file_name: str, ScienceBase file identifier for a regional data file
    :return: dataframe with 'label' and 'name' columns
    """
    base_url = 'https://www.sciencebase.gov/catalog/file/get/'
    pacific_region = ['5d407318e4b01d82ce8d9b3c?f=__disk__22%2F5c%2Fe3%2F225'
                      'ce31141477eb0904f38f95f1d472bbe2a2a11',
                      '5d407318e4b01d82ce8d9b3c?f=__disk__2b%2F75%2F2b%2F2b7'
                      '52b0c5decf8e83c035d559a2688c481bb0cfe']
    midwestern = ['5cbf5150e4b09b8c0b700df3?f=__disk__66%2F4f%2Ff2%2F664ff289'
                  '064560bbce748082f7b34593dad49ca2',
                  '5cbf5150e4b09b8c0b700df3?f=__disk__bf%2F73%2F1f%2Fbf731fdf'
                  '4e984a5cf50c0f1a140cda366cb8c1d3']
    northeastern = ['5d4192aee4b01d82ce8da477?f=__disk__c2%2F02%2F06%2Fc202060'
                    '78520c5ec87394a3499eea073f472a27d',
                    '5d4192aee4b01d82ce8da477?f=__disk__b0%2Fb9%2F35%2Fb0b9350'
                    '21a47ccf57f7584cc7f14d82aacc491d1']
    southwestern = ['5f8f1f1282ce06b040efc90e?f=__disk__f8%2Fb8%2Ff9%2Ff8b8f9'
                    'bdc2a07f014ed6dced8feb2dd7bc63e056',
                    '5f8f1f1282ce06b040efc90e?f=__disk__8e%2F8e%2Fb8%2F8e8eb8'
                    '203ea14ab19a45372919a0dbf667d033b2']
    southeastern = ['5d6e70e5e4b0c4f70cf635a1?f=__disk__fb%2Fdb%2F92%2Ffbdb928'
                    '1872069b23bcd134a4c5fa1ddc7280b53',
                    '5d6e70e5e4b0c4f70cf635a1?f=__disk__14%2Fc1%2F63%2F14c1636'
                    'eef91529f548d5fe29ff3f426d3b4b996']
    if file_name in pacific_region:
        legend_name = "5d407318e4b01d82ce8d9b3c?f=__disk__ab%2F27%2F08%2Fab" \
                      "27083f354bd851ec09bc0f33c2dc130f808bb5"
    elif file_name in midwestern:
        legend_name = "5cbf5150e4b09b8c0b700df3?f=__disk__a6%2Ffb%2Fd6%2Fa6f" \
                      "bd6f6bcce874109d2e989d1d4d5a67c33cd49"
    elif file_name in northeastern:
        legend_name = "5d4192aee4b01d82ce8da477?f=__disk__81%2F5d%2F3d%2F815" \
                      "d3deb08f82c1662ff94eb941074ff99c75088"
    elif file_name in southwestern:
        legend_name = "5f8f1f1282ce06b040efc90e?f=__disk__44%2Ff6%2F74%2F44f" \
                      "674b54b2fa571191a597c8dfae0923893d3d3"
    elif file_name in southeastern:
        legend_name = "5d6e70e5e4b0c4f70cf635a1?f=__disk__93%2Fba%2F5c%2F93b" \
                      "a5c50c58ced4116ad2e5b9783fc7848ab2cb5"
    else:
        raise ValueError(f'{file_name} does not match any known regional file')
    contents = make_url_request(base_url + legend_name)
    xslt_content = contents.content.decode('utf-8')
    root = ET.fromstring(xslt_content)
    label = []
    name = []
    for attr in root.iter('attr'):
        for child in attr:
            if str(child.tag) == 'attrlabl':
                label.append(str(child.text))
            if str(child.tag) == 'attrdef':
                name.append(str(child.text))
    legend = pd.DataFrame()
    legend["label"] = label
    legend["name"] = name
    return legend
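
A minimal usage sketch (assuming pandas as pd, xml.etree.ElementTree as ET, and make_url_request from esupy.remote are imported, as in the later examples); the identifier below is the first pacific_region entry from the function itself:

file_id = ('5d407318e4b01d82ce8d9b3c?f=__disk__22%2F5c%2Fe3%2F225'
           'ce31141477eb0904f38f95f1d472bbe2a2a11')
legend = column_names(file_id)
print(legend.head())  # 'label' and 'name' columns parsed from the XML legend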
Example #2
def get_data_commons_index(file_meta, paths):
    """Returns a dataframe of files available on data commmons for the
    particular category
    :param file_meta: instance of class FileMeta
    :param paths: instance of class Path
    :param category: str of the category to search e.g. 'flowsa/FlowByActivity'
    :return: dataframe with 'date' and 'file_name' as fields
    """
    index_url = '?prefix='
    subdirectory = file_meta.tool + '/'
    if file_meta.category != '':
        subdirectory = subdirectory + file_meta.category + '/'
    url = paths.remote_path + index_url + subdirectory
    listing = make_url_request(url)
    # Code to convert XML to pd df courtesy of
    # https://stackabuse.com/reading-and-writing-xml-files-in-python-with-panda
    contents = ET.XML(listing.text)
    data = []
    cols = []
    for i, child in enumerate(contents):
        data.append([subchild.text for subchild in child])
        cols.append(child.tag)
    df = pd.DataFrame(data)
    df.dropna(inplace=True)
    try:
        # only get first two columns and rename them name and last modified
        df = df[[0, 1]]
    except KeyError:
        # no data found at url
        return None
    df.columns = ['file_name', 'last_modified']
    # Reformat the date to a pd datetime
    df['date'] = pd.to_datetime(df['last_modified'],
                                format='%Y-%m-%dT%H:%M:%S')
    # Remove the category name and trailing slash from the file name
    df['file_name'] = df['file_name'].str.replace(subdirectory, "")
    # Reset the index and return
    df = df[['date', 'file_name']].reset_index(drop=True)
    return df
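
A hedged usage sketch: FileMeta and Paths are esupy classes not shown here, so the SimpleNamespace stand-ins below are hypothetical placeholders that only supply the attributes this function reads (tool, category, remote_path), and the remote path is a placeholder URL:

from types import SimpleNamespace

# hypothetical stand-ins; real code would use esupy's FileMeta and Paths classes
file_meta = SimpleNamespace(tool='flowsa', category='FlowByActivity')
paths = SimpleNamespace(remote_path='https://example.com/data-commons/')

index_df = get_data_commons_index(file_meta, paths)
if index_df is not None:
    print(index_df.sort_values('date').tail())  # most recently modified files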
Example #3
def call_urls(*, url_list, source, year, config):
    """
    This function calls all the urls that have been generated.
    It then calls the processing function to begin processing the returned
    data. The processing function is specific to the data source, so this
    function relies on a function in source.py
    :param url_list: list, urls to call
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # identify if url request requires cookies set
    set_cookies = config.get('allow_http_request_cookies')
    confirm_gdrive = config.get('confirm_gdrive')

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            resp = make_url_request(url,
                                    set_cookies=set_cookies,
                                    confirm_gdrive=confirm_gdrive)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    source, config["call_response_fxn"])(resp=resp,
                                                         source=source,
                                                         year=year,
                                                         config=config,
                                                         url=url)
            if isinstance(df, pd.DataFrame):
                data_frames_list.append(df)
            elif isinstance(df, list):
                data_frames_list.extend(df)

    return data_frames_list
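
A sketch of the FBA config keys this function reads (the key names are taken from the code above; the URL, source name, and parser name are illustrative placeholders):

config = {
    'allow_http_request_cookies': False,     # forwarded to make_url_request
    'confirm_gdrive': False,                 # forwarded to make_url_request
    'call_response_fxn': 'parse_my_source',  # hypothetical parser in source.py
}
dfs = call_urls(url_list=['https://example.com/data.csv'],
                source='MySource', year='2020', config=config)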
Example #4
def download_eGRID(year):
    """Download eGRID files from EPA website."""
    log.info(f'downloading eGRID data for {year}')

    download_url = _config[year]['download_url']
    egrid_file_name = _config[year]['file_name']

    r = make_url_request(download_url)

    # extract .xlsx workbook
    if year == '2016' or year == '2014':
        z = zipfile.ZipFile(io.BytesIO(r.content))
        workbook = z.read(egrid_file_name)
    else:
        workbook = r.content

    # save .xlsx workbook to destination directory
    destination = OUTPUT_PATH.joinpath(egrid_file_name)
    # if destination folder does not already exist, create it
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
    with open(destination, 'wb') as output:
        output.write(workbook)
    log.info(f'{egrid_file_name} saved to {OUTPUT_PATH}')
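
The year-keyed _config entries this function expects would look roughly like the following (the field names come from the code above; the URL and file name are placeholders, and 2014/2016 are the ZIP-packaged years per the branch above):

_config = {
    '2016': {
        'download_url': 'https://example.com/egrid2016_data.zip',  # placeholder URL
        'file_name': 'egrid2016_data.xlsx',                        # placeholder name
    },
}
download_eGRID('2016')  # extracts the workbook from the ZIP and saves it to OUTPUT_PATH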
Example #5
def download_from_remote(file_meta, paths, **kwargs):
    """
    Downloads one or more files from remote and stores locally based on the
    most recent instance of that file. All files that share name_data, version,
    and hash will be downloaded together.
    :param file_meta: populated instance of class FileMeta
    :param paths: instance of class Paths
    :param kwargs: option to include 'subdirectory_dict', a dictionary that
         directs local data storage location based on extension
    """
    base_url = paths.remote_path + file_meta.tool + '/'
    if file_meta.category != '':
        base_url = base_url + file_meta.category + '/'
    files = get_most_recent_from_index(file_meta, paths)
    if files == []:
        log.info('%s not found in %s', file_meta.name_data, base_url)
    else:
        for f in files:
            url = base_url + f
            r = make_url_request(url)
            if r is not None:
                # set subdirectory
                subdirectory = file_meta.category
                # if there is a dictionary with specific subdirectories
                # based on end of filename, modify the subdirectory
                if 'subdirectory_dict' in kwargs:
                    for k, v in kwargs['subdirectory_dict'].items():
                        if f.endswith(k):
                            subdirectory = v
                folder = os.path.realpath(paths.local_path
                                          + '/' + subdirectory)
                file = folder + "/" + f
                create_paths_if_missing(file)
                with open(file, 'wb') as fo:
                    fo.write(r.content)
                log.info('%s saved to %s', f, folder)
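
A hedged example of the optional subdirectory_dict keyword described in the docstring (the suffix and folder name are illustrative): files whose names end with the given suffix are stored in that local subdirectory instead of file_meta.category.

download_from_remote(file_meta, paths,
                     subdirectory_dict={'_metadata.json': 'metadata'})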
Example #6
def annual_fips(years):
    """Pull FIPS codes/names from the Census website for the given years.
    Columns are renamed and subset.
    :param years: list of str years to include in the FIPS crosswalk
    :return: dict of dataframes, keyed 'FIPS_<year>'
    """

    df_list = {}
    for year in years:
        # only works for 2015+; contacted Census on 5/1 to ask for county-level
        # FIPS for previous years
        if year == '2013':
            url = 'https://www2.census.gov/programs-surveys/popest/geographies/' + \
                  year + '/all-geocodes-v' + year + '.xls'
        else:
            url = "https://www2.census.gov/programs-surveys/popest/geographies/" + \
                  year + "/all-geocodes-v" + year + ".xlsx"

        r = make_url_request(url)
        raw_df = pd.read_excel(io.BytesIO(
            r.content)).dropna().reset_index(drop=True)

        # skip the header row
        FIPS_df = pd.DataFrame(raw_df.loc[1:]).reindex()
        # Assign the column titles (remove whitespace and newlines if present)
        FIPS_df.columns = raw_df.loc[0, ].str.replace(' |\\n', '')

        original_cols = FIPS_df.columns

        # Create a dictionary of geographic levels
        geocode_levels = {
            "010": "Country",
            "040": "State",
            "050": "County_" + year
        }
        level_codes = geocode_levels.keys()
        # filter df for records with the levels of interest
        FIPS_df = FIPS_df.loc[FIPS_df["SummaryLevel"].isin(level_codes)]

        # split df by level to return a list of dfs
        # use a list comprehension to split it out
        FIPS_bylevel = [
            pd.DataFrame(y)
            for x, y in FIPS_df.groupby("SummaryLevel", as_index=False)
        ]

        # Assume df order in list is in geolevels keys order

        # country does not have its own field
        state_and_county_fields = {
            "Country": ["StateCode(FIPS)"],
            "State": ["StateCode(FIPS)"],
            "County_" + year: ["StateCode(FIPS)", "CountyCode(FIPS)"]
        }

        name_field = "AreaName(includinglegal/statisticalareadescription)"

        new_dfs = {}
        for df in FIPS_bylevel:
            df = df.reset_index(drop=True)
            level = geocode_levels[df.loc[0, "SummaryLevel"]]
            new_df = df[original_cols]
            new_df = new_df.rename(columns={name_field: level})
            fields_to_keep = [str(x) for x in state_and_county_fields[level]]
            fields_to_keep.append(level)
            new_df = new_df[fields_to_keep]
            # Write each to the list
            new_dfs[level] = new_df

        # Now merge the new dfs to add the info
        # FIPS_df_new = FIPS_df
        for k, v in new_dfs.items():
            fields_to_merge = [str(x) for x in state_and_county_fields[k]]
            # FIPS_df_new = pd.merge(FIPS_df_new,v,on=fields_to_merge,how="left")
            FIPS_df = pd.merge(FIPS_df, v, on=fields_to_merge, how="left")

        # combine state and county codes
        FIPS_df['FIPS_' + year] = \
            FIPS_df[state_and_county_fields["County_" + year][0]].astype(str) + \
            FIPS_df[state_and_county_fields["County_" + year][1]].astype(str)

        fields_to_keep = ["State", "County_" + year, "FIPS_" + year]
        FIPS_df = FIPS_df[fields_to_keep]

        # Clean the county field - remove the " County"
        # FIPS_df["County"] = FIPS_df["County"].apply(lambda x:stripcounty(x))
        FIPS_df["County_" + year] = FIPS_df["County_" +
                                            year].apply(stripcounty)
        FIPS_df["County_" +
                year] = FIPS_df["County_" +
                                year].apply(clean_str_and_capitalize)
        FIPS_df["State"] = FIPS_df["State"].apply(clean_str_and_capitalize)

        # add to data dictionary of fips years
        df_list["FIPS_" + year] = FIPS_df
    return df_list
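
A minimal usage sketch based on the return structure above (per the in-function comment, the Census .xlsx files used here are only available for 2015 and later, with 2013 handled as a special case):

fips_by_year = annual_fips(['2015'])
fips_2015 = fips_by_year['FIPS_2015']
print(fips_2015.columns.tolist())  # ['State', 'County_2015', 'FIPS_2015']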
Example #7
"""
Script creates crosswalks for Land and Water

"""

import io
import pandas as pd
from esupy.remote import make_url_request
from flowsa.settings import datapath
from flowsa.data_source_scripts.EIA_CBECS_Land import standardize_eia_cbecs_land_activity_names

if __name__ == '__main__':

    # url for excel crosswalk
    url = 'http://www.eia.gov/consumption/commercial/data/archive/cbecs/PBAvsNAICS.xls'
    # make url request, imported from esupy.remote
    r = make_url_request(url)
    # Convert response to dataframe, skipping first three rows
    df_raw = pd.read_excel(io.BytesIO(r.content), skiprows=3)

    # Rename first column to sector (naics 2002)
    df = df_raw.rename(columns={df_raw.columns[0]: "Sector"})

    # remove row of just NAs
    df = df[df['Sector'].notna()]

    # remove description in first column
    df['Sector'] = df['Sector'].str.split('/').str[0]

    # reshape data to long format and name columns
    df = pd.melt(df, id_vars=['Sector'])
    df.columns = ['Sector', 'Activity', 'value']
Example #8
# write_Larson_UrbanPublicParks_SI.py (scripts)
# !/usr/bin/env python3
# coding=utf-8
"""
Load and save the SI parks data from

    Larson LR, Jennings V, Cloutier SA (2016) Public Parks and
    Wellbeing in Urban Areas of the United States.
    PLoS ONE 11(4): e0153211. https://doi.org/10.1371/journal.pone.0153211

SI obtained 08/26/2020
"""

import io
import pandas as pd
from esupy.remote import make_url_request
from flowsa.settings import externaldatapath

# SI for Larson et al. (2016) urban public parks study
csv_load = "https://doi.org/10.1371/journal.pone.0153211.s001"

if __name__ == '__main__':

    response = make_url_request(csv_load)
    # Read directly into a pandas df
    raw_df = pd.read_excel(io.BytesIO(
        response.content)).dropna().reset_index(drop=True)
    # save data to csv
    raw_df.to_csv(externaldatapath + "Larson_UrbanPublicParks_SI.csv",
                  index=False)