Example #1
def test_stewi_config():
    from stewi.globals import config
    _config = config()['databases']
    url_list = []

    # RCRAInfo, TRI, DMR
    for inv in ['RCRAInfo', 'TRI', 'DMR']:
        url_list.append(_config[inv]['url'])

    # eGRID
    for k, v in _config['eGRID'].items():
        if isinstance(v, dict) and 'download_url' in v:
            url_list.append(v['download_url'])

    # GHGRP
    ghgrp = _config['GHGRP']
    url_list.extend([
        ghgrp['url'] + u for u in [
            ghgrp['lo_subparts_url'], ghgrp['esbb_subparts_url'],
            ghgrp['data_summaries_url']
        ]
    ])

    url_check = {}
    for url in url_list:
        if url not in url_check:
            url_check[url] = url_is_alive(url)
    error_list = [k for k, v in url_check.items() if not v]
    s = '\n'.join(error_list)
    assert all(url_check.values()), f"error in {s}"


def test_generate_inventories(year):
    for inventory in config()['databases']:
        if SKIP_BROWSER_DOWNLOAD and inventory in requires_browser_download:
            continue
        try:
            generate_inventory(inventory, year)
        except InventoryNotAvailableError as err:
            print(err)
            continue


def test_all_inventory_generation():
    error_list = []
    for inventory in config()['databases']:
        if SKIP_BROWSER_DOWNLOAD and inventory in requires_browser_download:
            continue
        df = stewi.getInventory(inventory, year)
        error = df is None or len(df) == 0
        if error:
            error_list.append(inventory)
    assert len(error_list) == 0, \
        f"Generation of {','.join(error_list)} unsuccessful"
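The first test above depends on url_is_alive from stewi.globals. As a rough sketch of what such a helper can look like (an illustrative assumption, not the library's actual implementation), a HEAD request with error handling is enough:

import requests


def url_is_alive(url, timeout=10):
    """Return True if the URL responds without an error status.

    Hypothetical stand-in for stewi.globals.url_is_alive.
    """
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False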
Example #4
import argparse
import urllib
import time
from pathlib import Path

import pandas as pd

from stewi.globals import unit_convert,\
    DATA_PATH, lb_kg, write_metadata, get_reliability_table_for_source,\
    log, compile_source_metadata, config, store_inventory, set_stewi_meta,\
    paths, read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.filter import filter_states, filter_config
import stewi.exceptions


_config = config()['databases']['DMR']
DMR_DATA_PATH = DATA_PATH / 'DMR'
EXT_DIR = 'DMR Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)

states_df = pd.read_csv(DATA_PATH.joinpath('state_codes.csv'))
STATES = list(states_df['states']) + list(states_df['dc']) +\
    list(states_df['territories'])
STATES = tuple(x for x in STATES if str(x) != 'nan')

# Values used for StEWI query
PARAM_GROUP = True
DETECTION = 'HALF'
ESTIMATION = True

Example #5
import io
import argparse
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from stewi.globals import unit_convert, DATA_PATH, set_stewi_meta,\
    get_reliability_table_for_source, write_metadata, url_is_alive,\
    lb_kg, g_kg, config, store_inventory, log, paths, compile_source_metadata,\
    read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
import stewi.exceptions

EXT_DIR = 'TRI Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
_config = config()['databases']['TRI']
TRI_DATA_PATH = DATA_PATH / 'TRI'


def visit(url):
    """Return a BeautifulSoup object for the page at url."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def link_zip(url, queries, year):
    """Return the zip-file link for the requested TRI reporting year."""
    soup = visit(url)
    TRI_zip_options = {}
    for link in soup.find_all(queries['TRI_year_reported']):
        TRI_zip_options[link.text] = link.get(queries['TRI_zip'])
    return TRI_zip_options[year]
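A hedged usage sketch for link_zip, run in the module context above; it assumes the TRI configuration provides the 'url' and 'queries' entries referenced here, and the reporting year is illustrative:

# Returns the download link for the zip labeled with the given year text.
zip_url = link_zip(_config['url'], _config['queries'], '2019')
print(zip_url)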
Example #6
# filter.py (stewi)
# !/usr/bin/env python3
# coding=utf-8
"""
Functions to support filtering of processed inventories
"""

import pandas as pd
from stewi.globals import DATA_PATH, config, read_inventory, log
from stewi.formats import StewiFormat

filter_config = config(file='filter.yaml')


def apply_filters_to_inventory(inventory, inventory_acronym, year, filters,
                               download_if_missing=False):
    """Apply one or more filters from a passed list to an inventory dataframe.

    :param inventory: df of stewi inventory of type flowbyfacility or flowbyprocess
    :param inventory_acronym: str of inventory e.g. 'NEI'
    :param year: year as number like 2010
    :param filters: a list of named filters to apply to inventory
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: DataFrame of filtered inventory
    """
    if 'filter_for_LCI' in filters:
        for name in filter_config['filter_for_LCI']['filters']:
            if name not in filters:
                filters.append(name)
    compare_to_available_filters(filters)
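The body above is excerpted; based on the docstring, a usage sketch would look like the following (the inventory, year, and filter name are illustrative assumptions, and an NEI 2017 flow-by-facility inventory is assumed to be available locally):

import stewi

# Load an unfiltered inventory, then apply the named LCI filter set.
df = stewi.getInventory('NEI', 2017)
filtered = apply_filters_to_inventory(df, 'NEI', 2017,
                                      filters=['filter_for_LCI'],
                                      download_if_missing=True)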
Example #7
import zipfile
import io
from pathlib import Path

import pandas as pd

from esupy.remote import make_url_request
from stewi.globals import DATA_PATH, write_metadata,\
    unit_convert, log, MMBtu_MJ, MWh_MJ, config, USton_kg, lb_kg,\
    compile_source_metadata, remove_line_breaks, paths, store_inventory,\
    read_source_metadata, set_stewi_meta, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.formats import StewiFormat
import stewi.exceptions


_config = config()['databases']['eGRID']

# set filepath
EXT_DIR = 'eGRID Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
eGRID_DATA_DIR = DATA_PATH / 'eGRID'


def imp_fields(filename, year):
    """Import list of fields from egrid that are desired for LCI.

    :param filename: str name of csv file
    :param year: str year of egrid inventory
    :return: a list of source fields and a dictionary to stewi fields
    """
    egrid_req_fields_df = pd.read_csv(eGRID_DATA_DIR.joinpath(filename),
Example #8
def Generate_TRI_files_csv(TRIyear, Files):
    """Generate TRI flow, flowbyfacility, and facility csv files for a year."""
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(
        r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir +
                      'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    #Convert to float if there are errors - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    #Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[
        reliability_table['Source'] == 'TRI'].copy()
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    #Merge with reliability table to get
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    #Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    #Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'ReliabilityScore': wm
    })
    tri = tri.reset_index()

    #VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv',
                                      header=0,
                                      dtype={"FlowAmount": np.float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924,
                                       'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri,
                                           tri_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    #FLOWS
    flows = tri.groupby(['FlowName', 'CAS',
                         'Compartment']).count().reset_index()
    # stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']].copy()
    flowsdf['FlowID'] = flowsdf['CAS']
    #export chemicals
    #!!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv',
                   index=False)
    #FLOW BY FACILITY
    #drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    #FACILITY
    ##Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t',
                               header=0,
                               usecols=import_facility,
                               on_bad_lines='skip',
                               low_memory=False)
    #get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    # Use group by to eliminate additional ID duplicates
    #tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    #tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    #rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude'
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk,
                              inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv',
                              index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except OSError:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
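The reliability aggregation above relies on a weight_mean helper that is not shown in this excerpt. A minimal sketch of a flow-amount-weighted mean (an assumption, not the original implementation):

import numpy as np


def weight_mean(values, weights):
    """Weighted mean of reliability scores; falls back to a plain mean
    when all weights are zero. Hypothetical stand-in for the helper
    referenced above."""
    try:
        return np.average(values, weights=weights)
    except ZeroDivisionError:
        return np.mean(values)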
Example #9
import zipfile
from pathlib import Path

from stewi.globals import DATA_PATH, config, paths,\
    log, store_inventory, compile_source_metadata, read_source_metadata,\
    aggregate, set_stewi_meta
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.filter import apply_filters_to_inventory
import stewi.exceptions

try:
    from selenium import webdriver
    from webdriver_manager.chrome import ChromeDriverManager
except ImportError:
    log.error('Must install selenium and webdriver_manager for RCRAInfo. '
              'See install instructions for optional package '
              'installation or install them independently and retry.')

_config = config()['databases']['RCRAInfo']
EXT_DIR = 'RCRAInfo Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
RCRA_DATA_PATH = DATA_PATH / 'RCRAInfo'
DIR_RCRA_BY_YEAR = OUTPUT_PATH.joinpath('RCRAInfo_by_year')


def waste_description_cleaner(x):
    """Replace BR-conversion placeholder descriptions with None."""
    if ('from br conversion' in x) or (x == 'From 1989 BR data'):
        x = None
    return x


def extracting_files(path_unzip, name):
    """Extract the named zip archive into path_unzip."""
    with zipfile.ZipFile(path_unzip.joinpath(name + '.zip')) as z:
        z.extractall(path_unzip)
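A short usage sketch showing how the cleaner above can be applied to a column of descriptions (the example data is made up):

import pandas as pd

waste = pd.DataFrame({'Description': ['From 1989 BR data',
                                      'solvent recovery']})
waste['Description'] = waste['Description'].apply(waste_description_cleaner)
# The 1989 BR placeholder row becomes None; other descriptions pass through.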
Example #10
import requests
from xml.dom import minidom
import time
import argparse
import warnings
from pathlib import Path

import pandas as pd

from stewi.globals import download_table, write_metadata, import_table, \
    DATA_PATH, get_reliability_table_for_source, set_stewi_meta, config,\
    store_inventory, paths, log, \
    compile_source_metadata, read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.formats import StewiFormat

_config = config()['databases']['GHGRP']
GHGRP_DATA_PATH = DATA_PATH / 'GHGRP'
EXT_DIR = 'GHGRP Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)

# Flow codes that are reported in validation in CO2e
flows_CO2e = ['PFC', 'HFC', 'Other', 'Very_Short', 'HFE', 'Other_Full']

# define GWPs
# (these values are from IPCC's AR4, which is consistent with GHGRP methodology)
CH4GWP = 25
N2OGWP = 298
HFC23GWP = 14800

# define column groupings
ghgrp_cols = pd.read_csv(GHGRP_DATA_PATH.joinpath('ghgrp_columns.csv'))
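The AR4 GWPs defined above convert individual gas masses to CO2 equivalents. A short illustration (the column names and rows are assumptions, not the GHGRP file schema):

import pandas as pd

gases = pd.DataFrame({'FlowName': ['Methane', 'Nitrous Oxide', 'HFC-23'],
                      'FlowAmount_kg': [1000.0, 10.0, 1.0]})
gwps = {'Methane': CH4GWP, 'Nitrous Oxide': N2OGWP, 'HFC-23': HFC23GWP}
gases['CO2e_kg'] = gases['FlowAmount_kg'] * gases['FlowName'].map(gwps)
# 1000 kg CH4 -> 25,000 kg CO2e; 10 kg N2O -> 2,980 kg CO2e; 1 kg HFC-23 -> 14,800 kg CO2e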
Example #11
import os
from pathlib import Path

import facilitymatcher.WriteFRSNAICSforStEWI as write_naics
from esupy.processed_data_mgmt import Paths, load_preprocessed_output,\
    write_df_to_file, write_metadata_to_file, read_source_metadata,\
    download_from_remote
from esupy.util import strip_file_extension

MODULEPATH = Path(__file__).resolve().parent
DATA_PATH = MODULEPATH / 'data'

paths = Paths()
paths.local_path = os.path.realpath(paths.local_path + "/facilitymatcher")
output_dir = paths.local_path
ext_folder = 'FRS Data Files'
FRSpath = paths.local_path + '/' + ext_folder

FRS_config = config(config_path=MODULEPATH)['databases']['FRS']

inventory_to_FRS_pgm_acronymn = FRS_config['program_dictionary']
stewi_inventories = list(inventory_to_FRS_pgm_acronymn.keys())


def set_facilitymatcher_meta(file_name, category):
    """Create a class of esupy FileMeta."""
    facilitymatcher_meta = set_stewi_meta(file_name, category)
    facilitymatcher_meta.tool = "facilitymatcher"
    return facilitymatcher_meta


def download_extract_FRS_combined_national(file=None):
    """Download and extract file from source to local directory."""
    url = FRS_config['url']


def test_chemical_matches():
    assert chemicalmatcher.get_matches_for_StEWI(
        config()['databases'].keys()) is not None
Example #13
import numpy as np
import pandas as pd
import argparse
import requests
import zipfile
import io
from pathlib import Path

from esupy.processed_data_mgmt import download_from_remote
from esupy.util import strip_file_extension
from stewi.globals import DATA_PATH, write_metadata, USton_kg, lb_kg,\
    log, store_inventory, config, read_source_metadata,\
    paths, aggregate, get_reliability_table_for_source, set_stewi_meta
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result

_config = config()['databases']['NEI']
EXT_DIR = 'NEI Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
NEI_DATA_PATH = DATA_PATH / 'NEI'


def read_data(year, file):
    """Read NEI data and return a dataframe based on identified columns.

    :param year: str, Year of NEI dataset for identifying field names
    :param file: str, File path containing NEI data (parquet).
    :returns df: DataFrame of NEI data from a single file
        with standardized column names.
    """
    nei_required_fields = pd.read_table(
        NEI_DATA_PATH.joinpath('NEI_required_fields.csv'), sep=',')