def generate_state_totals(year):
    """Generate RCRAInfo state totals from the downloaded Trends Analysis
    state totals file and save to csv for validation.

    Resulting file is stored in repository.
    """
    totals = pd.read_csv(RCRA_DATA_PATH.joinpath('RCRA_state_totals.csv'))
    totals = totals.rename(columns={'Location Name': 'state_name'})
    totals = totals[['state_name', year]]
    # convert from US short tons to kg
    totals['FlowAmount_kg'] = totals[year] * USton_kg
    totals.drop(labels=year, axis=1, inplace=True)
    state_codes = pd.read_csv(DATA_PATH.joinpath('state_codes.csv'),
                              usecols=['states', 'state_name'])
    totals = totals.merge(state_codes, on='state_name')
    totals = totals.rename(columns={'states': 'State'})
    filename = DATA_PATH.joinpath(f'RCRAInfo_{year}_StateTotals.csv')
    totals.to_csv(filename, index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(os.path.getctime(filename)))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {'Inventory': 'RCRAInfo',
                       #'Version': '',
                       'Year': year,
                       'Name': 'Trends Analysis',
                       'URL': 'https://rcrapublic.epa.gov/rcrainfoweb/action/modules/br/trends/view',
                       'Criteria': 'Location: State, Metric: Generation, '
                                   'Generators To Include: All Generators Included In The NBR',
                       'Date Acquired': date_created,
                       }
    update_validationsets_sources(validation_dict, date_acquired=True)
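
# Example (hypothetical usage; a sketch, not part of the module's API):
# regenerate the RCRAInfo state totals for a Biennial Report year. Assumes
# RCRA_state_totals.csv has already been downloaded from the Trends Analysis
# tool into RCRA_DATA_PATH and that USton_kg is the module's short-ton to
# kilogram conversion factor.
#
#     generate_state_totals('2019')
#     # -> writes DATA_PATH / 'RCRAInfo_2019_StateTotals.csv' and appends a
#     #    'Trends Analysis' entry to validationSets_Sources.csv
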
def generate_national_totals(year):
    """Download and process eGRID national totals for validation.

    Resulting file is stored in repository.
    """
    log.info(f'Processing eGRID national totals for validation of {year}')
    totals_dict = {'USHTIANT': 'Heat',
                   'USNGENAN': 'Electricity',
                   #'USETHRMO': 'Steam', # PLNTYR sheet
                   'USNOXAN': 'Nitrogen oxides',
                   'USSO2AN': 'Sulfur dioxide',
                   'USCO2AN': 'Carbon dioxide',
                   'USCH4AN': 'Methane',
                   'USN2OAN': 'Nitrous oxide',
                   }

    us_totals = extract_eGRID_excel(year, 'US', index='code')
    us_totals = us_totals[list(totals_dict.keys())]
    us_totals.rename(columns=totals_dict, inplace=True)
    # reshape the single US row into FlowName / FlowAmount pairs
    us_totals = us_totals.transpose().reset_index()
    us_totals = us_totals.rename(columns={'index': 'FlowName',
                                          0: 'FlowAmount'})

    # steam is not reported on the US sheet, so sum it from the plant sheet
    steam_df = extract_eGRID_excel(year, 'PLNT', index='code')
    steam_total = steam_df['USETHRMO'].sum()
    us_totals = pd.concat(
        [us_totals,
         pd.DataFrame([{'FlowName': 'Steam', 'FlowAmount': steam_total}])],
        ignore_index=True)

    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    usecols=['FlowName', 'Compartment'])
    us_totals = us_totals.merge(flow_compartments, how='left', on='FlowName')

    us_totals.loc[(us_totals['FlowName'] == 'Carbon dioxide') |
                  (us_totals['FlowName'] == 'Sulfur dioxide') |
                  (us_totals['FlowName'] == 'Nitrogen oxides'),
                  'Unit'] = 'tons'
    us_totals.loc[(us_totals['FlowName'] == 'Methane') |
                  (us_totals['FlowName'] == 'Nitrous oxide'),
                  'Unit'] = 'lbs'
    us_totals.loc[(us_totals['FlowName'] == 'Heat') |
                  (us_totals['FlowName'] == 'Steam'),
                  'Unit'] = 'MMBtu'
    us_totals.loc[(us_totals['FlowName'] == 'Electricity'), 'Unit'] = 'MWh'

    log.info(f'saving eGRID_{year}_NationalTotals.csv to {DATA_PATH}')
    us_totals = us_totals[['FlowName', 'Compartment', 'FlowAmount', 'Unit']]
    us_totals.to_csv(DATA_PATH.joinpath(f'eGRID_{year}_NationalTotals.csv'),
                     index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'eGRID',
                       'Version': _config[year]['file_version'],
                       'Year': year,
                       'Name': 'eGRID Data Files',
                       'URL': _config[year]['download_url'],
                       'Criteria': 'Extracted from US Total tab, or for '
                                   'steam, summed from PLNT tab',
                       }
    update_validationsets_sources(validation_dict)
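
# Example (hypothetical usage; a sketch, assuming the year is present in this
# module's _config with 'file_version' and 'download_url' entries):
#
#     generate_national_totals('2021')
#     # -> writes DATA_PATH / 'eGRID_2021_NationalTotals.csv' with columns
#     #    FlowName, Compartment, FlowAmount, Unit, where steam is summed
#     #    from the PLNT sheet and all other flows come from the US sheet
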
def download_state_totals_validation(year):
    """Generate file of state totals downloaded from echo as csv for validation.

    Annual totals are stored in the repository.
    """
    log.info('generating state totals')
    # https://echo.epa.gov/trends/loading-tool/get-data/state-statistics
    url = _config['state_url'].replace("__year__", year)
    state_csv = pd.read_csv(url, header=2)
    state_totals = pd.DataFrame()
    state_totals['state_name'] = state_csv['State']
    state_totals['FlowName'] = 'All'
    state_totals['Compartment'] = 'water'
    state_totals['Amount'] = (
        state_csv['Total Pollutant Pounds (lb/yr) for Majors'] +
        state_csv['Total Pollutant Pounds (lb/yr) for Non-Majors'])
    state_totals['Unit'] = 'lb'
    state_names = states_df[['states', 'state_name']]
    state_totals = state_totals.merge(state_names, how='left', on='state_name')
    state_totals.drop(columns=['state_name'], inplace=True)
    state_totals.dropna(subset=['states'], inplace=True)
    state_totals.rename(columns={'states': 'State'}, inplace=True)
    log.info(f'saving DMR_{year}_StateTotals.csv to {DATA_PATH}')
    state_totals.to_csv(DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv"),
                        index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'DMR',
                       #'Version': '',
                       'Year': year,
                       'Name': 'State statistics',
                       'URL': 'https://echo.epa.gov/trends/loading-tool/'
                              'get-data/state-statistics',
                       'Criteria': 'Check totals by state',
                       }
    update_validationsets_sources(validation_dict)
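
# Example (hypothetical usage; a sketch, assuming _config['state_url']
# contains a '__year__' placeholder and states_df maps full state names to
# two-letter codes):
#
#     download_state_totals_validation('2020')
#     # -> writes DATA_PATH / 'DMR_2020_StateTotals.csv', summing majors and
#     #    non-majors pollutant pounds per state
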
def generate_national_totals(year):
    """Generate dataframe of national emissions and save to csv.

    Requires the chem_release dataset to be downloaded manually prior to
    running.
    """
    filename = TRI_DATA_PATH.joinpath(f'TRI_chem_release_{year}.csv')
    df = pd.read_csv(filename, header=0)
    df.replace(',', 0.0, inplace=True)
    df.replace('.', 0.0, inplace=True)
    cols = ['Compartment', 'FlowName', 'Unit', 'FlowAmount']
    compartments = {'air': ['Fugitive Air Emissions',
                            'Point Source Air Emissions'],
                    'water': ['Surface Water Discharges'],
                    'soil': ['On-site Land Treatment',
                             'Other On-site Land Disposal',
                             'Off-site Land Treatment',
                             'Other Off-site Land Disposal']}
    # remove entries where all values are 0
    v = [col for col in df.columns if col != 'Chemical']
    df = df.loc[~(df[v] == 0).all(axis=1)]
    df_National = pd.DataFrame()
    for compartment, columns in compartments.items():
        df_aux = df[['Chemical'] + columns].reset_index(drop=True)
        for column in columns:
            # strip thousands separators and convert to float
            df_aux[column] = df_aux[column].str.replace(',', '').astype('float')
        df_aux['FlowAmount'] = df_aux[columns].sum(axis=1)
        df_aux.rename(columns={'Chemical': 'FlowName'}, inplace=True)
        df_aux['Unit'] = 'Pounds'
        df_aux['Compartment'] = compartment
        df_National = pd.concat([df_National, df_aux], axis=0,
                                ignore_index=True, sort=True)
        del df_aux
    del df
    df_National['FlowAmount'] = df_National['FlowAmount'].round(3)
    df_National = df_National[cols]
    df_National = map_to_fedefl(df_National)
    if df_National is None:
        log.warning('Totals not generated')
        return
    df_National.sort_values(by=['FlowName', 'Compartment'], inplace=True)
    log.info(f'saving TRI_{year}_NationalTotals.csv to {DATA_PATH}')
    df_National.to_csv(DATA_PATH.joinpath(f'TRI_{year}_NationalTotals.csv'),
                       index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(filename.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {'Inventory': 'TRI',
                       #'Version': '',
                       'Year': year,
                       'Name': 'TRI Explorer',
                       'URL': 'https://enviro.epa.gov/triexplorer/tri_release.chemical',
                       'Criteria': 'Year, All of United States, All Chemicals, '
                                   'All Industries, Details:(Other On-Site Disposal or '
                                   'Other Releases, Other Off-Site Disposal or Other Releases), '
                                   'mapped to FEDEFL',
                       'Date Acquired': date_created,
                       }
    update_validationsets_sources(validation_dict, date_acquired=True)
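
# Example (hypothetical usage; a sketch, assuming the chem_release export has
# been saved manually from TRI Explorer as
# TRI_DATA_PATH / 'TRI_chem_release_<year>.csv' before running):
#
#     generate_national_totals('2020')
#     # -> writes DATA_PATH / 'TRI_2020_NationalTotals.csv' with flows mapped
#     #    to FEDEFL; logs a warning and writes nothing if mapping fails
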
def generate_national_totals_validation(validation_table, year):
    """Generate national totals from the GHGRP validation table and save
    to csv for validation."""
    # define filepath for reference data
    ref_filepath = OUTPUT_PATH.joinpath('GHGRP_reference.csv')

    m = MetaGHGRP()
    reference_df = import_or_download_table(ref_filepath, validation_table,
                                            year, m)

    # parse reference dataframe to prepare it for validation
    reference_df['YEAR'] = reference_df['YEAR'].astype('str')
    reference_df = reference_df[reference_df['YEAR'] == year]
    reference_df.reset_index(drop=True, inplace=True)
    # convert from metric tons to kg
    reference_df['FlowAmount'] = reference_df['GHG_QUANTITY'].astype(float) * 1000
    # Maintain some flows in CO2e for validation
    reference_df.loc[reference_df['GAS_CODE'].isin(flows_CO2e), 'FlowAmount'] =\
        reference_df['CO2E_EMISSION'].astype(float) * 1000
    reference_df.loc[reference_df['GAS_CODE'].isin(flows_CO2e), 'GAS_NAME'] =\
        reference_df['GAS_NAME'] + ' (CO2e)'
    reference_df = reference_df[['FlowAmount', 'GAS_NAME', 'GAS_CODE',
                                 'FACILITY_ID', 'SUBPART_NAME']]
    reference_df.rename(columns={'FACILITY_ID': 'FacilityID',
                                 'GAS_NAME': 'FlowName',
                                 'GAS_CODE': 'FlowCode'}, inplace=True)
    reference_df_agg = reference_df.groupby(
        ['FlowName', 'FlowCode', 'SUBPART_NAME']).agg({'FlowAmount': ['sum']})
    reference_df_agg.reset_index(inplace=True)
    reference_df_agg.columns = reference_df_agg.columns.droplevel(level=1)

    # save reference dataframe to network
    reference_df_agg.to_csv(
        DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv'), index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(ref_filepath.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {'Inventory': 'GHGRP',
                       #'Version': '',
                       'Year': year,
                       'Name': 'GHGRP Table V_GHG_EMITTER_SUBPART',
                       'URL': generate_url(validation_table, report_year='',
                                           row_start='', output_ext='CSV'),
                       'Criteria': '',
                       'Date Acquired': date_created,
                       }
    update_validationsets_sources(validation_dict, date_acquired=True)
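
# Example (hypothetical usage; a sketch, assuming flows_CO2e lists the
# GAS_CODEs to keep in CO2e and that the validation table name matches the
# one referenced in validationSets_Sources.csv):
#
#     generate_national_totals_validation('V_GHG_EMITTER_SUBPART', '2020')
#     # -> writes DATA_PATH / 'GHGRP_2020_NationalTotals.csv', aggregated by
#     #    FlowName, FlowCode, and SUBPART_NAME
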
def generate_national_totals(year):
    """Download and parse pollutant national totals from 'Facility-level by
    Pollutant' data downloaded from EPA website. Used for validation.

    Creates NationalTotals.csv files.
    :param year: str, Year of NEI data for comparison.
    """
    log.info('Downloading national totals')

    # generate url based on data year
    build_url = _config['national_url']
    version = _config['national_version'][year]
    url = build_url.replace('__year__', year)
    url = url.replace('__version__', version)

    # make http request
    r = []
    try:
        r = requests.Session().get(url, verify=False)
    except requests.exceptions.ConnectionError:
        log.error(f"URL Connection Error for {url}")
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')

    # extract data from zip archive
    z = zipfile.ZipFile(io.BytesIO(r.content))
    # create a list of files contained in the zip archive
    znames = z.namelist()
    znames = [s for s in znames if '.csv' in s]
    df = pd.DataFrame()
    # for all of the .csv data files in the .zip archive,
    # read the .csv files into a dataframe
    # and concatenate with the master dataframe
    # usecols captures various column headings across years
    usecols = ['pollutant code', 'pollutant_cd',
               'pollutant desc', 'pollutant_desc', 'description',
               'total emissions', 'total_emissions',
               'emissions uom', 'uom']
    for i in range(len(znames)):
        headers = pd.read_csv(z.open(znames[i]), nrows=0)
        cols = [x for x in headers.columns if x in usecols]
        df = pd.concat([df, pd.read_csv(z.open(znames[i]), usecols=cols)])

    # rename columns to match standard format
    df.columns = ['FlowID', 'FlowName', 'FlowAmount', 'UOM']
    # convert LB/TON to KG
    df['FlowAmount'] = np.where(df['UOM'] == 'LB',
                                df['FlowAmount'] * lb_kg,
                                df['FlowAmount'] * USton_kg)
    df = df.drop(columns=['UOM'])
    # sum across all facilities to create national totals
    df = df.groupby(['FlowID', 'FlowName'])['FlowAmount'].sum().reset_index()
    # save national totals to .csv
    df.rename(columns={'FlowAmount': 'FlowAmount[kg]'}, inplace=True)
    log.info(f'saving NEI_{year}_NationalTotals.csv to {DATA_PATH}')
    df.to_csv(DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv'), index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'NEI',
                       'Version': version,
                       'Year': year,
                       'Name': 'NEI Data',
                       'URL': url,
                       'Criteria': 'Data Summaries tab, Facility-level by '
                                   'Pollutant zip file download, summed to '
                                   'national level',
                       }
    update_validationsets_sources(validation_dict)
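
# Example (hypothetical usage; a sketch, assuming _config provides
# 'national_url' with '__year__'/'__version__' placeholders and a
# 'national_version' entry for the requested year):
#
#     generate_national_totals('2017')
#     # -> writes DATA_PATH / 'NEI_2017_NationalTotals.csv' with emissions
#     #    converted from LB/TON to kg and summed across facilities
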