Example #1
0
def generate_state_totals(year):
    """Generate RCRAInfo state totals for validation and save to csv.

    Reads the manually acquired RCRA_state_totals.csv, converts the per-state
    generation totals for the given year from US short tons to kg, attaches
    two-letter state codes, writes RCRAInfo_{year}_StateTotals.csv to
    DATA_PATH, and records the source in validationSets_Sources.csv.

    :param year: str, column name in RCRA_state_totals.csv identifying the year
    """
    totals = pd.read_csv(RCRA_DATA_PATH.joinpath('RCRA_state_totals.csv'))
    totals = totals.rename(columns={'Location Name': 'state_name'})
    totals = totals[['state_name', year]]
    # source values are in US short tons; convert to kg
    totals['FlowAmount_kg'] = totals[year] * USton_kg
    totals = totals.drop(columns=year)
    state_codes = pd.read_csv(DATA_PATH.joinpath('state_codes.csv'),
                              usecols=['states', 'state_name'])
    totals = totals.merge(state_codes, on='state_name')
    totals = totals.rename(columns={'states': 'State'})
    filename = DATA_PATH.joinpath(f'RCRAInfo_{year}_StateTotals.csv')
    totals.to_csv(filename, index=False)

    # Update validationSets_Sources.csv
    # pathlib stat() for the creation time, consistent with the other
    # totals generators in this module (rather than os.path.getctime)
    date_created = time.strptime(time.ctime(filename.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {
        'Inventory': 'RCRAInfo',
        #'Version': '',
        'Year': year,
        'Name': 'Trends Analysis',
        'URL':
        'https://rcrapublic.epa.gov/rcrainfoweb/action/modules/br/trends/view',
        'Criteria': 'Location: State, Metric: Generation, '
        'Generators To Include: All Generators Included In The NBR',
        'Date Acquired': date_created,
    }
    update_validationsets_sources(validation_dict, date_acquired=True)
Example #2
0
def generate_national_totals(year):
    """Download and process eGRID national totals for validation.

    Extracts selected US-total flows from the eGRID workbook, adds a steam
    total summed from the plant-level sheet, assigns units per flow, and
    writes eGRID_{year}_NationalTotals.csv to DATA_PATH. Also records the
    source in validationSets_Sources.csv.

    :param year: str, eGRID data year (must be a key in _config)
    """
    log.info(f'Processing eGRID national totals for validation of {year}')
    # eGRID column code -> standard flow name
    totals_dict = {'USHTIANT': 'Heat',
                   'USNGENAN': 'Electricity',
                   #'USETHRMO':'Steam', #PLNTYR sheet
                   'USNOXAN': 'Nitrogen oxides',
                   'USSO2AN': 'Sulfur dioxide',
                   'USCO2AN': 'Carbon dioxide',
                   'USCH4AN': 'Methane',
                   'USN2OAN': 'Nitrous oxide',
                   }

    us_totals = extract_eGRID_excel(year, 'US', index='code')
    us_totals = us_totals[list(totals_dict.keys())]
    us_totals = us_totals.rename(columns=totals_dict)
    us_totals = us_totals.transpose().reset_index()
    us_totals = us_totals.rename(columns={'index': 'FlowName',
                                          0: 'FlowAmount'})

    # Steam is not on the US-totals tab; sum it from the plant-level sheet
    steam_df = extract_eGRID_excel(year, 'PLNT', index='code')
    steam_total = steam_df['USETHRMO'].sum()
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    us_totals = pd.concat(
        [us_totals,
         pd.DataFrame([{'FlowName': 'Steam', 'FlowAmount': steam_total}])],
        ignore_index=True)

    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    usecols=['FlowName', 'Compartment'])
    us_totals = us_totals.merge(flow_compartments, how='left', on='FlowName')

    # units match those reported in the eGRID workbook for each flow
    unit_by_flows = {
        'tons': ['Carbon dioxide', 'Sulfur dioxide', 'Nitrogen oxides'],
        'lbs': ['Methane', 'Nitrous oxide'],
        'MMBtu': ['Heat', 'Steam'],
        'MWh': ['Electricity'],
    }
    for unit, flows in unit_by_flows.items():
        us_totals.loc[us_totals['FlowName'].isin(flows), 'Unit'] = unit
    log.info(f'saving eGRID_{year}_NationalTotals.csv to {DATA_PATH}')
    us_totals = us_totals[['FlowName', 'Compartment', 'FlowAmount', 'Unit']]
    us_totals.to_csv(DATA_PATH.joinpath(f'eGRID_{year}_NationalTotals.csv'), index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'eGRID',
                       'Version': _config[year]['file_version'],
                       'Year': year,
                       'Name': 'eGRID Data Files',
                       'URL': _config[year]['download_url'],
                       'Criteria': 'Extracted from US Total tab, or for '
                       'steam, summed from PLNT tab',
                       }
    update_validationsets_sources(validation_dict)
Example #3
0
def download_state_totals_validation(year):
    """Generate file of state totals downloaded from echo as csv for validation.

    Annual totals are stored in the repository.
    """
    log.info('generating state totals')
    # https://echo.epa.gov/trends/loading-tool/get-data/state-statistics
    url = _config['state_url'].replace("__year__", year)
    source = pd.read_csv(url, header=2)
    # combined pollutant loading across major and non-major facilities
    majors = source['Total Pollutant Pounds (lb/yr) for Majors']
    non_majors = source['Total Pollutant Pounds (lb/yr) for Non-Majors']
    state_totals = pd.DataFrame({
        'state_name': source['State'],
        'FlowName': 'All',
        'Compartment': 'water',
        'Amount': majors + non_majors,
        'Unit': 'lb',
    })
    # attach two-letter state codes and drop rows with no match
    state_totals = (
        state_totals
        .merge(states_df[['states', 'state_name']], how='left',
               on='state_name')
        .drop(columns=['state_name'])
        .dropna(subset=['states'])
        .rename(columns={'states': 'State'})
    )
    log.info(f'saving DMR_{year}_StateTotals.csv to {DATA_PATH}')
    state_totals.to_csv(DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv"),
                        index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'DMR',
                       #'Version': '',
                       'Year': year,
                       'Name': 'State statistics',
                       'URL': 'https://echo.epa.gov/trends/loading-tool/'
                       'get-data/state-statistics',
                       'Criteria': 'Check totals by state',
                       }
    update_validationsets_sources(validation_dict)
Example #4
0
def generate_national_totals(year):
    """Generate dataframe of national emissions and save to csv.

    Requires the chem_release dataset to be downloaded manually prior to
    running. Aggregates per-chemical releases into air/water/soil
    compartments, maps flows to FEDEFL, writes TRI_{year}_NationalTotals.csv
    to DATA_PATH, and records the source in validationSets_Sources.csv.

    :param year: str or int, TRI reporting year
    """
    filename = TRI_DATA_PATH.joinpath(f'TRI_chem_release_{year}.csv')
    df = pd.read_csv(filename, header=0)
    # cells whose entire value is the placeholder ',' or '.' mean "no data";
    # treat them as zero releases
    df.replace(',', 0.0, inplace=True)
    df.replace('.', 0.0, inplace=True)
    cols = ['Compartment', 'FlowName', 'Unit', 'FlowAmount']
    # source columns contributing to each output compartment
    compartments = {
        'air': ['Fugitive Air Emissions', 'Point Source Air Emissions'],
        'water': ['Surface Water Discharges'],
        'soil': [
            'On-site Land Treatment', 'Other On-site Land Disposal',
            'Off-site Land Treatment', 'Other Off-site Land Disposal'
        ]
    }
    # remove entries where all values are 0
    v = [col for col in df.columns if col != 'Chemical']
    df = df.loc[~(df[v] == 0).all(axis=1)]
    df_National = pd.DataFrame()
    for compartment, columns in compartments.items():
        df_aux = df[['Chemical'] + columns].reset_index(drop=True)
        for column in columns:
            # strip thousands separators before numeric conversion.
            # NOTE(review): cells replaced with float 0.0 above become NaN
            # under the .str accessor; the skipna sum below then treats them
            # as 0, so the net result is unchanged — confirm if refactoring.
            df_aux[column] = df_aux[column].str.replace(',',
                                                        '').astype('float')
        df_aux['FlowAmount'] = df_aux[columns].sum(axis=1)
        df_aux.rename(columns={'Chemical': 'FlowName'}, inplace=True)
        df_aux['Unit'] = 'Pounds'
        df_aux['Compartment'] = compartment
        df_National = pd.concat([df_National, df_aux],
                                axis=0,
                                ignore_index=True,
                                sort=True)
        del df_aux
    del df
    df_National['FlowAmount'] = df_National['FlowAmount'].round(3)
    df_National = df_National[cols]
    df_National = map_to_fedefl(df_National)
    # map_to_fedefl may return None (e.g. mapping unavailable); skip output
    if df_National is None:
        log.warning('Totals not generated')
        return
    df_National.sort_values(by=['FlowName', 'Compartment'], inplace=True)
    log.info(f'saving TRI_{year}_NationalTotals.csv to {DATA_PATH}')
    df_National.to_csv(DATA_PATH.joinpath(f'TRI_{year}_NationalTotals.csv'),
                       index=False)

    # Update validationSets_Sources.csv
    # file creation time of the manually downloaded source serves as the
    # acquisition date
    date_created = time.strptime(time.ctime(filename.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {
        'Inventory':
        'TRI',
        #'Version': '',
        'Year':
        year,
        'Name':
        'TRI Explorer',
        'URL':
        'https://enviro.epa.gov/triexplorer/tri_release.chemical',
        'Criteria':
        'Year, All of United States, All Chemicals, '
        'All Industries, Details:(Other On-Site Disposal or '
        'Other Releases, Other Off-Site Disposal or Other Releases), '
        'mapped to FEDEFL',
        'Date Acquired':
        date_created,
    }
    update_validationsets_sources(validation_dict, date_acquired=True)
Example #5
0
def generate_national_totals_validation(validation_table, year):
    """Generate GHGRP national totals by flow and subpart for validation.

    Imports (or downloads) the GHGRP reference table, converts reported
    quantities to kg (keeping selected gases in CO2e), aggregates by flow
    and subpart, writes GHGRP_{year}_NationalTotals.csv to DATA_PATH, and
    records the source in validationSets_Sources.csv.

    :param validation_table: str, name of the GHGRP table used for validation
    :param year: str, GHGRP reporting year
    """
    # define filepath for reference data
    ref_filepath = OUTPUT_PATH.joinpath('GHGRP_reference.csv')

    m = MetaGHGRP()
    reference_df = import_or_download_table(ref_filepath, validation_table,
                                            year, m)

    # parse reference dataframe to prepare it for validation
    reference_df['YEAR'] = reference_df['YEAR'].astype('str')
    reference_df = reference_df[reference_df['YEAR'] == year]
    reference_df.reset_index(drop=True, inplace=True)
    # GHG_QUANTITY is in metric tons; convert to kg
    reference_df['FlowAmount'] = reference_df['GHG_QUANTITY'].astype(
        float) * 1000
    # Maintain some flows in CO2e for validation
    reference_df.loc[reference_df['GAS_CODE'].isin(flows_CO2e),
                                  'FlowAmount'] =\
        reference_df['CO2E_EMISSION'].astype(float) * 1000
    reference_df.loc[reference_df['GAS_CODE'].isin(flows_CO2e),
                                  'GAS_NAME'] =\
        reference_df['GAS_NAME'] + ' (CO2e)'

    reference_df = reference_df[[
        'FlowAmount', 'GAS_NAME', 'GAS_CODE', 'FACILITY_ID', 'SUBPART_NAME'
    ]]
    reference_df.rename(columns={
        'FACILITY_ID': 'FacilityID',
        'GAS_NAME': 'FlowName',
        'GAS_CODE': 'FlowCode'
    },
                        inplace=True)
    # sum across facilities to national totals per flow and subpart
    reference_df_agg = reference_df.groupby(
        ['FlowName', 'FlowCode', 'SUBPART_NAME']).agg({'FlowAmount': ['sum']})
    reference_df_agg.reset_index(inplace=True)
    reference_df_agg.columns = reference_df_agg.columns.droplevel(level=1)
    # save reference dataframe to network
    # fix: DATA_PATH is a pathlib.Path; '+' concatenation with str raises
    # TypeError -- use joinpath as elsewhere in this module
    reference_df_agg.to_csv(
        DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv'),
        index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(ref_filepath.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {
        'Inventory': 'GHGRP',
        #'Version':'',
        'Year': year,
        'Name': 'GHGRP Table V_GHG_EMITTER_SUBPART',
        'URL': generate_url(validation_table,
                            report_year='',
                            row_start='',
                            output_ext='CSV'),
        'Criteria': '',
        'Date Acquired': date_created,
    }
    update_validationsets_sources(validation_dict, date_acquired=True)
Example #6
0
def generate_national_totals(year):
    """Download and parse pollutant national totals from 'Facility-level by
    Pollutant' data downloaded from EPA website. Used for validation.
    Creates NationalTotals.csv files.

    :param year : str, Year of NEI data for comparison.
    """
    log.info('Downloading national totals')

    # generate url based on data year
    build_url = _config['national_url']
    version = _config['national_version'][year]
    url = build_url.replace('__year__', year).replace('__version__', version)

    # make http request; on failure log and return rather than fall through
    # (the original initialized r = [] and then crashed on r.content with an
    # AttributeError when the request failed)
    # NOTE(review): verify=False disables TLS certificate checking — confirm
    # this is intentional for the EPA endpoint
    try:
        r = requests.Session().get(url, verify=False)
        r.raise_for_status()
    except requests.exceptions.ConnectionError:
        log.error(f"URL Connection Error for {url}")
        return
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
        return

    # extract data from zip archive
    z = zipfile.ZipFile(io.BytesIO(r.content))
    # consider only the .csv data files in the archive
    znames = [s for s in z.namelist() if '.csv' in s]
    # captures various column headings across years
    usecols = [
        'pollutant code', 'pollutant_cd', 'pollutant desc', 'pollutant_desc',
        'description', 'total emissions', 'total_emissions', 'emissions uom',
        'uom'
    ]

    # read each .csv in the archive, keeping only recognized columns,
    # and concatenate into a single dataframe
    df = pd.DataFrame()
    for zname in znames:
        headers = pd.read_csv(z.open(zname), nrows=0)
        cols = [x for x in headers.columns if x in usecols]
        df = pd.concat([df, pd.read_csv(z.open(zname), usecols=cols)])

    # rename columns to match standard format
    # NOTE(review): assumes every file yields exactly these four columns in
    # this order — verify when adding new data years
    df.columns = ['FlowID', 'FlowName', 'FlowAmount', 'UOM']
    # convert LB/TON to KG
    df['FlowAmount'] = np.where(df['UOM'] == 'LB', df['FlowAmount'] * lb_kg,
                                df['FlowAmount'] * USton_kg)
    df = df.drop(columns=['UOM'])
    # sum across all facilities to create national totals
    df = df.groupby(['FlowID', 'FlowName'])['FlowAmount'].sum().reset_index()
    # save national totals to .csv
    df.rename(columns={'FlowAmount': 'FlowAmount[kg]'}, inplace=True)
    log.info(f'saving NEI_{year}_NationalTotals.csv to {DATA_PATH}')
    df.to_csv(DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv'),
              index=False)

    # Update validationSets_Sources.csv
    validation_dict = {
        'Inventory': 'NEI',
        'Version': version,
        'Year': year,
        'Name': 'NEI Data',
        'URL': url,
        'Criteria': 'Data Summaries tab, Facility-level by '
        'Pollutant zip file download, summed to national level',
    }
    update_validationsets_sources(validation_dict)