def validate_state_totals(df, year):
    """Generate validation of DMR inventory by state, summed across species.

    Details on results by state can be found in the search results help
    website:
    https://echo.epa.gov/help/loading-tool/water-pollution-search/search-results-help-dmr

    :param df: DataFrame of DMR inventory data
    :param year: year of inventory data
    """
    filepath = DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv")
    if not filepath.is_file():
        download_state_totals_validation(year)
    log.info('validating against state totals')

    # reference totals are reported in lb; convert to kg
    reference_df = pd.read_csv(filepath)
    reference_df['FlowAmount'] = 0.0
    reference_df = unit_convert(reference_df, 'FlowAmount',
                                'Unit', 'lb', lb_kg, 'Amount')
    reference_df = reference_df[['FlowName', 'State', 'FlowAmount']]

    # to match the state totals, only compare NPD facilities, and remove
    # some flows
    flow_exclude = pd.read_csv(
        DMR_DATA_PATH.joinpath('DMR_state_filter_list.csv'))
    state_flow_exclude_list = flow_exclude['POLLUTANT_DESC'].to_list()

    dmr_by_state = df[~df['FlowName'].isin(state_flow_exclude_list)]
    dmr_by_state = dmr_by_state[dmr_by_state['PermitTypeCode'] == 'NPD']

    # sum across all species per state (original selected the two columns
    # twice in a row; the redundant selection is removed here)
    dmr_by_state = (dmr_by_state[['State', 'FlowAmount']]
                    .groupby('State').sum().reset_index())
    dmr_by_state['FlowName'] = 'All'
    validation_df = validate_inventory(dmr_by_state, reference_df,
                                       group_by="state")
    write_validation_result('DMR', year, validation_df)
def validate_national_totals(inv, TRIyear):
    """Validate TRI inventory data against national totals for the year.

    :param inv: DataFrame of TRI inventory data
    :param TRIyear: year of inventory data
    """
    log.info('validating data against national totals')
    filename = DATA_PATH.joinpath(f'TRI_{TRIyear}_NationalTotals.csv')
    if not filename.is_file():
        # guard clause: no reference file means nothing to validate against
        log.warning(f'validation file for TRI_{TRIyear} does not exist. '
                    'Please run option B')
        return
    totals = pd.read_csv(filename, header=0, dtype={"FlowAmount": float})
    # convert reference amounts from pounds to kg
    totals['FlowAmount_kg'] = 0
    totals = unit_convert(totals, 'FlowAmount_kg', 'Unit', 'Pounds',
                          lb_kg, 'FlowAmount')
    # drop the pre-conversion amount and unit, restore standard column name
    totals = totals.drop(columns=['FlowAmount', 'Unit'])
    totals = totals.rename(columns={'FlowAmount_kg': 'FlowAmount'})
    inv = map_to_fedefl(inv)
    if inv is not None:
        result = validate_inventory(inv, totals,
                                    group_by='flow', tolerance=5.0)
        write_validation_result('TRI', TRIyear, result)
def validate_eGRID(year, flowbyfac):
    """Validate eGRID flowbyfacility data against national totals.

    :param year: year of inventory data
    :param flowbyfac: DataFrame of eGRID flow-by-facility data
    """
    validation_file = DATA_PATH.joinpath(f"eGRID_{year}_NationalTotals.csv")
    if not validation_file.is_file():
        generate_national_totals(year)
    log.info('validating data against national totals')
    ref = pd.read_csv(validation_file, header=0,
                      dtype={"FlowAmount": float})
    # convert each source unit found in the reference file to kg / MJ,
    # in the same order as the original chained calls
    conversions = (
        ('lbs', lb_kg),
        ('tons', USton_kg),
        ('MMBtu', MMBtu_MJ),
        ('MWh', MWh_MJ),
    )
    for unit, factor in conversions:
        ref = unit_convert(ref, 'FlowAmount', 'Unit', unit,
                           factor, 'FlowAmount')
    # drop old unit
    ref = ref.drop(columns=['Unit'])
    result = validate_inventory(flowbyfac, ref,
                                group_by='flow', tolerance=5.0)
    write_validation_result('eGRID', year, result)
def validate_national_totals(nei_flowbyfacility, year):
    """Validate NEI flow-by-facility data against national flow totals."""
    log.info('validating flow by facility against national totals')
    totals_path = DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv')
    if totals_path.is_file():
        log.info('using already processed national totals validation file')
    else:
        generate_national_totals(year)
    totals = pd.read_csv(totals_path, header=0,
                         dtype={"FlowAmount[kg]": float})
    # align the reference column name with the inventory format
    totals = totals.rename(columns={'FlowAmount[kg]': 'FlowAmount'})
    result = validate_inventory(nei_flowbyfacility, totals,
                                group_by='flow', tolerance=5.0)
    write_validation_result('NEI', year, result)
def validate_national_totals_by_subpart(tab_df, year): log.info('validating flowbyfacility against national totals') # apply CO2e factors for some flows mask = (tab_df['AmountCO2e'].isna() & tab_df['FlowID'].isin(flows_CO2e)) tab_df.loc[mask, 'Flow Description'] = 'Fluorinated GHG Emissions (mt CO2e)' subpart_L_GWPs = load_subpart_l_gwp() subpart_L_GWPs.rename(columns={'Flow Name': 'FlowName'}, inplace=True) tab_df = tab_df.merge(subpart_L_GWPs, how='left', on=['FlowName', 'Flow Description']) tab_df['CO2e_factor'] = tab_df['CO2e_factor'].fillna(1) tab_df.loc[mask, 'AmountCO2e'] = tab_df['FlowAmount'] * tab_df['CO2e_factor'] # for subset of flows, use CO2e for validation mask = tab_df['FlowID'].isin(flows_CO2e) tab_df.loc[mask, 'FlowAmount'] = tab_df['AmountCO2e'] # parse tabulated data tab_df.drop(columns=['FacilityID', 'DataReliability', 'FlowName'], inplace=True) tab_df.rename(columns={ 'Process': 'SubpartName', 'FlowID': 'FlowName' }, inplace=True) # import and parse reference data ref_df = pd.read_csv( DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv')) ref_df.drop(columns=['FlowName'], inplace=True) ref_df.rename(columns={ 'SUBPART_NAME': 'SubpartName', 'FlowCode': 'FlowName' }, inplace=True) validation_result = validate_inventory(tab_df, ref_df, group_by='subpart') # Update flow names to indicate which are in CO2e validation_result.loc[ validation_result['FlowName'].isin(flows_CO2e), 'FlowName'] = validation_result['FlowName'] + ' (CO2e)' write_validation_result('GHGRP', year, validation_result)
def validate_state_totals(report_year, flowbyfacility):
    """Validate RCRAInfo waste-generation data against state totals.

    Only National Biennial Report data from US states is compared.
    """
    log.info('validating data against state totals')
    file_path = DATA_PATH.joinpath(
        f'RCRAInfo_{report_year}_StateTotals.csv')
    if not file_path.is_file():
        # guard clause: nothing to validate against without the totals file
        log.warning(
            f'validation file for RCRAInfo_{report_year} does not exist.')
        return
    totals = pd.read_csv(file_path, dtype={"FlowAmount_kg": float})
    # Rename cols to match reference format
    totals = totals.rename(columns={'FlowAmount_kg': 'FlowAmount'})
    # Validate waste generated against state totals, include only NBR data;
    # derive the state code from the first two FacilityID characters
    flowbyfacility['State'] = flowbyfacility['FacilityID'].str[0:2]
    flowbyfacility = apply_filters_to_inventory(
        flowbyfacility, 'RCRAInfo', report_year,
        ['National_Biennial_Report', 'imported_wastes', 'US_States_only'])
    validation_df = validate_inventory(flowbyfacility, totals,
                                       group_by='state')
    write_validation_result('RCRAInfo', report_year, validation_df)