def update_validationsets_sources(validation_dict, date_acquired=False):
    """Add or replace the metadata dictionary of a validation reference dataset
    in the validation sets sources file.

    :param validation_dict: dictionary of validation metadata
    :param date_acquired: bool, when False (default) the 'Date Acquired' field
        is set to today's date; when True the caller supplies it in
        validation_dict
    """
    if not date_acquired:
        date = datetime.today().strftime('%d-%b-%Y')
        validation_dict['Date Acquired'] = date
    v_table = read_ValidationSets_Sources()
    existing = v_table.loc[(v_table['Inventory'] == validation_dict['Inventory']) &
                           (v_table['Year'] == validation_dict['Year'])]
    if len(existing) > 0:
        # replace the existing record, reusing its index
        i = existing.index[0]
        v_table = v_table.loc[~v_table.index.isin(existing.index)]
        line = pd.DataFrame.from_records([validation_dict], index=[i])
    else:
        # insert the new record after the last row for this inventory by
        # assigning a fractional index, then sorting
        inventories = list(v_table['Inventory'])
        i = max(loc for loc, val in enumerate(inventories)
                if val == validation_dict['Inventory'])
        line = pd.DataFrame.from_records([validation_dict], index=[i + 0.5])
    # DataFrame.append is removed in pandas >= 2.0; use pd.concat instead
    v_table = pd.concat([v_table, line], ignore_index=False)
    v_table = v_table.sort_index().reset_index(drop=True)
    log.info("updating ValidationSets_Sources.csv with "
             f"{validation_dict['Inventory']} {validation_dict['Year']}")
    v_table.to_csv(DATA_PATH.joinpath('ValidationSets_Sources.csv'), index=False)
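# Usage sketch for update_validationsets_sources (hypothetical values; the
# required keys mirror the columns of ValidationSets_Sources.csv as used by
# the callers in this module):
#
#   validation_dict = {'Inventory': 'eGRID',
#                      'Version': '2',
#                      'Year': '2018',
#                      'Name': 'eGRID Data Files',
#                      'URL': 'https://www.epa.gov/egrid',
#                      'Criteria': 'Extracted from US Total tab'}
#   update_validationsets_sources(validation_dict)
#
# Because date_acquired defaults to False, 'Date Acquired' is stamped with
# today's date before the record is written.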
def validate_eGRID(year, flowbyfac): """Validate eGRID flowbyfacility data against national totals.""" validation_file = DATA_PATH.joinpath(f"eGRID_{year}_NationalTotals.csv") if not validation_file.is_file(): generate_national_totals(year) log.info('validating data against national totals') egrid_national_totals = pd.read_csv(validation_file, header=0, dtype={"FlowAmount": float}) egrid_national_totals = unit_convert( egrid_national_totals, 'FlowAmount', 'Unit', 'lbs', lb_kg, 'FlowAmount') egrid_national_totals = unit_convert( egrid_national_totals, 'FlowAmount', 'Unit', 'tons', USton_kg, 'FlowAmount') egrid_national_totals = unit_convert( egrid_national_totals, 'FlowAmount', 'Unit', 'MMBtu', MMBtu_MJ, 'FlowAmount') egrid_national_totals = unit_convert( egrid_national_totals, 'FlowAmount', 'Unit', 'MWh', MWh_MJ, 'FlowAmount') # drop old unit egrid_national_totals.drop('Unit', axis=1, inplace=True) validation_result = validate_inventory(flowbyfac, egrid_national_totals, group_by='flow', tolerance=5.0) write_validation_result('eGRID', year, validation_result)
def write_validation_result(inventory_acronym, year, validation_df):
    """Write the validation result and associated metadata to local dir.

    :param inventory_acronym: str for inventory e.g. 'TRI'
    :param year: str for year e.g. '2016'
    :param validation_df: df returned from validate_inventory function
    """
    directory = Path(paths.local_path).joinpath('validation')
    directory.mkdir(parents=True, exist_ok=True)
    log.info(f'writing validation result to {directory}')
    validation_df.to_csv(directory.joinpath(f"{inventory_acronym}_{year}.csv"),
                         index=False)
    # Get metadata on validation dataset
    validation_set_info_table = read_ValidationSets_Sources()
    # Get record for year and source
    validation_set_info = validation_set_info_table[
        (validation_set_info_table['Inventory'] == inventory_acronym) &
        (validation_set_info_table['Year'] == year)]
    if len(validation_set_info) != 1:
        log.error('no validation metadata found')
        return
    # Convert to Series
    validation_set_info = validation_set_info.iloc[0]
    # Use the same format as inventory metadata to describe the validation set data
    validation_metadata = dict(source_metadata)
    validation_metadata['SourceFileName'] = validation_set_info['Name']
    validation_metadata['SourceVersion'] = validation_set_info['Version']
    validation_metadata['SourceURL'] = validation_set_info['URL']
    validation_metadata['SourceAcquisitionTime'] = validation_set_info['Date Acquired']
    validation_metadata['Criteria'] = validation_set_info['Criteria']
    # Write metadata to file
    write_metadata(inventory_acronym + '_' + year, validation_metadata,
                   datatype="validation")
def validate_national_totals(inv, TRIyear):
    """Validate TRI flow by facility data against national totals."""
    log.info('validating data against national totals')
    filename = DATA_PATH.joinpath(f'TRI_{TRIyear}_NationalTotals.csv')
    if filename.is_file():
        tri_national_totals = pd.read_csv(filename, header=0,
                                          dtype={"FlowAmount": float})
        tri_national_totals['FlowAmount_kg'] = 0
        tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                           'Unit', 'Pounds', lb_kg, 'FlowAmount')
        # drop old amount and units
        tri_national_totals.drop(columns=['FlowAmount', 'Unit'], inplace=True)
        # Rename cols to match reference format
        tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                                   inplace=True)
        inv = map_to_fedefl(inv)
        if inv is not None:
            validation_result = validate_inventory(inv, tri_national_totals,
                                                   group_by='flow',
                                                   tolerance=5.0)
            write_validation_result('TRI', TRIyear, validation_result)
    else:
        log.warning(f'validation file for TRI_{TRIyear} does not exist. '
                    'Please run option B')
def import_or_download_table(filepath, table, year, m):
    """Return a dataframe of the given table, imported from filepath if it
    exists locally, otherwise downloaded in chunks and saved to filepath."""
    # if data already exists on local network, import the data
    if filepath.is_file():
        log.info(f'Importing data from {table}')
        table_df, creation_time = import_table(filepath, get_time=True)
        m.add(time=creation_time, filename=filepath, filetype='Database',
              url=generate_url(table, report_year=year, row_start='',
                               output_ext='CSV'))
    # otherwise, download the data and save to the network
    else:
        # determine number of rows in subpart emissions table
        row_count = get_row_count(table, report_year=year)
        log.info('Downloading %s (rows: %i)', table, row_count)
        # download data in chunks
        table_df = download_chunks(table=table, table_count=row_count,
                                   m=m, report_year=year, filepath=filepath)

    if table_df is None:
        return None

    # drop any unnamed columns
    table_df = table_df.drop(columns=table_df.columns[
        table_df.columns.str.contains('unnamed', case=False)])
    return table_df
def combine_DMR_inventory(year, nutrient=''):
    """Loop through pickled data and combine into a single dataframe."""
    path = OUTPUT_PATH.joinpath(year)
    if not path.is_dir():
        raise stewi.exceptions.DataNotFoundError
    output_df = pd.DataFrame()
    filestub = ''
    if nutrient:
        filestub = nutrient + '_'
        log.info(f'reading stored DMR queries by state for {nutrient}...')
    else:
        log.info('reading stored DMR queries by state...')
    for state in STATES:
        log.debug(f'accessing data for {state}')
        filepath = path.joinpath(f'{filestub}state_{state}.pickle')
        result = unpickle(filepath)
        if result is None:
            log.warning(f'No data found for {state}. Retrying query...')
            if (query_dmr(year=year, sic_list=None,
                          state_list=[state],
                          nutrient=nutrient).get(state) == 'success'):
                result = unpickle(filepath)
        if result is not None:
            output_df = pd.concat([output_df, result], ignore_index=True)
    return output_df
def validate_state_totals(df, year):
    """Generate validation by state, summing across species.

    Details on results by state can be found in the search results help website
    https://echo.epa.gov/help/loading-tool/water-pollution-search/search-results-help-dmr
    """
    filepath = DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv")
    if not filepath.is_file():
        download_state_totals_validation(year)
    log.info('validating against state totals')
    reference_df = pd.read_csv(filepath)
    reference_df['FlowAmount'] = 0.0
    reference_df = unit_convert(reference_df, 'FlowAmount',
                                'Unit', 'lb', lb_kg, 'Amount')
    reference_df = reference_df[['FlowName', 'State', 'FlowAmount']]

    # to match the state totals, only compare NPD facilities, and remove some flows
    flow_exclude = pd.read_csv(DMR_DATA_PATH.joinpath('DMR_state_filter_list.csv'))
    state_flow_exclude_list = flow_exclude['POLLUTANT_DESC'].to_list()

    dmr_by_state = df[~df['FlowName'].isin(state_flow_exclude_list)]
    dmr_by_state = dmr_by_state[dmr_by_state['PermitTypeCode'] == 'NPD']
    dmr_by_state = (dmr_by_state[['State', 'FlowAmount']]
                    .groupby('State').sum().reset_index())
    dmr_by_state['FlowName'] = 'All'
    validation_df = validate_inventory(dmr_by_state, reference_df,
                                       group_by="state")
    write_validation_result('DMR', year, validation_df)
def get_fm_file(file_name, download_if_missing=False):
    """Read facilitymatcher file; if not present, generate it.

    :param file_name: str, can be 'FacilityMatchList_forStEWI' or
        'FRS_NAICSforStEWI'
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    """
    file_meta = set_facilitymatcher_meta(file_name, category='')
    df = load_preprocessed_output(file_meta, paths)
    if df is None:
        log.info(f'{file_name} not found in {output_dir}, '
                 'writing facility matches to file')
        if download_if_missing:
            download_from_remote(file_meta, paths)
        elif file_name == 'FacilityMatchList_forStEWI':
            write_fm.write_facility_matches()
        elif file_name == 'FRS_NAICSforStEWI':
            write_naics.write_NAICS_matches()
        df = load_preprocessed_output(file_meta, paths)
    if df is None:
        log.error(f'error generating {file_name}')
        return None
    col_dict = {"FRS_ID": "str",
                "FacilityID": "str",
                "NAICS": "str"}
    for k, v in col_dict.items():
        if k in df:
            df[k] = df[k].astype(v)
    return df
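# A minimal usage sketch: load the StEWI facility match list, falling back to
# a remote copy before regenerating it locally.
#
#   matches = get_fm_file('FacilityMatchList_forStEWI',
#                         download_if_missing=True)
#   frs_ids = matches['FRS_ID'].unique()
#
# 'FRS_ID' and 'FacilityID' are returned as strings per the dtype casting
# above, so IDs with leading zeros are preserved.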
def extract_flows_for_chemical_matcher():
    """Compile a dataframe of unique flows across all local flow lists."""
    log.info('generating chemical matches from local flow lists')
    # First loop through flow lists to create a list of all unique flows
    source_dict = stewi.getAvailableInventoriesandYears(stewiformat='flow')
    all_list_names = pd.DataFrame(columns=["FlowName", "FlowID"])
    for source in source_dict.keys():
        list_names_years = pd.DataFrame()
        for year in source_dict[source]:
            list_names = stewi.getInventoryFlows(source, year)
            list_names = list_names[flowlist_cols[source]]
            list_names = list_names.drop_duplicates()
            list_names_years = pd.concat([list_names_years, list_names],
                                         sort=False)
        if source == 'TRI':
            # normalize TRI FlowIDs by stripping leading zeros and hyphens
            list_names_years['FlowID'] = list_names_years['FlowID'].apply(
                lambda x: x.lstrip('0').replace('-', ''))
        list_names_years = list_names_years.drop_duplicates()
        list_names_years['Source'] = source
        all_list_names = pd.concat([all_list_names, list_names_years],
                                   sort=False)
    # Drop duplicates from lists with same names
    all_list_names.drop_duplicates(inplace=True)
    # Reset index after removing flows
    all_list_names.reset_index(inplace=True, drop=True)
    return all_list_names
def check_for_file(filepath: Path, state) -> bool:
    """Return True if filepath exists, logging the result for the state."""
    if filepath.is_file():
        log.debug(f'file already exists for {state}, skipping')
        return True
    else:
        log.info(f'executing query for {state}')
        return False
def generate_national_totals(year):
    """Download and process eGRID national totals for validation.

    Resulting file is stored in repository
    """
    log.info(f'Processing eGRID national totals for validation of {year}')
    totals_dict = {'USHTIANT': 'Heat',
                   'USNGENAN': 'Electricity',
                   #'USETHRMO':'Steam', #PLNTYR sheet
                   'USNOXAN': 'Nitrogen oxides',
                   'USSO2AN': 'Sulfur dioxide',
                   'USCO2AN': 'Carbon dioxide',
                   'USCH4AN': 'Methane',
                   'USN2OAN': 'Nitrous oxide',
                   }

    us_totals = extract_eGRID_excel(year, 'US', index='code')
    us_totals = us_totals[list(totals_dict.keys())]
    us_totals.rename(columns=totals_dict, inplace=True)
    us_totals = us_totals.transpose().reset_index()
    us_totals = us_totals.rename(columns={'index': 'FlowName',
                                          0: 'FlowAmount'})

    steam_df = extract_eGRID_excel(year, 'PLNT', index='code')
    steam_total = steam_df['USETHRMO'].sum()
    # DataFrame.append is removed in pandas >= 2.0; use pd.concat instead
    us_totals = pd.concat(
        [us_totals, pd.DataFrame([{'FlowName': 'Steam',
                                   'FlowAmount': steam_total}])],
        ignore_index=True)

    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    usecols=['FlowName', 'Compartment'])
    us_totals = us_totals.merge(flow_compartments, how='left', on='FlowName')

    us_totals.loc[(us_totals['FlowName'] == 'Carbon dioxide') |
                  (us_totals['FlowName'] == 'Sulfur dioxide') |
                  (us_totals['FlowName'] == 'Nitrogen oxides'),
                  'Unit'] = 'tons'
    us_totals.loc[(us_totals['FlowName'] == 'Methane') |
                  (us_totals['FlowName'] == 'Nitrous oxide'),
                  'Unit'] = 'lbs'
    us_totals.loc[(us_totals['FlowName'] == 'Heat') |
                  (us_totals['FlowName'] == 'Steam'),
                  'Unit'] = 'MMBtu'
    us_totals.loc[(us_totals['FlowName'] == 'Electricity'), 'Unit'] = 'MWh'

    log.info(f'saving eGRID_{year}_NationalTotals.csv to {DATA_PATH}')
    us_totals = us_totals[['FlowName', 'Compartment', 'FlowAmount', 'Unit']]
    us_totals.to_csv(DATA_PATH.joinpath(f'eGRID_{year}_NationalTotals.csv'),
                     index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'eGRID',
                       'Version': _config[year]['file_version'],
                       'Year': year,
                       'Name': 'eGRID Data Files',
                       'URL': _config[year]['download_url'],
                       'Criteria': 'Extracted from US Total tab, or for '
                                   'steam, summed from PLNT tab',
                       }
    update_validationsets_sources(validation_dict)
def storeCombinedInventory(df, file_name, category=''):
    """Store the inventory dataframe to local directory based on category."""
    meta = set_stewicombo_meta(file_name, category)
    method_path = output_dir + '/' + meta.category
    try:
        log.info(f'saving {meta.name_data} to {method_path}')
        write_df_to_file(df, paths, meta)
    except Exception:
        log.error('Failed to save inventory')
def read_FRS_file(file_name, col_dict): """Retrieve FRS data file stored locally.""" file_meta = set_facilitymatcher_meta(file_name, category=ext_folder) log.info('loading %s from %s', file_meta.name_data, FRSpath) file_meta.name_data = strip_file_extension(file_meta.name_data) file_meta.ext = 'csv' df = load_preprocessed_output(file_meta, paths) df_FRS = pd.DataFrame() for k, v in col_dict.items(): df_FRS[k] = df[k].astype(v) return df_FRS
def download_data(url_params, filepath: Path, sic_list) -> str:
    """Download DMR data, paging through results; return a status string."""
    df = pd.DataFrame()
    if sic_list:
        skip_errors = True
    else:
        skip_errors = False
        sic_list = ['']
    for sic in sic_list:
        url_params['p_sic2'] = sic
        counter = 1
        pages = 1
        while counter <= pages:
            url_params['pageno'] = counter
            url = generate_url(url_params)
            log.debug(url)
            for _ in range(3):
                try:
                    r = requests.get(url)
                    r.raise_for_status()
                    result = pd.DataFrame(r.json())
                    break
                except requests.exceptions.HTTPError as err:
                    log.info(err)
                    time.sleep(20)
            else:
                log.warning("exceeded max attempts")
                return 'other_error'
            if 'Error' in result.index:
                if skip_errors:
                    log.debug(f"error in sic_{sic}")
                    break
                elif result['Results'].astype(str).str.contains('Maximum').any():
                    return 'max_error'
                else:
                    return 'other_error'
            elif 'NoDataMsg' in result.index:
                if skip_errors:
                    log.debug(f"no data in sic_{sic}")
                    break
                else:
                    return 'no_data'
            else:
                df = pd.concat([df, pd.DataFrame(result['Results']['Results'])],
                               ignore_index=True)
                # set page count
                pages = int(result['Results']['PageCount'])
                counter += 1
    log.debug(f"saving to {filepath}")
    pd.to_pickle(df, filepath)
    return 'success'
def getCombinedInventory(name, category=''):
    """Read the inventory dataframe from local directory.

    :param name: str, name of dataset or name of file
    :param category: str, name of category folder, if any
    """
    if ("." + WRITE_FORMAT) in name:
        method_path = output_dir + '/' + category
        inventory = read_into_df(method_path + name)
    else:
        meta = set_stewicombo_meta(name, category)
        method_path = output_dir + '/' + meta.category
        inventory = load_preprocessed_output(meta, paths)
    if inventory is None:
        log.info(f'{name} not found in {method_path}')
    else:
        log.info(f'loaded {name} from {method_path}')
    return inventory
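# Usage sketch (hypothetical dataset names): a stored dataset can be requested
# by name, or a specific file (including its write-format extension) can be
# read directly.
#
#   inv = getCombinedInventory('CAP_HAP_national_2017')
#   inv = getCombinedInventory('CAP_HAP_national_2017.parquet')
#
# The second call assumes WRITE_FORMAT == 'parquet'; any name containing
# '.' + WRITE_FORMAT is treated as a file and read from the category folder.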
def store_fm_file(df, file_name, category='', sources=None):
    """Store the facilitymatcher file to local directory."""
    meta = set_facilitymatcher_meta(file_name, category)
    method_path = output_dir + '/' + meta.category
    try:
        log.info(f'saving {meta.name_data} to {method_path}')
        write_df_to_file(df, paths, meta)
        metadata_dict = {}
        if not sources:
            sources = []
        for source in sources:
            metadata_dict[source] = read_source_metadata(
                paths,
                set_facilitymatcher_meta(strip_file_extension(source),
                                         ext_folder),
                force_JSON=True)['tool_meta']
        write_fm_metadata(file_name, metadata_dict)
    except Exception:
        log.error('Failed to save facilitymatcher file')
def validate_national_totals(nei_flowbyfacility, year): """Validate against national flow totals.""" log.info('validating flow by facility against national totals') if not DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv').is_file(): generate_national_totals(year) else: log.info('using already processed national totals validation file') nei_national_totals = pd.read_csv( DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv'), header=0, dtype={"FlowAmount[kg]": float}) nei_national_totals.rename(columns={'FlowAmount[kg]': 'FlowAmount'}, inplace=True) validation_result = validate_inventory(nei_flowbyfacility, nei_national_totals, group_by='flow', tolerance=5.0) write_validation_result('NEI', year, validation_result)
def validate_national_totals_by_subpart(tab_df, year):
    """Validate GHGRP tabulated data against national totals by subpart."""
    log.info('validating flowbyfacility against national totals')

    # apply CO2e factors for some flows
    mask = (tab_df['AmountCO2e'].isna() & tab_df['FlowID'].isin(flows_CO2e))
    tab_df.loc[mask, 'Flow Description'] = 'Fluorinated GHG Emissions (mt CO2e)'
    subpart_L_GWPs = load_subpart_l_gwp()
    subpart_L_GWPs.rename(columns={'Flow Name': 'FlowName'}, inplace=True)
    tab_df = tab_df.merge(subpart_L_GWPs, how='left',
                          on=['FlowName', 'Flow Description'])
    tab_df['CO2e_factor'] = tab_df['CO2e_factor'].fillna(1)
    tab_df.loc[mask, 'AmountCO2e'] = tab_df['FlowAmount'] * tab_df['CO2e_factor']

    # for subset of flows, use CO2e for validation
    mask = tab_df['FlowID'].isin(flows_CO2e)
    tab_df.loc[mask, 'FlowAmount'] = tab_df['AmountCO2e']

    # parse tabulated data
    tab_df.drop(columns=['FacilityID', 'DataReliability', 'FlowName'],
                inplace=True)
    tab_df.rename(columns={'Process': 'SubpartName',
                           'FlowID': 'FlowName'}, inplace=True)

    # import and parse reference data
    ref_df = pd.read_csv(
        DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv'))
    ref_df.drop(columns=['FlowName'], inplace=True)
    ref_df.rename(columns={'SUBPART_NAME': 'SubpartName',
                           'FlowCode': 'FlowName'}, inplace=True)

    validation_result = validate_inventory(tab_df, ref_df, group_by='subpart')
    # Update flow names to indicate which are in CO2e
    validation_result.loc[
        validation_result['FlowName'].isin(flows_CO2e),
        'FlowName'] = validation_result['FlowName'] + ' (CO2e)'
    write_validation_result('GHGRP', year, validation_result)
def validate_state_totals(report_year, flowbyfacility):
    """Validate RCRAInfo flow by facility data against state totals."""
    log.info('validating data against state totals')
    file_path = DATA_PATH.joinpath(f'RCRAInfo_{report_year}_StateTotals.csv')
    if file_path.is_file():
        totals = pd.read_csv(file_path, dtype={"FlowAmount_kg": float})
        # Rename cols to match reference format
        totals.rename(columns={'FlowAmount_kg': 'FlowAmount'}, inplace=True)
        # Validate waste generated against state totals, include only NBR data
        flowbyfacility['State'] = flowbyfacility['FacilityID'].str[0:2]
        flowbyfacility = apply_filters_to_inventory(
            flowbyfacility, 'RCRAInfo', report_year,
            ['National_Biennial_Report', 'imported_wastes', 'US_States_only'])
        validation_df = validate_inventory(flowbyfacility, totals,
                                           group_by='state')
        write_validation_result('RCRAInfo', report_year, validation_df)
    else:
        log.warning(
            f'validation file for RCRAInfo_{report_year} does not exist.')
def apply_filters_to_inventory(inventory, inventory_acronym, year, filters, download_if_missing=False): """Apply one or more filters from a passed list to an inventory dataframe. :param inventory: df of stewi inventory of type flowbyfacility or flowbyprocess :param inventory_acronym: str of inventory e.g. 'NEI' :param year: year as number like 2010 :param filters: a list of named filters to apply to inventory :param download_if_missing: bool, if True will attempt to load from remote server prior to generating if file not found locally :return: DataFrame of filtered inventory """ if 'filter_for_LCI' in filters: for name in filter_config['filter_for_LCI']['filters']: if name not in filters: filters.append(name) compare_to_available_filters(filters) if 'US_States_only' in filters: log.info('filtering for US states') inventory = filter_states(inventory, inventory_acronym=inventory_acronym, year=year, download_if_missing=download_if_missing) if inventory_acronym == 'DMR' and 'remove_duplicate_organic_enrichment' in filters: from stewi.DMR import remove_duplicate_organic_enrichment inventory = remove_duplicate_organic_enrichment(inventory) if inventory_acronym == 'RCRAInfo' and 'National_Biennial_Report' in filters: log.info('filtering for National Biennial Report') fac_list = read_inventory('RCRAInfo', year, StewiFormat.FACILITY, download_if_missing) fac_list = fac_list[['FacilityID', 'Generator ID Included in NBR'] ].drop_duplicates(ignore_index=True) inventory = inventory.merge(fac_list, how='left') inventory = inventory[inventory['Generator ID Included in NBR'] == 'Y'] inventory = inventory[inventory['Source Code'] != 'G61'] inventory = inventory[inventory['Generator Waste Stream Included in NBR'] == 'Y'] if inventory_acronym == 'RCRAInfo' and 'imported_wastes' in filters: log.info('removing imported wastes') imp_source_codes = filter_config['imported_wastes']['parameters']['source_codes'] inventory = inventory[~inventory['Source Code'].isin(imp_source_codes)] if 'flows_for_LCI' in filters: flow_filter_list = filter_config['flows_for_LCI']['parameters'].get(inventory_acronym) if flow_filter_list is not None: log.info('removing flows not relevant for LCI') inventory = inventory[~inventory['FlowName'].isin(flow_filter_list)] return inventory
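# A usage sketch (hypothetical year) applying the same RCRAInfo filters used
# by validate_state_totals above:
#
#   fbf = read_inventory('RCRAInfo', 2017, StewiFormat.FLOWBYFACILITY)
#   fbf = apply_filters_to_inventory(fbf, 'RCRAInfo', 2017,
#                                    ['National_Biennial_Report',
#                                     'imported_wastes',
#                                     'US_States_only'])
#
# Passing 'filter_for_LCI' expands to all filter names listed under
# filter_config['filter_for_LCI']['filters'] before any filtering is applied.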
def get_SRSInfo_for_program_list(inventory):
    """Query SRS for flows on each program list associated with an inventory."""
    # See all lists
    # https://cdxnodengn.epa.gov/cdx-srs-rest/reference/substance_lists
    # Base URL for queries
    substancesbylistname = 'substances/list_acronym/'
    srs_flow_df = pd.DataFrame()
    for listname in inventory_to_SRSlist_acronymns[inventory]:
        log.debug('Getting %s', listname)
        lists_of_interest = obtain_list_names(listname)
        url = base + substancesbylistname + urllib.parse.quote(listname)
        flow_info = query_SRS_for_program_list(url, inventory,
                                               lists_of_interest)
        if len(flow_info) == 0:
            log.info(f'No flows found for {listname}')
        srs_flow_df = pd.concat([srs_flow_df, flow_info])
    srs_flow_df.drop_duplicates(inplace=True)
    if inventory == 'TRI':
        srs_flow_df['PGM_ID'] = srs_flow_df['PGM_ID'].apply(
            lambda x: str(x).lstrip('0'))
    srs_flow_df.sort_values(by='PGM_ID', inplace=True)
    return srs_flow_df
def extract_TRI_data_files(link_zip, files, year):
    """Extract TRI flat files from the zip archive and save each as csv."""
    r_file = requests.get(link_zip)
    for file in files:
        df_columns = pd.read_csv(
            TRI_DATA_PATH.joinpath(f'TRI_File_{file}_columns.txt'),
            header=0)
        columns = list(df_columns['Names'])
        filename = f'US_{file}_{year}'
        dic = {}
        i = 0
        with zipfile.ZipFile(io.BytesIO(r_file.content)) as z:
            with io.TextIOWrapper(z.open(filename + '.txt', mode='r'),
                                  errors='replace') as txtfile:
                for line in txtfile:
                    dic[i] = pd.Series(line.split('\t')).truncate(
                        after=len(columns) - 1)
                    i += 1
        # remove the first row in the dictionary, which holds the original headers
        del dic[0]
        df = pd.DataFrame.from_dict(dic, orient='index')
        df.columns = columns
        OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        df.to_csv(OUTPUT_PATH.joinpath(f'{filename}.csv'), index=False)
        log.info(f'{filename}.csv saved to {OUTPUT_PATH}')
def download_eGRID(year): """Download eGRID files from EPA website.""" log.info(f'downloading eGRID data for {year}') download_url = _config[year]['download_url'] egrid_file_name = _config[year]['file_name'] r = make_url_request(download_url) # extract .xlsx workbook if year == '2016' or year == '2014': z = zipfile.ZipFile(io.BytesIO(r.content)) workbook = z.read(egrid_file_name) else: workbook = r.content # save .xlsx workbook to destination directory destination = OUTPUT_PATH.joinpath(egrid_file_name) # if destination folder does not already exist, create it OUTPUT_PATH.mkdir(parents=True, exist_ok=True) with open(destination, 'wb') as output: output.write(workbook) log.info(f'{egrid_file_name} saved to {OUTPUT_PATH}')
def standardize_output(year, source='Point'):
    """Read and parse NEI data.

    :param year: str, year of NEI dataset
    :param source: str, data source, e.g. 'Point'
    :returns nei: DataFrame of parsed NEI data
    """
    nei = pd.DataFrame()
    # read in nei files and concatenate all nei files into one dataframe
    nei_file_path = _config[year]['file_name']
    for file in nei_file_path:
        filename = OUTPUT_PATH.joinpath(file)
        if not filename.is_file():
            log.info(f'{file} not found in {OUTPUT_PATH}, '
                     'downloading source data')
            # download source file and metadata
            file_meta = set_stewi_meta(strip_file_extension(file))
            file_meta.category = EXT_DIR
            file_meta.tool = file_meta.tool.lower()
            download_from_remote(file_meta, paths)
        # concatenate all other files
        log.info(f'reading NEI data from {filename}')
        nei = pd.concat([nei, read_data(year, filename)])
        log.debug(f'{str(len(nei))} records')
    # convert TON to KG
    nei['FlowAmount'] = nei['FlowAmount'] * USton_kg

    log.info('adding Data Quality information')
    if source == 'Point':
        nei_reliability_table = get_reliability_table_for_source('NEI')
        nei_reliability_table['Code'] = nei_reliability_table['Code'].astype(float)
        nei['ReliabilityScore'] = nei['ReliabilityScore'].astype(float)
        nei = nei.merge(nei_reliability_table, left_on='ReliabilityScore',
                        right_on='Code', how='left')
        nei['DataReliability'] = nei['DQI Reliability Score']
        # drop Code and DQI Reliability Score columns
        nei = nei.drop(columns=['Code', 'DQI Reliability Score',
                                'ReliabilityScore'])
        nei['Compartment'] = 'air'
        # Modify compartment based on stack height (ft)
        # nei.loc[nei['StackHeight'] < 32, 'Compartment'] = 'air/ground'
        # nei.loc[(nei['StackHeight'] >= 32) & (nei['StackHeight'] < 164),
        #         'Compartment'] = 'air/low'
        # nei.loc[(nei['StackHeight'] >= 164) & (nei['StackHeight'] < 492),
        #         'Compartment'] = 'air/high'
        # nei.loc[nei['StackHeight'] >= 492, 'Compartment'] = 'air/very high'
    else:
        nei['DataReliability'] = 3
    # add Source column
    nei['Source'] = source
    nei.reset_index(drop=True, inplace=True)
    return nei
def download_state_totals_validation(year):
    """Generate file of state totals downloaded from ECHO as csv for validation.

    Annual totals are stored in the repository.
    """
    log.info('generating state totals')
    # https://echo.epa.gov/trends/loading-tool/get-data/state-statistics
    url = _config['state_url'].replace("__year__", year)
    state_csv = pd.read_csv(url, header=2)
    state_totals = pd.DataFrame()
    state_totals['state_name'] = state_csv['State']
    state_totals['FlowName'] = 'All'
    state_totals['Compartment'] = 'water'
    state_totals['Amount'] = (
        state_csv['Total Pollutant Pounds (lb/yr) for Majors'] +
        state_csv['Total Pollutant Pounds (lb/yr) for Non-Majors'])
    state_totals['Unit'] = 'lb'
    state_names = states_df[['states', 'state_name']]
    state_totals = state_totals.merge(state_names, how='left',
                                      on='state_name')
    state_totals.drop(columns=['state_name'], inplace=True)
    state_totals.dropna(subset=['states'], inplace=True)
    state_totals.rename(columns={'states': 'State'}, inplace=True)
    log.info(f'saving DMR_{year}_StateTotals.csv to {DATA_PATH}')
    state_totals.to_csv(DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv"),
                        index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'DMR',
                       #'Version': '',
                       'Year': year,
                       'Name': 'State statistics',
                       'URL': 'https://echo.epa.gov/trends/loading-tool/'
                              'get-data/state-statistics',
                       'Criteria': 'Check totals by state',
                       }
    update_validationsets_sources(validation_dict)
def organize_br_reporting_files_by_year(tables, year):
    """Consolidate BR_REPORTING files to a single csv."""
    year = int(year)
    for table in tables:
        if 'BR_REPORTING' in table:
            log.info(f'organizing data for {table} from {str(year)}...')
            linewidthsdf = pd.read_csv(
                RCRA_DATA_PATH.joinpath('RCRA_FlatFile_LineComponents.csv'))
            fields = linewidthsdf['Data Element Name'].tolist()
            files = sorted(OUTPUT_PATH.glob(f'{table}*{str(year)}*.csv'))
            df_full = pd.DataFrame()
            for filepath in files:
                log.info(f'extracting {filepath}')
                df = pd.read_csv(filepath, header=0,
                                 usecols=list(range(0, len(fields))),
                                 names=fields,
                                 low_memory=False,
                                 encoding='utf-8')
                df = df[df['Report Cycle'].apply(
                    lambda x: str(x).replace('.0', '').isdigit())]
                # normalize street numbers read in as floats (e.g. '123.0')
                df['Location Street Number'] = df[
                    'Location Street Number'].apply(
                    lambda x: str(x).replace('.0', ''))
                df['Report Cycle'] = df['Report Cycle'].astype(int)
                df = df[df['Report Cycle'] == year]
                df_full = pd.concat([df_full, df])
            DIR_RCRA_BY_YEAR.mkdir(exist_ok=True)
            filepath = DIR_RCRA_BY_YEAR.joinpath(
                f'br_reporting_{str(year)}.csv')
            log.info(f'saving to {filepath}...')
            df_full.to_csv(filepath, index=False)
            generate_metadata(year, files, datatype='source')
        else:
            log.info(f'skipping {table}')
def download_extract_FRS_combined_national(file=None):
    """Download and extract file from source to local directory."""
    url = FRS_config['url']
    log.info('initiating url request from %s', url)
    content = requests.get(url).content
    zip_file = zipfile.ZipFile(io.BytesIO(content))
    source_dict = dict(source_metadata)
    source_dict['SourceType'] = 'Zip file'
    source_dict['SourceURL'] = url
    if file is None:
        log.info(f'extracting all FRS files from {url}')
        name = 'FRS_Files'
        zip_file.extractall(FRSpath)
    else:
        log.info('extracting %s from %s', file, url)
        zip_file.extract(file, path=FRSpath)
        source_dict['SourceFileName'] = file
        name = strip_file_extension(file)
    source_dict['SourceAcquisitionTime'] = datetime.now().strftime('%d-%b-%Y')
    write_fm_metadata(name, source_dict, category=ext_folder)
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download DMR files from web\
                        [B] Generate StEWI inventory outputs and\
                        validate to state totals\
                        [C] Download state totals',
                        type=str)

    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What DMR year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':
            log.info(f"Querying for {year}")

            # two-digit SIC codes from the advanced search drop down,
            # stripped and formatted as a list
            sic2 = list(pd.read_csv(DMR_DATA_PATH.joinpath('2_digit_SIC.csv'),
                        dtype={'SIC2': str})['SIC2'])
            # Query by state, then by SIC-state where necessary
            result_dict = query_dmr(year=year)
            log.debug('possible errors: ' + ', '.join(
                [s for s in result_dict.keys()
                 if result_dict[s] != 'success']))
            state_max_error_list = [s for s in result_dict.keys()
                                    if result_dict[s] == 'max_error']
            state_no_data_list = [s for s in result_dict.keys()
                                  if result_dict[s] == 'no_data']
            if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                log.info('all states successfully downloaded')
            else:
                if (len(state_max_error_list) > 0):
                    log.error(f"Max error: {' '.join(state_max_error_list)}")
                if (len(state_no_data_list) > 0):
                    log.error(f"No data error: {' '.join(state_no_data_list)}")
                log.info('Breaking up queries further by SIC')
                result_dict = query_dmr(year=year, sic_list=sic2,
                                        state_list=state_max_error_list)
                sic_state_max_error_list = [s for s in result_dict.keys()
                                            if result_dict[s] == 'max_error']
                if len(sic_state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(sic_state_max_error_list)}")

            log.info(f"Querying nutrients for {year}")
            # Query aggregated nutrients data
            for nutrient in ['N', 'P']:
                result_dict = query_dmr(year=year, nutrient=nutrient)
                log.debug('possible errors: ' + ', '.join(
                    [s for s in result_dict.keys()
                     if result_dict[s] != 'success']))
                state_max_error_list = [s for s in result_dict.keys()
                                        if result_dict[s] == 'max_error']
                state_no_data_list = [s for s in result_dict.keys()
                                      if result_dict[s] == 'no_data']
                if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                    log.info(f'all states successfully downloaded for {nutrient}')
                else:
                    result_dict = query_dmr(year=year, sic_list=sic2,
                                            state_list=state_max_error_list,
                                            nutrient=nutrient)
            # write metadata
            generate_metadata(year, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'generating inventories for DMR {year}')
            state_df = combine_DMR_inventory(year)
            state_df = filter_states(standardize_df(state_df))

            # Validation against state totals is done prior to combining
            # with aggregated nutrients
            validate_state_totals(state_df, year)

            P_df = combine_DMR_inventory(year, nutrient='P')
            N_df = combine_DMR_inventory(year, nutrient='N')

            nut_drop_list = read_pollutant_parameter_list()
            nut_drop_list = nut_drop_list[(nut_drop_list['NITROGEN'] == 'Y') |
                                          (nut_drop_list['PHOSPHORUS'] == 'Y')]
            nut_drop_list = list(set(nut_drop_list['FlowName']))

            # Consolidate N and P based flows to reflect nutrient aggregation
            P_df = consolidate_nutrients(P_df, nut_drop_list, 'P')
            N_df = consolidate_nutrients(N_df, nut_drop_list, 'N')

            nutrient_agg_df = pd.concat([P_df, N_df])
            nutrient_agg_df = filter_states(standardize_df(nutrient_agg_df))

            # Filter out nitrogen and phosphorus flows before combining
            # with aggregated nutrients
            dmr_nut_filtered = state_df[~state_df['FlowName'].isin(nut_drop_list)]
            dmr_df = pd.concat([dmr_nut_filtered,
                                nutrient_agg_df]).reset_index(drop=True)

            # PermitTypeCode needed for state validation but not maintained
            dmr_df = dmr_df.drop(columns=['PermitTypeCode'])

            # generate output for facility
            facility_columns = ['FacilityID', 'FacilityName', 'City',
                                'State', 'Zip', 'Latitude', 'Longitude',
                                'County', 'NAICS', 'SIC']  # 'Address' not in DMR
            dmr_facility = dmr_df[facility_columns].drop_duplicates()
            store_inventory(dmr_facility, 'DMR_' + year, 'facility')

            # generate output for flow
            flow_columns = ['FlowID', 'FlowName']
            dmr_flow = dmr_df[flow_columns].drop_duplicates()
            dmr_flow.sort_values(by=['FlowName'], inplace=True)
            dmr_flow['Compartment'] = 'water'
            dmr_flow['Unit'] = 'kg'
            store_inventory(dmr_flow, 'DMR_' + year, 'flow')

            # generate output for flowbyfacility
            fbf_columns = ['FlowName', 'FlowAmount', 'FacilityID',
                           'DataReliability']
            dmr_fbf = dmr_df[fbf_columns].reset_index(drop=True)
            dmr_fbf = aggregate(dmr_fbf, ['FacilityID', 'FlowName'])
            dmr_fbf['Compartment'] = 'water'
            dmr_fbf['Unit'] = 'kg'
            store_inventory(dmr_fbf, 'DMR_' + year, 'flowbyfacility')

            # write metadata
            generate_metadata(year, datatype='inventory')

        if kwargs['Option'] == 'C':
            download_state_totals_validation(year)
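# Hypothetical invocations, assuming this module is run as a script
# (e.g., DMR.py): download the raw queries for a year, then generate and
# validate the inventory outputs.
#
#   python DMR.py A -Y 2016
#   python DMR.py B -Y 2016
#
# main() also accepts keyword arguments directly, bypassing argparse:
#
#   main(Option='B', Year=['2016'])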
def generate_national_totals(year):
    """Generate dataframe of national emissions and save to csv.

    Requires the chem_release dataset to be downloaded manually prior
    to running
    """
    filename = TRI_DATA_PATH.joinpath(f'TRI_chem_release_{year}.csv')
    df = pd.read_csv(filename, header=0)
    # replace cells containing only ',' or '.' with 0.0
    df.replace(',', 0.0, inplace=True)
    df.replace('.', 0.0, inplace=True)
    cols = ['Compartment', 'FlowName', 'Unit', 'FlowAmount']
    compartments = {
        'air': ['Fugitive Air Emissions', 'Point Source Air Emissions'],
        'water': ['Surface Water Discharges'],
        'soil': [
            'On-site Land Treatment', 'Other On-site Land Disposal',
            'Off-site Land Treatment', 'Other Off-site Land Disposal'
        ]
    }
    # remove entries where all values are 0
    v = [col for col in df.columns if col != 'Chemical']
    df = df.loc[~(df[v] == 0).all(axis=1)]
    df_National = pd.DataFrame()
    for compartment, columns in compartments.items():
        df_aux = df[['Chemical'] + columns].reset_index(drop=True)
        for column in columns:
            df_aux[column] = df_aux[column].str.replace(',', '').astype('float')
        df_aux['FlowAmount'] = df_aux[columns].sum(axis=1)
        df_aux.rename(columns={'Chemical': 'FlowName'}, inplace=True)
        df_aux['Unit'] = 'Pounds'
        df_aux['Compartment'] = compartment
        df_National = pd.concat([df_National, df_aux], axis=0,
                                ignore_index=True, sort=True)
        del df_aux
    del df
    df_National['FlowAmount'] = df_National['FlowAmount'].round(3)
    df_National = df_National[cols]
    df_National = map_to_fedefl(df_National)
    if df_National is None:
        log.warning('Totals not generated')
        return
    df_National.sort_values(by=['FlowName', 'Compartment'], inplace=True)
    log.info(f'saving TRI_{year}_NationalTotals.csv to {DATA_PATH}')
    df_National.to_csv(DATA_PATH.joinpath(f'TRI_{year}_NationalTotals.csv'),
                       index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(filename.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {
        'Inventory': 'TRI',
        #'Version': '',
        'Year': year,
        'Name': 'TRI Explorer',
        'URL': 'https://enviro.epa.gov/triexplorer/tri_release.chemical',
        'Criteria': 'Year, All of United States, All Chemicals, '
                    'All Industries, Details:(Other On-Site Disposal or '
                    'Other Releases, Other Off-Site Disposal or Other Releases), '
                    'mapped to FEDEFL',
        'Date Acquired': date_created,
    }
    update_validationsets_sources(validation_dict, date_acquired=True)
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download TRI flat files from TRI Data Plus.\
                        [B] Format national totals for TRI from downloaded\
                        national files.\
                        [C] Generate StEWI inventory files from downloaded files',
                        type=str)
    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What TRI year(s) you want to retrieve',
                        type=str)
    parser.add_argument('-F', '--Files', nargs='+',
                        help='What TRI Files you want (e.g., 1a, 2a, etc).\
                        Check:\
                        https://www.epa.gov/toxics-release-inventory-tri-program/\
                        tri-basic-plus-data-files-guides',
                        default=['1a', '3a'],
                        required=False)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    files = kwargs['Files']

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':
            log.info('downloading TRI files from source for %s', year)
            tri_url = _config['url']
            if url_is_alive(tri_url):
                link_zip_TRI = link_zip(tri_url, _config['queries'], year)
                extract_TRI_data_files(link_zip_TRI, files, year)
                generate_metadata(year, files, datatype='source')
            else:
                log.error('The URL in config.yaml ({}) for TRI is not '
                          'reachable.'.format(tri_url))

        elif kwargs['Option'] == 'B':
            # Website for National Totals
            # https://enviro.epa.gov/triexplorer/tri_release.chemical
            # Steps:
            # (1) Select Year of Data, All of United States, All Chemicals,
            # All Industries, and other needed options (based on the
            # desired year).
            # Columns: check 'Other On-site Disposal or Other Releases' and
            # 'Other Off-site Disposal or Other Releases'
            # (2) Export to CSV
            # (3) Drop the unneeded rows, including the extra dioxin row
            # at the bottom
            # (4) Organize the columns as needed (check existing files)
            # (5) Save the file as TRI_chem_release_<year>.csv in the data folder
            # (6) Run this code
            generate_national_totals(year)

        elif kwargs['Option'] == 'C':
            log.info(f'generating TRI inventory from files for {year}')
            Generate_TRI_files_csv(year, files)
            generate_metadata(year, files, datatype='inventory')
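# Hypothetical invocations, assuming this module is run as a script
# (e.g., TRI.py):
#
#   python TRI.py A -Y 2017 -F 1a 3a
#   python TRI.py C -Y 2017
#
# Option B requires TRI_chem_release_<year>.csv to be exported manually from
# TRI Explorer (steps above) before running:
#
#   python TRI.py B -Y 2017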