def extacting_TRI_data_files(link_zip, files, year):
    external_dir = set_dir(data_dir + '../../../')
    r_file = requests.get(link_zip)
    for file in files:
        df_columns = pd.read_csv(data_dir + 'TRI_File_' + file + '_columns.txt',
                                 header=0)
        columns = list(df_columns['Names'])
        n_columns = len(columns)
        with zipfile.ZipFile(io.BytesIO(r_file.content)) as z:
            z.extract('US_' + file + '_' + year + '.txt', external_dir + 'TRI')
        df = pd.read_csv(external_dir + 'TRI/US_' + file + '_' + year + '.txt',
                         header=None, encoding='ISO-8859-1',
                         error_bad_lines=False, sep='\t', low_memory=False,
                         skiprows=[0], lineterminator='\n',
                         usecols=range(n_columns))  # avoiding \r\n created in Windows OS
        df.columns = columns
        df.to_csv(external_dir + 'TRI/US_' + file + '_' + year + '.txt',
                  sep='\t', index=False)
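
# A minimal usage sketch (illustrative only): the year is a placeholder and the
# file identifiers mirror the '1a'/'3a' basic data files read elsewhere in this
# module; in practice link_zip() supplies the real download link, as in
# Generate_TRI_files_csv below.
# link_zip_TRI = link_zip(tri_url, _config['queries'], '2016')
# extacting_TRI_data_files(link_zip_TRI, ['1a', '3a'], '2016')
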
def organizing_files_by_year(Tables, Path, Years_saved):
    for Table in Tables:
        # Get file columns widths
        dir_RCRA_by_year = set_dir(Path + 'RCRAInfo_by_year/')
        linewidthsdf = pd.read_csv(data_dir + 'RCRA_FlatFile_LineComponents.csv')
        BRwidths = linewidthsdf['Size'].astype(int).tolist()
        BRnames = linewidthsdf['Data Element Name'].tolist()
        Files = [file for file in os.listdir(Path)
                 if (file.startswith(Table)) & file.endswith('.txt')]
        Files.sort()
        for File in Files:
            df = pd.read_fwf(Path + File, widths=BRwidths,
                             header=None, names=BRnames, encoding='utf-8')
            df.sort_values(by=['Report Cycle'])
            df = df[df['Report Cycle'].apply(lambda x: str(x).isnumeric())]
            df['Report Cycle'] = df['Report Cycle'].astype(int)
            df = df[~df['Report Cycle'].isin(Years_saved)]
            Years = list(df['Report Cycle'].unique())
            for Year in Years:
                if re.match(r'\d{4}', str(int(Year))):
                    df_year = df[df['Report Cycle'] == Year]
                    Path_directory = (dir_RCRA_by_year + 'br_reporting_' +
                                      str(int(Year)) + '.txt')
                    condition = True
                    while condition:
                        try:
                            if os.path.exists(Path_directory):
                                with open(Path_directory, 'a') as f:
                                    df_year.to_csv(f, header=False,
                                                   sep='\t', index=False)
                            else:
                                df_year.to_csv(Path_directory,
                                               sep='\t', index=False)
                            condition = False
                        except UnicodeEncodeError:
                            for column in df_year:
                                if df_year[column].dtype == object:
                                    df_year[column] = df_year[column].map(
                                        lambda x: x.replace(u'\uFFFD', '?')
                                        if type(x) == str else x)
                            condition = True
                else:
                    continue
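
# A minimal usage sketch (illustrative only; the table-name year and the list of
# already-saved years are placeholders): splits downloaded BR_REPORTING flat
# files found under RCRAInfopath into one br_reporting_<year>.txt file per
# report cycle, appending to any per-year file that already exists.
# organizing_files_by_year(['BR_REPORTING_2017'], RCRAInfopath, [2015, 2016])
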
def import_TRI_by_release_type(d, year):
    # Import TRI file
    external_dir = set_dir(data_dir + '../../../')
    tri_release_output_fieldnames = ['FacilityID', 'CAS', 'FlowName', 'Unit',
                                     'FlowAmount', 'Basis of Estimate',
                                     'ReleaseType']
    tri = pd.DataFrame()
    for k, v in d.items():
        # create a data type dictionary
        dtype_dict = {'TRIFID': "str", 'CHEMICAL NAME': "str",
                      'CAS NUMBER': "str", 'UNIT OF MEASURE': "str"}
        # If a basis of estimate field is present, set its type to string
        if len(v) > 5:
            dtype_dict[v[5]] = "str"
        if (k == 'offsiteland') | (k == 'offsiteother'):
            file = '3a'
        else:
            file = '1a'
        tri_csv = external_dir + 'TRI/US_' + file + '_' + year + '.txt'
        tri_part = pd.read_csv(tri_csv, sep='\t', header=0, usecols=v,
                               dtype=dtype_dict, na_values=['NO'],
                               error_bad_lines=False, low_memory=False,
                               converters={v[4]: lambda x:
                                           pd.to_numeric(x, errors='coerce')})
        tri_part['ReleaseType'] = k
        tri_part.columns = tri_release_output_fieldnames
        tri = pd.concat([tri, tri_part])
    return tri
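
# Illustrative shape of the release-type dictionary this function expects (the
# real dictionary is built by dict_create() from TRI_required_fields.txt and
# TRI_keys.txt; the amount and basis-of-estimate column names below are
# placeholders, not actual TRI field names). Each value lists the columns to
# read, ordered to match tri_release_output_fieldnames, with the amount column
# at index 4 and, when present, the basis-of-estimate column at index 5.
# example_dict = {
#     'offsiteland': ['TRIFID', 'CAS NUMBER', 'CHEMICAL NAME', 'UNIT OF MEASURE',
#                     '<amount column>', '<basis of estimate column>'],
# }
# tri = import_TRI_by_release_type(example_dict, '2016')
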
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir + 'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type
    # to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Convert to float if there are errors - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    # Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[reliability_table['Source'] == 'TRI']
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    # Merge with reliability table to get a reliability score for each basis of estimate
    tri = pd.merge(tri, tri_reliability_table, left_on='Basis of Estimate',
                   right_on='Code', how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    # Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    # Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({'FlowAmount': 'sum',
                                          'ReliabilityScore': wm})
    tri = tri.reset_index()
    # VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv', header=0,
                                      dtype={"FlowAmount": np.float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924, 'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri, tri_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    # FLOWS
    flows = tri.groupby(['FlowName', 'CAS', 'Compartment']).count().reset_index()
    # stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']]
    flowsdf['FlowID'] = flowsdf['CAS']
    # export chemicals
    # !!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv', index=False)
    # FLOW BY FACILITY
    # drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    # FACILITY
    ## Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt', sep='\t', header=0,
                               usecols=import_facility,
                               error_bad_lines=False, low_memory=False)
    # get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    # Use group by to eliminate additional ID duplicates
    # tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    # tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    # rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk, inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv', index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
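
# A minimal usage sketch (illustrative only; the year and file list are
# placeholders): Files names the TRI basic data files previously extracted by
# extacting_TRI_data_files, matching the '1a'/'3a' files read above.
# Generate_TRI_files_csv('2016', ['1a', '3a'])
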
    help='What RCRAInfo tables you want.\
    Check:\
    https://rcrainfopreprod.epa.gov/rcrainfo-help/application/publicHelp/index.htm',
    required=False, default=[None])

args = parser.parse_args()

# Metadata
BR_meta = globals.inventory_metadata

# RCRAInfo url
_config = config()['databases']['RCRAInfo']
RCRAfInfoflatfileURL = _config['url']
RCRAInfopath = set_dir(data_dir + "../../../RCRAInfo/")

## Adds specified Year to BR_REPORTING table
tables = args.Tables
if 'BR_REPORTING' in tables:
    args.Tables[tables.index('BR_REPORTING')] = 'BR_REPORTING' + '_' + args.Year

if args.Option == 'A':
    query = _config['queries']['Table_of_tables']
    download_zip(RCRAfInfoflatfileURL, RCRAInfopath, args.Tables, query)
elif args.Option == 'B':
    regex = re.compile(r'RCRAInfo_(\d{4})')
# NEI import and process to Standardized EPA output format
# This script uses the NEI data exports from EIS.
from stewi.globals import set_dir, output_dir, data_dir, write_metadata,\
    inventory_metadata, get_relpath, unit_convert,\
    validate_inventory, write_validation_result, USton_kg, lb_kg
import pandas as pd
import numpy as np
import os
import time

report_year = '2016'
external_dir = set_dir('../NEI/')
nei_required_fields = pd.read_table(data_dir + 'NEI_required_fields.csv',
                                    sep=',').fillna('Null')
nei_file_path = pd.read_table(data_dir + 'NEI_' + report_year +
                              '_file_path.csv', sep=',').fillna('Null')


def read_data(source, file):
    # tmp = pd.Series(list(nei_required_fields[source]),
    #                 index=list(nei_required_fields['StandardizedEPA']))
    file_result = pd.DataFrame(
        columns=list(nei_required_fields['StandardizedEPA']))
    # read nei file by chunks
    for file_chunk in pd.read_table(
            external_dir + file,
            sep=',',
            usecols=list(set(nei_required_fields[source]) - set(['Null'])),
            chunksize=100000,