Example #1
def update_validationsets_sources(validation_dict, date_acquired=False):
    """Add or replaces metadata dictionary of validation reference dataset to
    the validation sets sources file.

    :param validation_dict: dictionary of validation metadata
    :param date_acquired:
    """
    if not date_acquired:
        date = datetime.today().strftime('%d-%b-%Y')
        validation_dict['Date Acquired'] = date
    v_table = read_ValidationSets_Sources()
    existing = v_table.loc[(v_table['Inventory'] == validation_dict['Inventory']) &
                           (v_table['Year'] == validation_dict['Year'])]
    if len(existing) > 0:
        i = existing.index[0]
        v_table = v_table.loc[~v_table.index.isin(existing.index)]
        line = pd.DataFrame.from_records([validation_dict], index=[(i)])
    else:
        inventories = list(v_table['Inventory'])
        i = max(loc for loc, val in enumerate(inventories)
                if val == validation_dict['Inventory'])
        line = pd.DataFrame.from_records([validation_dict], index=[(i+0.5)])
    v_table = pd.concat([v_table, line])
    v_table = v_table.sort_index().reset_index(drop=True)
    log.info("updating ValidationSets_Sources.csv with "
             f"{validation_dict['Inventory']} {validation_dict['Year']}")
    v_table.to_csv(DATA_PATH.joinpath('ValidationSets_Sources.csv'), index=False)
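As a side note, the insert-or-replace logic above rests on a small pandas trick: a replacement row reuses the matched row's index, while a new row gets a fractional index just after the last row of the same inventory, so that sort_index() places it correctly. A minimal, self-contained sketch of that trick with made-up data (not part of the StEWI code):

import pandas as pd

# made-up sources table standing in for ValidationSets_Sources.csv
table = pd.DataFrame({'Inventory': ['eGRID', 'eGRID', 'TRI'],
                      'Year': ['2016', '2018', '2017']})
new_row = {'Inventory': 'eGRID', 'Year': '2020'}

existing = table.loc[(table['Inventory'] == new_row['Inventory']) &
                     (table['Year'] == new_row['Year'])]
if len(existing) > 0:
    # replace the matching record in place
    i = existing.index[0]
    table = table.loc[~table.index.isin(existing.index)]
    line = pd.DataFrame.from_records([new_row], index=[i])
else:
    # slot the new record after the last row of the same inventory
    i = max(loc for loc, val in enumerate(table['Inventory'])
            if val == new_row['Inventory'])
    line = pd.DataFrame.from_records([new_row], index=[i + 0.5])
table = pd.concat([table, line]).sort_index().reset_index(drop=True)
print(table)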
Example #2
def validate_eGRID(year, flowbyfac):
    """Validate eGRID flowbyfacility data against national totals."""
    validation_file = DATA_PATH.joinpath(f"eGRID_{year}_NationalTotals.csv")
    if not validation_file.is_file():
        generate_national_totals(year)
    log.info('validating data against national totals')
    egrid_national_totals = pd.read_csv(validation_file, header=0,
                                        dtype={"FlowAmount": float})
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'lbs',
        lb_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'tons',
        USton_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MMBtu',
        MMBtu_MJ, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MWh',
        MWh_MJ, 'FlowAmount')
    # drop old unit
    egrid_national_totals.drop('Unit', axis=1, inplace=True)
    validation_result = validate_inventory(flowbyfac, egrid_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('eGRID', year, validation_result)
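unit_convert is imported from the StEWI utilities, so its implementation is not shown here; judging only from how it is called, it appears to multiply a column by a factor wherever the unit column matches. A hedged, self-contained stand-in consistent with those calls:

import pandas as pd

def unit_convert(df, coln1, coln2, unit, factor, coln3):
    """Assumed behavior: where df[coln2] equals unit, set df[coln1] = factor * df[coln3]."""
    df.loc[df[coln2] == unit, coln1] = factor * df[coln3]
    return df

# toy usage mirroring the lbs-to-kg call above (values made up)
lb_kg = 0.4535924
totals = pd.DataFrame({'FlowAmount': [10.0, 7.0], 'Unit': ['lbs', 'kg']})
totals = unit_convert(totals, 'FlowAmount', 'Unit', 'lbs', lb_kg, 'FlowAmount')
print(totals)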
Example #3
def write_validation_result(inventory_acronym, year, validation_df):
    """Write the validation result and associated metadata to local dir.

    :param inventory_acronym: str for inventory e.g. 'TRI'
    :param year: str for year e.g. '2016'
    :param validation_df: df returned from validate_inventory function
    """
    directory = Path(paths.local_path).joinpath('validation')
    directory.mkdir(parents=True, exist_ok=True)
    log.info(f'writing validation result to {directory}')
    validation_df.to_csv(directory.joinpath(f"{inventory_acronym}_{year}.csv"),
                         index=False)
    # Get metadata on validation dataset
    validation_set_info_table = read_ValidationSets_Sources()
    # Get record for year and source
    validation_set_info = validation_set_info_table[
        (validation_set_info_table['Inventory'] == inventory_acronym) &
        (validation_set_info_table['Year'] == year)]
    if len(validation_set_info) != 1:
        log.error('no validation metadata found')
        return
    # Convert to Series
    validation_set_info = validation_set_info.iloc[0, ]
    # Use the same format as inventory metadata to describe the validation set data
    validation_metadata = dict(source_metadata)
    validation_metadata['SourceFileName'] = validation_set_info['Name']
    validation_metadata['SourceVersion'] = validation_set_info['Version']
    validation_metadata['SourceURL'] = validation_set_info['URL']
    validation_metadata['SourceAcquisitionTime'] = validation_set_info['Date Acquired']
    validation_metadata['Criteria'] = validation_set_info['Criteria']
    # Write metadata to file
    write_metadata(inventory_acronym + '_' + year, validation_metadata,
                   datatype="validation")
Example #4
def validate_national_totals(inv, TRIyear):
    log.info('validating data against national totals')
    filename = DATA_PATH.joinpath(f'TRI_{TRIyear}_NationalTotals.csv')
    if filename.is_file():
        tri_national_totals = pd.read_csv(filename,
                                          header=0,
                                          dtype={"FlowAmount": float})
        tri_national_totals['FlowAmount_kg'] = 0
        tri_national_totals = unit_convert(tri_national_totals,
                                           'FlowAmount_kg', 'Unit', 'Pounds',
                                           lb_kg, 'FlowAmount')
        # drop old amount and units
        tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
        tri_national_totals.drop('Unit', axis=1, inplace=True)
        # Rename cols to match reference format
        tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                                   inplace=True)
        inv = map_to_fedefl(inv)
        if inv is not None:
            validation_result = validate_inventory(inv,
                                                   tri_national_totals,
                                                   group_by='flow',
                                                   tolerance=5.0)
            write_validation_result('TRI', TRIyear, validation_result)
    else:
        log.warning(f'validation file for TRI_{TRIyear} does not exist. '
                    'Please run option B')
Example #5
def import_or_download_table(filepath, table, year, m):
    # if data already exists on local network, import the data
    if filepath.is_file():
        log.info(f'Importing data from {table}')
        table_df, creation_time = import_table(filepath, get_time=True)
        m.add(time=creation_time,
              filename=filepath,
              filetype='Database',
              url=generate_url(table,
                               report_year=year,
                               row_start='',
                               output_ext='CSV'))

    # otherwise, download the data and save to the network
    else:
        # determine number of rows in subpart emissions table
        row_count = get_row_count(table, report_year=year)
        log.info('Downloading %s (rows: %i)', table, row_count)
        # download data in chunks
        table_df = download_chunks(table=table,
                                   table_count=row_count,
                                   m=m,
                                   report_year=year,
                                   filepath=filepath)

    if table_df is None:
        return None

    # drop any unnamed columns
    table_df = table_df.drop(columns=table_df.columns[
        table_df.columns.str.contains('unnamed', case=False)])
    return table_df
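Dropping the 'Unnamed: N' columns that pandas adds when a CSV was written with its index is the last step above; a tiny self-contained illustration of the same column filter (sample frame invented):

import pandas as pd

df = pd.DataFrame({'Unnamed: 0': [0, 1],
                   'FacilityID': ['A', 'B'],
                   'FlowAmount': [1.0, 2.0]})
df = df.drop(columns=df.columns[df.columns.str.contains('unnamed', case=False)])
print(df.columns.tolist())  # ['FacilityID', 'FlowAmount']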
Example #6
def combine_DMR_inventory(year, nutrient=''):
    """Loop through pickled data and combines into a dataframe."""
    path = OUTPUT_PATH.joinpath(year)
    if not path.is_dir():
        raise stewi.exceptions.DataNotFoundError
    output_df = pd.DataFrame()
    filestub = ''
    if nutrient:
        filestub = nutrient + '_'
        log.info(f'reading stored DMR queries by state for {nutrient}...')
    else:
        log.info('reading stored DMR queries by state...')
    for state in STATES:
        log.debug(f'accessing data for {state}')
        filepath = path.joinpath(f'{filestub}state_{state}.pickle')
        result = unpickle(filepath)
        if result is None:
            log.warning(f'No data found for {state}. Retrying query...')
            if (query_dmr(year=year, sic_list=None,
                         state_list=[state],
                         nutrient=nutrient).get(state) == 'success'):
                result = unpickle(filepath)
        if result is not None:
            output_df = pd.concat([output_df, result], ignore_index=True)
    return output_df
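unpickle and query_dmr are DMR module internals; the accumulation itself only needs pandas pickling. A self-contained sketch of the read-and-concatenate loop using a temporary directory and invented state frames (the retry branch is reduced to a comment):

import tempfile
from pathlib import Path
import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp)
    # pretend two state queries were already pickled
    for state in ['AL', 'AK']:
        pd.DataFrame({'State': [state], 'FlowAmount': [1.0]}).to_pickle(
            path.joinpath(f'state_{state}.pickle'))

    output_df = pd.DataFrame()
    for state in ['AL', 'AK', 'AZ']:
        filepath = path.joinpath(f'state_{state}.pickle')
        if not filepath.is_file():
            continue  # the real function would retry the query here
        result = pd.read_pickle(filepath)
        output_df = pd.concat([output_df, result], ignore_index=True)
print(output_df)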
Example #7
def validate_state_totals(df, year):
    """Generate validation by state, sums across species.

    Details on results by state can be found in the search results help website
    https://echo.epa.gov/help/loading-tool/water-pollution-search/search-results-help-dmr
    """
    filepath = DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv")
    if not filepath.is_file():
        download_state_totals_validation(year)
    log.info('validating against state totals')
    reference_df = pd.read_csv(filepath)
    reference_df['FlowAmount'] = 0.0
    reference_df = unit_convert(reference_df, 'FlowAmount',
                                'Unit', 'lb', lb_kg, 'Amount')
    reference_df = reference_df[['FlowName', 'State', 'FlowAmount']]

    # to match the state totals, only compare NPD facilities, and remove some flows
    flow_exclude = pd.read_csv(DMR_DATA_PATH.joinpath('DMR_state_filter_list.csv'))
    state_flow_exclude_list = flow_exclude['POLLUTANT_DESC'].to_list()

    dmr_by_state = df[~df['FlowName'].isin(state_flow_exclude_list)]
    dmr_by_state = dmr_by_state[dmr_by_state['PermitTypeCode'] == 'NPD']

    dmr_by_state = dmr_by_state[['State', 'FlowAmount']]
    dmr_by_state = dmr_by_state[['State', 'FlowAmount']
                                ].groupby('State').sum().reset_index()
    dmr_by_state['FlowName'] = 'All'
    validation_df = validate_inventory(dmr_by_state, reference_df,
                                       group_by="state")
    write_validation_result('DMR', year, validation_df)
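After the permit-type and flow filters, the state comparison reduces to a groupby-sum over a two-column slice; a toy, self-contained version of that aggregation step:

import pandas as pd

dmr = pd.DataFrame({'State': ['OH', 'OH', 'PA'],
                    'FlowAmount': [1.0, 2.0, 5.0],
                    'PermitTypeCode': ['NPD', 'NPD', 'GPC']})
dmr_by_state = dmr[dmr['PermitTypeCode'] == 'NPD']
dmr_by_state = dmr_by_state[['State', 'FlowAmount']].groupby('State').sum().reset_index()
dmr_by_state['FlowName'] = 'All'
print(dmr_by_state)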
Example #8
def get_fm_file(file_name, download_if_missing=False):
    """Read facilitymatcher file, if not present, generate it.
    :param file_name: str, can be 'FacilityMatchList_forStEWI' or
        'FRS_NAICSforStEWI'
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    """
    file_meta = set_facilitymatcher_meta(file_name, category='')
    df = load_preprocessed_output(file_meta, paths)
    if df is None:
        log.info(f'{file_name} not found in {output_dir}, '
                 'writing facility matches to file')
        if download_if_missing:
            download_from_remote(file_meta, paths)
        elif file_name == 'FacilityMatchList_forStEWI':
            write_fm.write_facility_matches()
        elif file_name == 'FRS_NAICSforStEWI':
            write_naics.write_NAICS_matches()
        df = load_preprocessed_output(file_meta, paths)
    col_dict = {"FRS_ID": "str",
                "FacilityID": "str",
                "NAICS": "str"}
    for k, v in col_dict.items():
        if k in df:
            df[k] = df[k].astype(v)
    return df
Example #9
def extract_flows_for_chemical_matcher():
    """Compile unique flows (FlowName, FlowID, Source) from all StEWI flow lists."""
    log.info('generating chemical matches from local flow lists')
    # First loop through flows lists to create a list of all unique flows
    source_dict = stewi.getAvailableInventoriesandYears(stewiformat='flow')
    all_list_names = pd.DataFrame(columns=["FlowName", "FlowID"])
    for source in source_dict.keys():
        list_names_years = pd.DataFrame()
        for year in source_dict[source]:
            list_names = pd.DataFrame()
            list_names = stewi.getInventoryFlows(source, year)
            list_names = list_names[flowlist_cols[source]]
            list_names = list_names.drop_duplicates()
            list_names_years = pd.concat([list_names_years, list_names],
                                         sort=False)
        if source == 'TRI':
            list_names_years['FlowID'] = list_names_years['FlowID'].apply(
                lambda x: x.lstrip('0'))
            list_names_years['FlowID'] = list_names_years['FlowID'].apply(
                lambda x: x.replace('-', ''))
        list_names_years = list_names_years.drop_duplicates()
        list_names_years['Source'] = source
        all_list_names = pd.concat([all_list_names, list_names_years],
                                   sort=False)

    # Drop duplicates from lists with same names
    all_list_names.drop_duplicates(inplace=True)

    # Reset index after removing flows
    all_list_names.reset_index(inplace=True, drop=True)

    return all_list_names
Example #10
def check_for_file(filepath: Path, state) -> bool:
    if filepath.is_file():
        log.debug(f'file already exists for {state}, skipping')
        return True
    else:
        log.info(f'executing query for {state}')
        return False
Example #11
def generate_national_totals(year):
    """Download and process eGRID national totals for validation.

    Resulting file is stored in repository
    """
    log.info(f'Processing eGRID national totals for validation of {year}')
    totals_dict = {'USHTIANT': 'Heat',
                   'USNGENAN': 'Electricity',
                   #'USETHRMO':'Steam', #PLNTYR sheet
                   'USNOXAN': 'Nitrogen oxides',
                   'USSO2AN': 'Sulfur dioxide',
                   'USCO2AN': 'Carbon dioxide',
                   'USCH4AN': 'Methane',
                   'USN2OAN': 'Nitrous oxide',
                   }

    us_totals = extract_eGRID_excel(year, 'US', index='code')
    us_totals = us_totals[list(totals_dict.keys())]
    us_totals.rename(columns=totals_dict, inplace=True)
    us_totals = us_totals.transpose().reset_index()
    us_totals = us_totals.rename(columns={'index': 'FlowName',
                                          0: 'FlowAmount'})

    steam_df = extract_eGRID_excel(year, 'PLNT', index='code')
    steam_total = steam_df['USETHRMO'].sum()
    us_totals = pd.concat(
        [us_totals, pd.DataFrame([{'FlowName': 'Steam', 'FlowAmount': steam_total}])],
        ignore_index=True)

    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    usecols=['FlowName', 'Compartment'])
    us_totals = us_totals.merge(flow_compartments, how='left', on='FlowName')

    us_totals.loc[(us_totals['FlowName'] == 'Carbon dioxide') |
                  (us_totals['FlowName'] == 'Sulfur dioxide') |
                  (us_totals['FlowName'] == 'Nitrogen oxides'),
                  'Unit'] = 'tons'
    us_totals.loc[(us_totals['FlowName'] == 'Methane') |
                  (us_totals['FlowName'] == 'Nitrous oxide'),
                  'Unit'] = 'lbs'
    us_totals.loc[(us_totals['FlowName'] == 'Heat') |
                  (us_totals['FlowName'] == 'Steam'),
                  'Unit'] = 'MMBtu'
    us_totals.loc[(us_totals['FlowName'] == 'Electricity'),
                  'Unit'] = 'MWh'
    log.info(f'saving eGRID_{year}_NationalTotals.csv to {DATA_PATH}')
    us_totals = us_totals[['FlowName', 'Compartment', 'FlowAmount', 'Unit']]
    us_totals.to_csv(DATA_PATH.joinpath(f'eGRID_{year}_NationalTotals.csv'), index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'eGRID',
                       'Version': _config[year]['file_version'],
                       'Year': year,
                       'Name': 'eGRID Data Files',
                       'URL': _config[year]['download_url'],
                       'Criteria': 'Extracted from US Total tab, or for '
                       'steam, summed from PLNT tab',
                       }
    update_validationsets_sources(validation_dict)
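The chained .loc assignments that set Unit per flow could equivalently be written with a mapping dict and Series.map; a self-contained sketch of that alternative (an illustration only, not a change to the function above):

import pandas as pd

unit_map = {'Carbon dioxide': 'tons', 'Sulfur dioxide': 'tons',
            'Nitrogen oxides': 'tons', 'Methane': 'lbs', 'Nitrous oxide': 'lbs',
            'Heat': 'MMBtu', 'Steam': 'MMBtu', 'Electricity': 'MWh'}
us_totals = pd.DataFrame({'FlowName': list(unit_map),
                          'FlowAmount': [0.0] * len(unit_map)})
us_totals['Unit'] = us_totals['FlowName'].map(unit_map)
print(us_totals)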
Example #12
def storeCombinedInventory(df, file_name, category=''):
    """Store the inventory dataframe to local directory based on category."""
    meta = set_stewicombo_meta(file_name, category)
    method_path = output_dir + '/' + meta.category
    try:
        log.info(f'saving {meta.name_data} to {method_path}')
        write_df_to_file(df, paths, meta)
    except Exception:
        log.error('Failed to save inventory')
Example #13
def read_FRS_file(file_name, col_dict):
    """Retrieve FRS data file stored locally."""
    file_meta = set_facilitymatcher_meta(file_name, category=ext_folder)
    log.info('loading %s from %s', file_meta.name_data, FRSpath)
    file_meta.name_data = strip_file_extension(file_meta.name_data)
    file_meta.ext = 'csv'
    df = load_preprocessed_output(file_meta, paths)
    df_FRS = pd.DataFrame()
    for k, v in col_dict.items():
        df_FRS[k] = df[k].astype(v)
    return df_FRS
Example #14
def download_data(url_params, filepath: Path, sic_list) -> str:
    df = pd.DataFrame()
    if sic_list:
        skip_errors = True
    else:
        skip_errors = False
        sic_list = ['']
    for sic in sic_list:
        url_params['p_sic2'] = sic
        counter = 1
        pages = 1
        while counter <= pages:
            url_params['pageno'] = counter
            url = generate_url(url_params)
            log.debug(url)
            for attempt in range(3):
                try:
                    r = requests.get(url)
                    r.raise_for_status()
                    result = pd.DataFrame(r.json())
                    break
                except requests.exceptions.HTTPError as err:
                    log.info(err)
                    time.sleep(20)
                    pass
            else:
                log.warning("exceeded max attempts")
                return 'other_error'
            if 'Error' in result.index:
                if skip_errors:
                    log.debug(f"error in sic_{sic}")
                    break
                elif result['Results'].astype(str).str.contains('Maximum').any():
                    return 'max_error'
                else:
                    return 'other_error'
            elif 'NoDataMsg' in result.index:
                if skip_errors:
                    log.debug(f"no data in sic_{sic}")
                    break
                else:
                    return 'no_data'
            else:
                df = pd.concat([df, pd.DataFrame(result['Results']['Results'])],
                               ignore_index=True)
                # set page count
                pages = int(result['Results']['PageCount'])
                counter += 1
    log.debug(f"saving to {filepath}")
    pd.to_pickle(df, filepath)
    return 'success'
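The download loop above relies on Python's for/else: the else branch fires only when all attempts finish without a break, which is what flags the exceeded-max-attempts case. A stripped-down, self-contained version of just that retry control flow (function name and defaults are invented):

import time
import requests

def fetch_json(url, attempts=3, wait=5):
    """Fetch JSON with simple retries; return None if every attempt fails."""
    for attempt in range(attempts):
        try:
            r = requests.get(url)
            r.raise_for_status()
            result = r.json()
            break                      # success: skip the for-else below
        except requests.exceptions.HTTPError as err:
            print(err)
            time.sleep(wait)
    else:                              # only runs if the loop never hit break
        return None
    return result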
Example #15
def getCombinedInventory(name, category=''):
    """Read the inventory dataframe from local directory.

    :param name: str, name of dataset or name of file
    """
    if ("." + WRITE_FORMAT) in name:
        method_path = output_dir + '/' + category
        inventory = read_into_df(method_path + name)
    else:
        meta = set_stewicombo_meta(name, category)
        method_path = output_dir + '/' + meta.category
        inventory = load_preprocessed_output(meta, paths)
    if inventory is None:
        log.info(f'{name} not found in {method_path}')
    else:
        log.info(f'loaded {name} from {method_path}')
    return inventory
Example #16
def store_fm_file(df, file_name, category='', sources=None):
    """Store the facilitymatcher file to local directory."""
    meta = set_facilitymatcher_meta(file_name, category)
    method_path = output_dir + '/' + meta.category
    try:
        log.info(f'saving {meta.name_data} to {method_path}')
        write_df_to_file(df, paths, meta)
        metadata_dict = {}
        if not sources:
            sources = []
        for source in sources:
            metadata_dict[source] = read_source_metadata(paths,
                set_facilitymatcher_meta(strip_file_extension(source),
                                         ext_folder),
                force_JSON=True)['tool_meta']
        write_fm_metadata(file_name, metadata_dict)
    except Exception:
        log.error('Failed to save inventory')
Example #17
def validate_national_totals(nei_flowbyfacility, year):
    """Validate against national flow totals."""
    log.info('validating flow by facility against national totals')
    if not DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv').is_file():
        generate_national_totals(year)
    else:
        log.info('using already processed national totals validation file')
    nei_national_totals = pd.read_csv(
        DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv'),
        header=0,
        dtype={"FlowAmount[kg]": float})
    nei_national_totals.rename(columns={'FlowAmount[kg]': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(nei_flowbyfacility,
                                           nei_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('NEI', year, validation_result)
Example #18
def validate_national_totals_by_subpart(tab_df, year):
    log.info('validating flowbyfacility against national totals')
    # apply CO2e factors for some flows
    mask = (tab_df['AmountCO2e'].isna() & tab_df['FlowID'].isin(flows_CO2e))
    tab_df.loc[mask,
               'Flow Description'] = 'Fluorinated GHG Emissions (mt CO2e)'
    subpart_L_GWPs = load_subpart_l_gwp()
    subpart_L_GWPs.rename(columns={'Flow Name': 'FlowName'}, inplace=True)
    tab_df = tab_df.merge(subpart_L_GWPs,
                          how='left',
                          on=['FlowName', 'Flow Description'])
    tab_df['CO2e_factor'] = tab_df['CO2e_factor'].fillna(1)
    tab_df.loc[mask,
               'AmountCO2e'] = tab_df['FlowAmount'] * tab_df['CO2e_factor']

    # for subset of flows, use CO2e for validation
    mask = tab_df['FlowID'].isin(flows_CO2e)
    tab_df.loc[mask, 'FlowAmount'] = tab_df['AmountCO2e']

    # parse tabulated data
    tab_df.drop(columns=['FacilityID', 'DataReliability', 'FlowName'],
                inplace=True)
    tab_df.rename(columns={'Process': 'SubpartName',
                           'FlowID': 'FlowName'},
                  inplace=True)

    # import and parse reference data
    ref_df = pd.read_csv(
        DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv'))
    ref_df.drop(columns=['FlowName'], inplace=True)
    ref_df.rename(columns={'SUBPART_NAME': 'SubpartName',
                           'FlowCode': 'FlowName'},
                  inplace=True)

    validation_result = validate_inventory(tab_df, ref_df, group_by='subpart')
    # Update flow names to indicate which are in CO2e
    validation_result.loc[
        validation_result['FlowName'].isin(flows_CO2e),
        'FlowName'] = validation_result['FlowName'] + ' (CO2e)'
    write_validation_result('GHGRP', year, validation_result)
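The CO2e adjustment is a merge-then-fill pattern: join per-flow GWP factors, default the factor to 1 where no match exists, and overwrite amounts only on the masked rows. A toy, self-contained version (the factor value is a placeholder, not a GHGRP subpart L GWP):

import pandas as pd

tab = pd.DataFrame({'FlowName': ['Fluorinated GHG', 'Carbon dioxide'],
                    'FlowAmount': [2.0, 100.0],
                    'AmountCO2e': [None, 100.0]})
gwps = pd.DataFrame({'FlowName': ['Fluorinated GHG'], 'CO2e_factor': [140.0]})

mask = tab['AmountCO2e'].isna()
tab = tab.merge(gwps, how='left', on='FlowName')
tab['CO2e_factor'] = tab['CO2e_factor'].fillna(1)
tab.loc[mask, 'AmountCO2e'] = tab['FlowAmount'] * tab['CO2e_factor']
print(tab)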
Example #19
def validate_state_totals(report_year, flowbyfacility):
    log.info('validating data against state totals')
    file_path = DATA_PATH.joinpath(f'RCRAInfo_{report_year}_StateTotals.csv')
    if file_path.is_file():
        totals = pd.read_csv(file_path, dtype={"FlowAmount_kg": float})
        # Rename cols to match reference format
        totals.rename(columns={'FlowAmount_kg': 'FlowAmount'}, inplace=True)
        # Validate waste generated against state totals, include only NBR data
        flowbyfacility['State'] = flowbyfacility['FacilityID'].str[0:2]
        flowbyfacility = apply_filters_to_inventory(
            flowbyfacility, 'RCRAInfo', report_year,
            ['National_Biennial_Report', 'imported_wastes', 'US_States_only'])
        validation_df = validate_inventory(flowbyfacility,
                                           totals,
                                           group_by='state')
        write_validation_result('RCRAInfo', report_year, validation_df)
    else:
        log.warning(
            f'validation file for RCRAInfo_{report_year} does not exist.')
Example #20
def apply_filters_to_inventory(inventory, inventory_acronym, year, filters,
                               download_if_missing=False):
    """Apply one or more filters from a passed list to an inventory dataframe.

    :param inventory: df of stewi inventory of type flowbyfacility or flowbyprocess
    :param inventory_acronym: str of inventory e.g. 'NEI'
    :param year: year as number like 2010
    :param filters: a list of named filters to apply to inventory
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: DataFrame of filtered inventory
    """
    if 'filter_for_LCI' in filters:
        for name in filter_config['filter_for_LCI']['filters']:
            if name not in filters:
                filters.append(name)
    compare_to_available_filters(filters)

    if 'US_States_only' in filters:
        log.info('filtering for US states')
        inventory = filter_states(inventory, inventory_acronym=inventory_acronym,
                                  year=year,
                                  download_if_missing=download_if_missing)

    if inventory_acronym == 'DMR' and 'remove_duplicate_organic_enrichment' in filters:
        from stewi.DMR import remove_duplicate_organic_enrichment
        inventory = remove_duplicate_organic_enrichment(inventory)

    if inventory_acronym == 'RCRAInfo' and 'National_Biennial_Report' in filters:
        log.info('filtering for National Biennial Report')
        fac_list = read_inventory('RCRAInfo', year, StewiFormat.FACILITY,
                                  download_if_missing)
        fac_list = fac_list[['FacilityID',
                             'Generator ID Included in NBR']
                            ].drop_duplicates(ignore_index=True)
        inventory = inventory.merge(fac_list, how='left')
        inventory = inventory[inventory['Generator ID Included in NBR'] == 'Y']
        inventory = inventory[inventory['Source Code'] != 'G61']
        inventory = inventory[inventory['Generator Waste Stream Included in NBR'] == 'Y']

    if inventory_acronym == 'RCRAInfo' and 'imported_wastes' in filters:
        log.info('removing imported wastes')
        imp_source_codes = filter_config['imported_wastes']['parameters']['source_codes']
        inventory = inventory[~inventory['Source Code'].isin(imp_source_codes)]

    if 'flows_for_LCI' in filters:
        flow_filter_list = filter_config['flows_for_LCI']['parameters'].get(inventory_acronym)
        if flow_filter_list is not None:
            log.info('removing flows not relevant for LCI')
            inventory = inventory[~inventory['FlowName'].isin(flow_filter_list)]

    return inventory
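filter_config is imported at module level (presumably from a packaged config file); the expansion of the composite 'filter_for_LCI' entry at the top of the function only needs a nested dict. A self-contained sketch of that expansion step, with an invented stand-in config whose member filters mirror names used above:

# invented stand-in for the packaged filter_config
filter_config = {
    'filter_for_LCI': {'filters': ['US_States_only', 'flows_for_LCI']},
}

filters = ['filter_for_LCI']
if 'filter_for_LCI' in filters:
    for name in filter_config['filter_for_LCI']['filters']:
        if name not in filters:
            filters.append(name)
print(filters)  # ['filter_for_LCI', 'US_States_only', 'flows_for_LCI']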
Example #21
def get_SRSInfo_for_program_list(inventory):
    """Query SRS for each program list acronym of an inventory and return flow info."""
    # See all lists
    # https://cdxnodengn.epa.gov/cdx-srs-rest/reference/substance_lists
    # Base URL for queries
    substancesbylistname = 'substances/list_acronym/'
    srs_flow_df = pd.DataFrame()
    for listname in inventory_to_SRSlist_acronymns[inventory]:
        log.debug('Getting %s', listname)
        lists_of_interest = obtain_list_names(listname)
        url = base + substancesbylistname + urllib.parse.quote(listname)
        flow_info = query_SRS_for_program_list(url, inventory,
                                               lists_of_interest)
        if len(flow_info) == 0:
            log.info(f'No flows found for {listname}')
        srs_flow_df = pd.concat([srs_flow_df, flow_info])
    srs_flow_df.drop_duplicates(inplace=True)
    if (inventory == 'TRI'):
        srs_flow_df['PGM_ID'] = srs_flow_df['PGM_ID'].apply(
            lambda x: str(x).lstrip('0'))
    srs_flow_df.sort_values(by='PGM_ID', inplace=True)
    return srs_flow_df
Example #22
def extract_TRI_data_files(link_zip, files, year):
    """Download the TRI zip file and extract the requested data files to csv."""
    r_file = requests.get(link_zip)
    for file in files:
        df_columns = pd.read_csv(
            TRI_DATA_PATH.joinpath(f'TRI_File_{file}_columns.txt'), header=0)
        columns = list(df_columns['Names'])
        filename = f'US_{file}_{year}'
        dic = {}
        i = 0
        with zipfile.ZipFile(io.BytesIO(r_file.content)) as z:
            with io.TextIOWrapper(z.open(filename + '.txt', mode='r'),
                                  errors='replace') as txtfile:
                for line in txtfile:
                    dic[i] = pd.Series(re.split(
                        "\t", line)).truncate(after=len(columns) - 1)
                    i += 1
        # remove the first row in the dictionary which is the original headers
        del dic[0]
        df = pd.DataFrame.from_dict(dic, orient='index')
        df.columns = columns
        OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        df.to_csv(OUTPUT_PATH.joinpath(f'{filename}.csv'), index=False)
        log.info(f'{filename}.csv saved to {OUTPUT_PATH}')
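Building the frame line by line from a dict of Series mirrors the original code; the same tab-delimited text could also be handed to pandas directly. A hedged, self-contained sketch of that alternative on an in-memory string (column names and rows invented), kept separate from the TRI download itself:

import io
import pandas as pd

columns = ['FacilityID', 'FlowName', 'FlowAmount']   # stand-in column list
raw = ('HEADER_1\tHEADER_2\tHEADER_3\n'
       'F1\tToluene\t12.5\n'
       'F2\tBenzene\t3.0\n')

# skiprows=1 drops the original header row, mirroring `del dic[0]` above
df = pd.read_csv(io.StringIO(raw), sep='\t', header=None, skiprows=1,
                 names=columns, usecols=range(len(columns)), dtype=str)
print(df)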
Example #23
def download_eGRID(year):
    """Download eGRID files from EPA website."""
    log.info(f'downloading eGRID data for {year}')

    download_url = _config[year]['download_url']
    egrid_file_name = _config[year]['file_name']

    r = make_url_request(download_url)

    # extract .xlsx workbook
    if year == '2016' or year == '2014':
        z = zipfile.ZipFile(io.BytesIO(r.content))
        workbook = z.read(egrid_file_name)
    else:
        workbook = r.content

    # save .xlsx workbook to destination directory
    destination = OUTPUT_PATH.joinpath(egrid_file_name)
    # if destination folder does not already exist, create it
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
    with open(destination, 'wb') as output:
        output.write(workbook)
    log.info(f'{egrid_file_name} saved to {OUTPUT_PATH}')
Example #24
def standardize_output(year, source='Point'):
    """Read and parses NEI data.

    :param year : str, Year of NEI dataset
    :returns nei: DataFrame of parsed NEI data.
    """
    nei = pd.DataFrame()
    # read in nei files and concatenate all nei files into one dataframe
    nei_file_path = _config[year]['file_name']
    for file in nei_file_path:
        filename = OUTPUT_PATH.joinpath(file)
        if not filename.is_file():
            log.info(f'{file} not found in {OUTPUT_PATH}, '
                     'downloading source data')
            # download source file and metadata
            file_meta = set_stewi_meta(strip_file_extension(file))
            file_meta.category = EXT_DIR
            file_meta.tool = file_meta.tool.lower()
            download_from_remote(file_meta, paths)
        # concatenate all other files
        log.info(f'reading NEI data from {filename}')
        nei = pd.concat([nei, read_data(year, filename)])
        log.debug(f'{str(len(nei))} records')
    # convert TON to KG
    nei['FlowAmount'] = nei['FlowAmount'] * USton_kg

    log.info('adding Data Quality information')
    if source == 'Point':
        nei_reliability_table = get_reliability_table_for_source('NEI')
        nei_reliability_table['Code'] = nei_reliability_table['Code'].astype(
            float)
        nei['ReliabilityScore'] = nei['ReliabilityScore'].astype(float)
        nei = nei.merge(nei_reliability_table,
                        left_on='ReliabilityScore',
                        right_on='Code',
                        how='left')
        nei['DataReliability'] = nei['DQI Reliability Score']
        # drop Code and DQI Reliability Score columns
        nei = nei.drop(
            columns=['Code', 'DQI Reliability Score', 'ReliabilityScore'])

        nei['Compartment'] = 'air'
        """
        # Modify compartment based on stack height (ft)
        nei.loc[nei['StackHeight'] < 32, 'Compartment'] = 'air/ground'
        nei.loc[(nei['StackHeight'] >= 32) & (nei['StackHeight'] < 164),
                'Compartment'] = 'air/low'
        nei.loc[(nei['StackHeight'] >= 164) & (nei['StackHeight'] < 492),
                'Compartment'] = 'air/high'
        nei.loc[nei['StackHeight'] >= 492, 'Compartment'] = 'air/very high'
        """
    else:
        nei['DataReliability'] = 3
    # add Source column
    nei['Source'] = source
    nei.reset_index(drop=True, inplace=True)
    return nei
Example #25
def download_state_totals_validation(year):
    """Generate file of state totals downloaded from echo as csv for validation.

    Annual totals are stored in the repository.
    """
    log.info('generating state totals')
    # https://echo.epa.gov/trends/loading-tool/get-data/state-statistics
    url = _config['state_url'].replace("__year__", year)
    state_csv = pd.read_csv(url, header=2)
    state_totals = pd.DataFrame()
    state_totals['state_name'] = state_csv['State']
    state_totals['FlowName'] = 'All'
    state_totals['Compartment'] = 'water'
    state_totals['Amount'] = state_csv['Total Pollutant Pounds (lb/yr) for Majors'] +\
        state_csv['Total Pollutant Pounds (lb/yr) for Non-Majors']
    state_totals['Unit'] = 'lb'
    state_names = states_df[['states', 'state_name']]
    state_totals = state_totals.merge(state_names, how='left',
                                      on='state_name')
    state_totals.drop(columns=['state_name'], inplace=True)
    state_totals.dropna(subset=['states'], inplace=True)
    state_totals.rename(columns={'states': 'State'}, inplace=True)
    log.info(f'saving DMR_{year}_StateTotals.csv to {DATA_PATH}')
    state_totals.to_csv(DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv"),
                        index=False)

    # Update validationSets_Sources.csv
    validation_dict = {'Inventory': 'DMR',
                       #'Version': '',
                       'Year': year,
                       'Name': 'State statistics',
                       'URL': 'https://echo.epa.gov/trends/loading-tool/'
                       'get-data/state-statistics',
                       'Criteria': 'Check totals by state',
                       }
    update_validationsets_sources(validation_dict)
Example #26
def organize_br_reporting_files_by_year(tables, year):
    """Consolidate BR_REPORTING files to single csv."""
    year = int(year)
    for table in tables:
        if 'BR_REPORTING' in table:
            log.info(f'organizing data for {table} from {str(year)}...')
            linewidthsdf = pd.read_csv(
                RCRA_DATA_PATH.joinpath('RCRA_FlatFile_LineComponents.csv'))
            fields = linewidthsdf['Data Element Name'].tolist()
            files = sorted([
                file for file in OUTPUT_PATH.glob(f'{table}*{str(year)}*.csv')
            ])
            df_full = pd.DataFrame()
            for filepath in files:
                log.info(f'extracting {filepath}')
                df = pd.read_csv(filepath,
                                 header=0,
                                 usecols=list(range(0, len(fields))),
                                 names=fields,
                                 low_memory=False,
                                 encoding='utf-8')
                df = df[df['Report Cycle'].apply(
                    lambda x: str(x).replace('.0', '').isdigit())]
                if df['Location Street Number'].dtype != 'str':
                    df['Location Street Number'] = df[
                        'Location Street Number'].astype(str)
                    df['Location Street Number'] = df[
                        'Location Street Number'].apply(
                            lambda x: str(x).replace('.0', ''))
                df['Report Cycle'] = df['Report Cycle'].astype(int)
                df = df[df['Report Cycle'] == year]
                df_full = pd.concat([df_full, df])
            DIR_RCRA_BY_YEAR.mkdir(exist_ok=True)
            filepath = DIR_RCRA_BY_YEAR.joinpath(
                f'br_reporting_{str(year)}.csv')
            log.info(f'saving to {filepath}...')
            df_full.to_csv(filepath, index=False)
            generate_metadata(year, files, datatype='source')
        else:
            log.info(f'skipping {table}')
Example #27
def download_extract_FRS_combined_national(file=None):
    """Download and extract file from source to local directory."""
    url = FRS_config['url']
    log.info('initiating url request from %s', url)
    request = requests.get(url).content
    zip_file = zipfile.ZipFile(io.BytesIO(request))
    source_dict = dict(source_metadata)
    source_dict['SourceType'] = 'Zip file'
    source_dict['SourceURL'] = url
    if file is None:
        log.info(f'extracting all FRS files from {url}')
        name = 'FRS_Files'
        zip_file.extractall(FRSpath)
    else:
        log.info('extracting %s from %s', file, url)
        zip_file.extract(file, path=FRSpath)
        source_dict['SourceFileName'] = file
        name = strip_file_extension(file)
    source_dict['SourceAcquisitionTime'] = datetime.now().strftime('%d-%b-%Y')
    write_fm_metadata(name, source_dict, category=ext_folder)
Example #28
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download DMR files from web\
                        [B] Generate StEWI inventory outputs and\
                            validate to state totals\
                        [C] Download state totals',
                        type=str)

    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What DMR year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:

        if kwargs['Option'] == 'A':
            log.info(f"Querying for {year}")

            # two digit SIC codes from advanced search drop down stripped and formatted as a list
            sic2 = list(pd.read_csv(DMR_DATA_PATH.joinpath('2_digit_SIC.csv'),
                        dtype={'SIC2': str})['SIC2'])
            # Query by state, then by SIC-state where necessary
            result_dict = query_dmr(year=year)
            log.debug('possible errors: ' + ', '.join(
                [s for s in result_dict.keys()
                 if result_dict[s] != 'success']))
            state_max_error_list = [s for s in result_dict.keys()
                                    if result_dict[s] == 'max_error']
            state_no_data_list = [s for s in result_dict.keys()
                                  if result_dict[s] == 'no_data']
            if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                log.info('all states successfully downloaded')
            else:
                if (len(state_max_error_list) > 0):
                    log.error(f"Max error: {' '.join(state_max_error_list)}")
                if (len(state_no_data_list) > 0):
                    log.error(f"No data error: {' '.join(state_no_data_list)}")
                log.info('Breaking up queries further by SIC')
                result_dict = query_dmr(year=year, sic_list=sic2,
                                        state_list=state_max_error_list)
                sic_state_max_error_list = [s for s in result_dict.keys()
                                            if result_dict[s] == 'max_error']
                if len(sic_state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(sic_state_max_error_list)}")

            log.info(f"Querying nutrients for {year}")
            # Query aggregated nutrients data
            for nutrient in ['N', 'P']:
                result_dict = query_dmr(year=year, nutrient=nutrient)
                log.debug('possible errors: ' + ', '.join(
                    [s for s in result_dict.keys()
                     if result_dict[s] != 'success']))
                state_max_error_list = [s for s in result_dict.keys()
                                        if result_dict[s] == 'max_error']
                state_no_data_list = [s for s in result_dict.keys()
                                      if result_dict[s] == 'no_data']
                if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                    log.info(f'all states successfully downloaded for {nutrient}')
                else:
                    result_dict = query_dmr(year=year, sic_list=sic2,
                                            state_list=state_max_error_list,
                                            nutrient=nutrient)
            # write metadata
            generate_metadata(year, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'generating inventories for DMR {year}')
            state_df = combine_DMR_inventory(year)
            state_df = filter_states(standardize_df(state_df))

            # Validation against state totals is done prior to combining
            # with aggregated nutrients
            validate_state_totals(state_df, year)

            P_df = combine_DMR_inventory(year, nutrient='P')
            N_df = combine_DMR_inventory(year, nutrient='N')

            nut_drop_list = read_pollutant_parameter_list()
            nut_drop_list = nut_drop_list[(nut_drop_list['NITROGEN'] == 'Y') |
                                          (nut_drop_list['PHOSPHORUS'] == 'Y')]
            nut_drop_list = list(set(nut_drop_list['FlowName']))

            # Consolidate N and P based flows to reflect nutrient aggregation
            P_df = consolidate_nutrients(P_df, nut_drop_list, 'P')
            N_df = consolidate_nutrients(N_df, nut_drop_list, 'N')

            nutrient_agg_df = pd.concat([P_df, N_df])
            nutrient_agg_df = filter_states(standardize_df(nutrient_agg_df))

            # Filter out nitrogen and phosphorus flows before combining
            # with aggregated nutrients
            dmr_nut_filtered = state_df[~state_df['FlowName'].isin(nut_drop_list)]
            dmr_df = pd.concat([dmr_nut_filtered,
                                nutrient_agg_df]).reset_index(drop=True)

            # PermitTypeCode needed for state validation but not maintained
            dmr_df = dmr_df.drop(columns=['PermitTypeCode'])

            # generate output for facility
            facility_columns = ['FacilityID', 'FacilityName', 'City',
                                'State', 'Zip', 'Latitude', 'Longitude',
                                'County', 'NAICS', 'SIC'] # 'Address' not in DMR
            dmr_facility = dmr_df[facility_columns].drop_duplicates()
            store_inventory(dmr_facility, 'DMR_' + year, 'facility')

            # generate output for flow
            flow_columns = ['FlowID', 'FlowName']
            dmr_flow = dmr_df[flow_columns].drop_duplicates()
            dmr_flow.sort_values(by=['FlowName'], inplace=True)
            dmr_flow['Compartment'] = 'water'
            dmr_flow['Unit'] = 'kg'
            store_inventory(dmr_flow, 'DMR_' + year, 'flow')

            # generate output for flowbyfacility
            fbf_columns = ['FlowName', 'FlowAmount', 'FacilityID',
                           'DataReliability']
            dmr_fbf = dmr_df[fbf_columns].reset_index(drop=True)
            dmr_fbf = aggregate(dmr_fbf, ['FacilityID', 'FlowName'])
            dmr_fbf['Compartment'] = 'water'
            dmr_fbf['Unit'] = 'kg'
            store_inventory(dmr_fbf, 'DMR_' + year, 'flowbyfacility')

            # write metadata
            generate_metadata(year, datatype='inventory')

        if kwargs['Option'] == 'C':
            download_state_totals_validation(year)
Example #29
def generate_national_totals(year):
    """Generate dataframe of national emissions and save to csv.

    Requires the chem_release dataset to be downloaded manually prior to running
    """
    filename = TRI_DATA_PATH.joinpath(f'TRI_chem_release_{year}.csv')
    df = pd.read_csv(filename, header=0)
    df.replace(',', 0.0, inplace=True)
    df.replace('.', 0.0, inplace=True)
    cols = ['Compartment', 'FlowName', 'Unit', 'FlowAmount']
    compartments = {
        'air': ['Fugitive Air Emissions', 'Point Source Air Emissions'],
        'water': ['Surface Water Discharges'],
        'soil': [
            'On-site Land Treatment', 'Other On-site Land Disposal',
            'Off-site Land Treatment', 'Other Off-site Land Disposal'
        ]
    }
    # remove entries where all values are 0
    v = [col for col in df.columns if col != 'Chemical']
    df = df.loc[~(df[v] == 0).all(axis=1)]
    df_National = pd.DataFrame()
    for compartment, columns in compartments.items():
        df_aux = df[['Chemical'] + columns].reset_index(drop=True)
        for column in columns:
            df_aux[column] = df_aux[column].str.replace(',',
                                                        '').astype('float')
        df_aux['FlowAmount'] = df_aux[columns].sum(axis=1)
        df_aux.rename(columns={'Chemical': 'FlowName'}, inplace=True)
        df_aux['Unit'] = 'Pounds'
        df_aux['Compartment'] = compartment
        df_National = pd.concat([df_National, df_aux],
                                axis=0,
                                ignore_index=True,
                                sort=True)
        del df_aux
    del df
    df_National['FlowAmount'] = df_National['FlowAmount'].round(3)
    df_National = df_National[cols]
    df_National = map_to_fedefl(df_National)
    if df_National is None:
        log.warning('Totals not generated')
        return
    df_National.sort_values(by=['FlowName', 'Compartment'], inplace=True)
    log.info(f'saving TRI_{year}_NationalTotals.csv to {DATA_PATH}')
    df_National.to_csv(DATA_PATH.joinpath(f'TRI_{year}_NationalTotals.csv'),
                       index=False)

    # Update validationSets_Sources.csv
    date_created = time.strptime(time.ctime(filename.stat().st_ctime))
    date_created = time.strftime('%d-%b-%Y', date_created)
    validation_dict = {
        'Inventory': 'TRI',
        #'Version': '',
        'Year': year,
        'Name': 'TRI Explorer',
        'URL': 'https://enviro.epa.gov/triexplorer/tri_release.chemical',
        'Criteria': 'Year, All of United States, All Chemicals, '
                    'All Industries, Details:(Other On-Site Disposal or '
                    'Other Releases, Other Off-Site Disposal or Other Releases), '
                    'mapped to FEDEFL',
        'Date Acquired': date_created,
    }
    update_validationsets_sources(validation_dict, date_acquired=True)
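The compartment loop reshapes the TRI Explorer export from one column per release type into FlowName/Compartment/FlowAmount records; a compact, self-contained illustration of that reshaping with invented numbers:

import pandas as pd

df = pd.DataFrame({'Chemical': ['Toluene'],
                   'Fugitive Air Emissions': ['1,000'],
                   'Point Source Air Emissions': ['250']})
compartments = {'air': ['Fugitive Air Emissions', 'Point Source Air Emissions']}

national = pd.DataFrame()
for compartment, columns in compartments.items():
    aux = df[['Chemical'] + columns].copy()
    for column in columns:
        aux[column] = aux[column].str.replace(',', '').astype(float)
    aux['FlowAmount'] = aux[columns].sum(axis=1)
    aux = aux.rename(columns={'Chemical': 'FlowName'})
    aux['Compartment'] = compartment
    national = pd.concat([national, aux], ignore_index=True)
print(national[['FlowName', 'Compartment', 'FlowAmount']])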
Example #30
def main(**kwargs):

    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download TRI flat files from TRI Data Plus.\
                        [B] Format national totals for TRI from downloaded \
                        national files.\
                        [C] Generate StEWI inventory files from downloaded files',
                        type=str)

    parser.add_argument('-Y',
                        '--Year',
                        nargs='+',
                        help='What TRI year you want to retrieve',
                        type=str)

    parser.add_argument('-F',
                        '--Files',
                        nargs='+',
                        help='What TRI Files you want (e.g., 1a, 2a, etc).\
                        Check:\
                        https://www.epa.gov/toxics-release-inventory-tri-program/\
                            tri-basic-plus-data-files-guides',
                        default=['1a', '3a'],
                        required=False)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    files = kwargs['Files']

    for year in kwargs['Year']:

        if kwargs['Option'] == 'A':
            log.info('downloading TRI files from source for %s', year)
            tri_url = _config['url']
            if url_is_alive(tri_url):
                link_zip_TRI = link_zip(tri_url, _config['queries'], year)
                extract_TRI_data_files(link_zip_TRI, files, year)
                generate_metadata(year, files, datatype='source')
            else:
                log.error('The URL in config.yaml ({}) for TRI is not '
                          'reachable.'.format(tri_url))

        elif kwargs['Option'] == 'B':
            # Website for National Totals
            # https://enviro.epa.gov/triexplorer/tri_release.chemical
            # Steps:
            # (1) Select Year of Data, All of United States, All Chemicals,
            # All Industry, and other needed option (this is based on the
            # desired year)
            # Columns: check 'Other On-site Disposal or Other Releases' and
            # 'Other Off-site Disposal or Other Releases'
            # (2) Export to CSV
            # (3) Drop the not needed rows, including the extra dioxin row
            # at the bottom
            # (4) Organize the columns as they are needed (check existing files)
            # (5) Save the file like TRI_chem_release_year.csv in data folder
            # (6) Run this code

            generate_national_totals(year)

        elif kwargs['Option'] == 'C':
            log.info(f'generating TRI inventory from files for {year}')
            Generate_TRI_files_csv(year, files)
            generate_metadata(year, files, datatype='inventory')