def combine_gen_emissions_data(generation_data, emissions_data, subregion=None):
    """
    Merge generation and emissions data.

    Add region designations using either eGRID or EIA-860, and primary fuel
    by plant using either eGRID or EIA-923. Calculate and merge in the total
    generation by region. Create the column "Subregion" to hold regional name
    info. Remove electricity flows. Rename flows and add UUIDs according to
    the federal flow list.

    Parameters
    ----------
    generation_data : dataframe
        Annual generation for each power plant. Contains the plant ID,
        generation amount, and year.
    emissions_data : dataframe
        Annual emissions of all flows from each facility. Probably compiled
        in the stewi module and loaded from a csv file.
    subregion : str
        MAY BE DEPRECATED. Description of the region type or single region.
        If the config parameter 'region_column_name' is not false, this
        parameter is ignored (the default value is None, which triggers the
        use of the model config value).

    Returns
    -------
    dataframe
        Combined emissions and generation data for each facility
    """
    if subregion is None:
        subregion = regional_aggregation
    emissions_data = emissions_data.drop(columns=['FacilityID'])
    generation_data["FacilityID"] = generation_data["FacilityID"].astype(int)
    emissions_data["eGRID_ID"] = emissions_data["eGRID_ID"].astype(int)
    combined_data = generation_data.merge(
        emissions_data,
        left_on=['FacilityID', 'Year'],
        right_on=['eGRID_ID', 'Year'],
        how='right')

    # # Checking the odd year to determine if emissions are from a year other
    # # than generation - need to normalize emissions data with generation
    # # from the corresponding year.
    # odd_year = None
    # for year in years_in_emissions_and_wastes_by_facility:
    #     if year != egrid_year:
    #         odd_year = year
    # # Code below not being used.
    # # Checking if any of the years are odd. If yes, we need EIA data.
    # # non_egrid_emissions_odd_year = combined_data[combined_data['Year'] == odd_year]
    # # odd_database = pd.unique(non_egrid_emissions_odd_year['Source'])

    cols_to_drop_for_final = ['FacilityID']

    # # Downloading the required EIA-923 data.
    # # Annual facility generation from the same year as the emissions data
    # # is needed to normalize total facility emissions.
    # if odd_year is not None:
    #     EIA_923_gen_data = eia_download_extract(odd_year)
    #     # Merging database with EIA 923 data
    #     combined_data = combined_data.merge(
    #         EIA_923_gen_data,
    #         left_on=['eGRID_ID'],
    #         right_on=['Plant Id'],
    #         how='left')
    #     combined_data['Year'] = combined_data['Year'].astype(str)
    #     combined_data = combined_data.sort_values(by=['Year'])
    #     # Replacing the odd year net generation with the EIA net generation.
    #     combined_data['Electricity'] = np.where(
    #         combined_data['Year'] == int(odd_year),
    #         combined_data['Net Generation (Megawatthours)'],
    #         combined_data['Electricity'])
    #     cols_to_drop_for_final = cols_to_drop_for_final + [
    #         'Plant Id', 'Plant Name', 'State', 'YEAR',
    #         'Net Generation (Megawatthours)',
    #         'Total Fuel Consumption MMBtu']

    # Drop unnecessary columns.
    emissions_gen_data = combined_data.drop(columns=cols_to_drop_for_final)
    emissions_gen_data["eGRID_ID"] = emissions_gen_data["eGRID_ID"].astype(int)
    if replace_egrid:
        year = eia_gen_year
        # This will only add BA labels, not eGRID subregions.
        fuel_region = eia_facility_fuel_region(year)
        fuel_region["FacilityID"] = fuel_region["FacilityID"].astype(int)
        final_data = pd.merge(
            fuel_region,
            emissions_gen_data,
            left_on=['FacilityID'],
            right_on=['eGRID_ID'],
            how='right')
    else:
        # Merging with the egrid_facilities file to get the subregion
        # information into the database.
        egrid_facilities_w_fuel_region["FacilityID"] = (
            egrid_facilities_w_fuel_region["FacilityID"].astype(int))
        final_data = pd.merge(
            egrid_facilities_w_fuel_region,
            emissions_gen_data,
            left_on=['FacilityID'],
            right_on=['eGRID_ID'],
            how='right')

    # Add in reference electricity for subregion and fuel category.
    if not replace_egrid:
        final_data = pd.merge(
            final_data,
            ref_egrid_subregion_generation_by_fuelcategory,
            on=['Subregion', 'FuelCategory'],
            how='left')
    if replace_egrid is True:
        # Subregion shows up all over the place below. If not using eGRID,
        # sub in the BA name because we don't have the eGRID subregion.
        if subregion:
            assert subregion in final_data.columns
            final_data['Subregion'] = final_data[subregion]
        else:
            final_data['Subregion'] = final_data['Balancing Authority Name']
        subregion_fuel_year_gen = (final_data.groupby(
            ['Subregion', 'FuelCategory', 'Year'],
            as_index=False)['Electricity'].sum())
        subregion_fuel_year_gen.rename(
            columns={'Electricity': 'Ref_Electricity_Subregion_FuelCategory'},
            inplace=True)
        final_data = pd.merge(
            final_data,
            subregion_fuel_year_gen,
            on=['Subregion', 'FuelCategory', 'Year'])

    # Drop rows with NaN electricity generation. They currently exist when
    # generation from a facility has been omitted because of some filter
    # (e.g. generation from primary fuel < 90%) but we still have emissions
    # data.
    final_data.dropna(subset=['Electricity'], inplace=True)

    if subregion:
        try:
            regions = final_data[subregion].unique()
        except KeyError:
            print(
                f"Configuration file specifies region column as {subregion}, "
                f"but it does not exist"
            )
            if subregion == 'eGRID':
                regions = egrid_subregions
            elif subregion == 'NERC':
                regions = list(pd.unique(final_data['NERC']))
            elif subregion == 'BA':
                regions = list(
                    pd.unique(final_data['Balancing Authority Name']))
            else:
                regions = [subregion]
    elif subregion == 'eGRID':
        regions = egrid_subregions
    elif subregion == 'NERC':
        regions = list(pd.unique(final_data['NERC']))
    elif subregion == 'BA':
        regions = list(pd.unique(final_data['Balancing Authority Name']))
    else:
        regions = [subregion]

    # final_data.to_excel('Main_file.xlsx')
    final_data = final_data.drop(columns=['FacilityID'])

    # This check is in place because the same-flow-value error is still
    # present in the data.
    dup_cols_check = [
        'Subregion',
        'PrimaryFuel',
        'FuelCategory',
        'FlowName',
        'FlowAmount',
        'Compartment',
    ]
    final_data = final_data.drop_duplicates(subset=dup_cols_check)
    final_data = final_data[final_data['FlowName'] != 'Electricity']

    # Map emission flows to federal elementary flows.
    final_database = map_emissions_to_fedelemflows(final_data)
    return final_database  # , regions
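# Hedged usage sketch (not part of the original module): it illustrates the
# input schema combine_gen_emissions_data() expects, assuming the module-level
# config (regional_aggregation, replace_egrid, the eGRID/EIA reference frames)
# has already been loaded. The helper name and the toy values are invented for
# illustration only and the function is never called at import time.
def _example_combine_gen_emissions():
    import pandas as pd

    # Annual generation per plant: plant ID, generation amount, and year.
    gen = pd.DataFrame({
        "FacilityID": [3, 7],
        "Electricity": [1.2e6, 8.5e5],  # annual net generation, MWh
        "Year": [2016, 2016],
    })
    # Annual emissions per facility, one row per flow.
    emis = pd.DataFrame({
        "FacilityID": [3, 7],
        "eGRID_ID": [3, 7],
        "FlowName": ["Carbon dioxide", "Nitrogen oxides"],
        "FlowAmount": [1.1e9, 2.4e5],   # kg
        "Compartment": ["air", "air"],
        "Year": [2016, 2016],
    })
    # With subregion=None, the function falls back to the model config value.
    return combine_gen_emissions_data(gen, emis, subregion=None)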
def create_generation_process_df():
    """
    Read emissions and generation data from different sources to provide
    facility-level emissions. The most important inputs to this process come
    from the model configuration file.

    Parameters
    ----------
    None

    Returns
    -------
    dataframe
        Dataframe includes all facility-level emissions
    """
    from electricitylci.eia923_generation import (build_generation_data,
                                                  eia923_primary_fuel)
    from electricitylci.egrid_filter import (
        egrid_facilities_to_include,
        emissions_and_waste_for_selected_egrid_facilities,
    )
    from electricitylci.generation import (
        egrid_facilities_w_fuel_region,
        add_technological_correlation_score,
        add_temporal_correlation_score,
    )
    import electricitylci.emissions_other_sources as em_other
    import electricitylci.ampd_plant_emissions as ampd
    from electricitylci.combinator import ba_codes
    import electricitylci.manual_edits as edits

    COMPARTMENT_DICT = {
        "emission/air": "air",
        "emission/water": "water",
        "emission/ground": "ground",
        "input": "input",
        "output": "output",
        "waste": "waste",
        "air": "air",
        "water": "water",
        "ground": "ground",
    }
    if model_specs.replace_egrid:
        generation_data = build_generation_data().drop_duplicates()
        cems_df = ampd.generate_plant_emissions(model_specs.eia_gen_year)
        cems_df.drop(columns=["FlowUUID"], inplace=True)
        emissions_and_waste_for_selected_egrid_facilities = em_other.integrate_replace_emissions(
            cems_df, emissions_and_waste_for_selected_egrid_facilities)
    else:
        from electricitylci.egrid_filter import electricity_for_selected_egrid_facilities
        generation_data = electricity_for_selected_egrid_facilities
        generation_data["Year"] = model_specs.egrid_year
        generation_data["FacilityID"] = generation_data["FacilityID"].astype(int)
        # generation_data = build_generation_data(
        #     egrid_facilities_to_include=egrid_facilities_to_include
        # )
    emissions_and_waste_for_selected_egrid_facilities.drop(
        columns=["FacilityID"])
    emissions_and_waste_for_selected_egrid_facilities[
        "eGRID_ID"] = emissions_and_waste_for_selected_egrid_facilities[
            "eGRID_ID"].astype(int)
    final_database = pd.merge(
        left=emissions_and_waste_for_selected_egrid_facilities,
        right=generation_data,
        right_on=["FacilityID", "Year"],
        left_on=["eGRID_ID", "Year"],
        how="left",
    )
    egrid_facilities_w_fuel_region["FacilityID"] = (
        egrid_facilities_w_fuel_region["FacilityID"].astype(int))
    final_database = pd.merge(
        left=final_database,
        right=egrid_facilities_w_fuel_region,
        left_on="eGRID_ID",
        right_on="FacilityID",
        how="left",
        suffixes=["", "_right"],
    )
    if model_specs.replace_egrid:
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (
            primary_fuel_df[["eGRID_ID", "FuelCategory"]]
            .dropna()
            .drop_duplicates(subset="eGRID_ID")
            .set_index("eGRID_ID"))
        final_database["FuelCategory"] = final_database["eGRID_ID"].map(
            key_df["FuelCategory"])
    else:
        key_df = (
            final_database[["eGRID_ID", "FuelCategory"]]
            .dropna()
            .drop_duplicates(subset="eGRID_ID")
            .set_index("eGRID_ID"))
        final_database.loc[
            final_database["FuelCategory"].isnull(), "FuelCategory"
        ] = final_database.loc[
            final_database["FuelCategory"].isnull(), "eGRID_ID"
        ].map(key_df["FuelCategory"])
    # if replace_egrid:
    #     final_database["FuelCategory"].fillna(
    #         final_database["FuelCategory_right"], inplace=True
    #     )
    final_database["Final_fuel_agg"] = final_database["FuelCategory"]
    # if model_specs.use_primaryfuel_for_coal:
    #     final_database.loc[
    #         final_database["FuelCategory"] == "COAL", ["Final_fuel_agg"]
    #     ] = final_database.loc[
    #         final_database["FuelCategory"] == "COAL", "PrimaryFuel"
    #     ]
    try:
        year_filter = final_database["Year_x"] == final_database["Year_y"]
        final_database = final_database.loc[year_filter, :]
        final_database.drop(columns="Year_y", inplace=True)
    except KeyError:
        pass
    final_database.rename(columns={"Year_x": "Year"}, inplace=True)
    final_database = map_emissions_to_fedelemflows(final_database)
    dup_cols_check = [
        "FacilityID",
        "FuelCategory",
        "FlowName",
        "FlowAmount",
        "Compartment",
    ]
    final_database = final_database.loc[:, ~final_database.columns.duplicated()]
    final_database = final_database.drop_duplicates(subset=dup_cols_check)
    final_database.drop(
        columns=["FuelCategory", "FacilityID_x", "FacilityID_y"],
        inplace=True)
    final_database.rename(
        columns={
            "Final_fuel_agg": "FuelCategory",
            "TargetFlowUUID": "FlowUUID",
        },
        inplace=True,
    )
    final_database = add_temporal_correlation_score(
        final_database, model_specs.electricity_lci_target_year)
    final_database = add_technological_correlation_score(final_database)
    final_database["DataCollection"] = 5
    final_database["GeographicalCorrelation"] = 1
    final_database["eGRID_ID"] = final_database["eGRID_ID"].astype(int)
    final_database.sort_values(
        by=["eGRID_ID", "Compartment", "FlowName"], inplace=True)
    final_database["stage_code"] = "Power plant"
    final_database["Compartment_path"] = final_database["Compartment"]
    final_database["Compartment"] = final_database["Compartment_path"].map(
        COMPARTMENT_DICT)
    final_database["Balancing Authority Name"] = final_database[
        "Balancing Authority Code"].map(ba_codes["BA_Name"])
    final_database["EIA_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["EIA_Region"])
    final_database["FERC_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["FERC_Region"])
    final_database = edits.check_for_edits(
        final_database, "generation.py", "create_generation_process_df")
    return final_database
# NOTE: this legacy, eGRID-era variant reuses the name above and will shadow
# the newer create_generation_process_df() when the module is imported.
def create_generation_process_df(generation_data, emissions_data, subregion):
    emissions_data = emissions_data.drop(columns=['FacilityID'])
    combined_data = generation_data.merge(
        emissions_data,
        left_on=['FacilityID'],
        right_on=['eGRID_ID'],
        how='right')

    # Checking the odd year
    odd_year = None
    for year in years_in_emissions_and_wastes_by_facility:
        if year != egrid_year:
            odd_year = year
    # Code below not being used.
    # Checking if any of the years are odd. If yes, we need EIA data.
    # non_egrid_emissions_odd_year = combined_data[combined_data['Year'] == odd_year]
    # odd_database = pd.unique(non_egrid_emissions_odd_year['Source'])

    cols_to_drop_for_final = ['FacilityID']

    # Downloading the required EIA-923 data
    if odd_year is not None:
        EIA_923_gen_data = eia_download_extract(odd_year)
        # Merging database with EIA 923 data
        combined_data = combined_data.merge(
            EIA_923_gen_data,
            left_on=['eGRID_ID'],
            right_on=['Plant Id'],
            how='left')
        combined_data['Year'] = combined_data['Year'].astype(str)
        combined_data = combined_data.sort_values(by=['Year'])
        # Replacing the odd-year net generation with the EIA net generation.
        combined_data['Electricity'] = np.where(
            combined_data['Year'] == int(odd_year),
            combined_data['Net Generation (Megawatthours)'],
            combined_data['Electricity'])
        cols_to_drop_for_final = cols_to_drop_for_final + [
            'Plant Id', 'Plant Name', 'State', 'YEAR',
            'Net Generation (Megawatthours)',
            'Total Fuel Consumption MMBtu'
        ]

    # Dropping unnecessary columns
    emissions_gen_data = combined_data.drop(columns=cols_to_drop_for_final)

    # Merging with the egrid_facilities file to get the subregion information
    # into the database.
    final_data = pd.merge(
        egrid_facilities_w_fuel_region,
        emissions_gen_data,
        left_on=['FacilityID'],
        right_on=['eGRID_ID'],
        how='right')

    # Add in reference electricity for subregion and fuel category
    final_data = pd.merge(
        final_data,
        ref_egrid_subregion_generation_by_fuelcategory,
        on=['Subregion', 'FuelCategory'],
        how='left')

    # Store the total elci data in a csv file just for checking
    # final_data.to_excel('elci_summary.xlsx')
    if subregion == 'all':
        regions = egrid_subregions
    elif subregion == 'NERC':
        regions = list(pd.unique(final_data['NERC']))
    elif subregion == 'BA':
        regions = list(pd.unique(final_data['Balancing Authority Name']))
    else:
        regions = [subregion]

    # final_data.to_excel('Main_file.xlsx')
    final_data = final_data.drop(columns=['FacilityID'])

    # This check is in place because the same-flow-value error is still
    # present in the data.
    final_data = final_data.drop_duplicates(subset=[
        'Subregion', 'PrimaryFuel', 'FuelCategory', 'FlowName', 'FlowAmount',
        'Compartment'
    ])
    final_data = final_data[final_data['FlowName'] != 'Electricity']

    # Map emission flows to federal elementary flows
    final_database = map_emissions_to_fedelemflows(final_data)

    # Create dataframes for storing the output
    result_database = pd.DataFrame()
    total_gen_database = pd.DataFrame()

    # Loop through the different subregions to create the files
    for reg in regions:
        print("Creating generation process database for " + reg + " ...")
        # Crop based on region
        if subregion == 'all':
            database = final_database[final_database['Subregion'] == reg]
        elif subregion == 'NERC':
            database = final_database[final_database['NERC'] == reg]
        elif subregion == 'BA':
            database = final_database[
                final_database['Balancing Authority Name'] == reg]
        elif subregion == 'US':
            # For the entire US, use the full database
            database = final_database
        else:
            # This should be an eGRID subregion
            database = final_database[final_database['Subregion'] == reg]

        for index, row in fuel_name.iterrows():
            # Read complete fuel name and heat content information
            fuelname = row['FuelList']
            fuelheat = float(row['Heatcontent'])
            # Crop the database according to the current fuel being considered
            database_f1 = database[database['FuelCategory'] == fuelname]
            if database_f1.empty:
                database_f1 = database[database['PrimaryFuel'] == fuelname]
            if not database_f1.empty:
                database_f1 = database_f1.sort_values(by='Source',
                                                      ascending=False)
                exchange_list = list(pd.unique(database_f1['FlowName']))
                if use_primaryfuel_for_coal:
                    database_f1.loc[
                        database_f1['FuelCategory'] == 'COAL',
                        'FuelCategory'] = database_f1['PrimaryFuel']
                for exchange in exchange_list:
                    database_f2 = database_f1[
                        database_f1['FlowName'] == exchange]
                    database_f2 = database_f2[[
                        'Subregion', 'FuelCategory', 'PrimaryFuel',
                        'eGRID_ID', 'Electricity', 'FlowName', 'FlowAmount',
                        'FlowUUID', 'Compartment', 'Year', 'Source',
                        'ReliabilityScore', 'Unit', 'NERC',
                        'PercentGenerationfromDesignatedFuelCategory',
                        'Balancing Authority Name',
                        'ElementaryFlowPrimeContext',
                        'Balancing Authority Code',
                        'Ref_Electricity_Subregion_FuelCategory'
                    ]]
                    compartment_list = list(
                        pd.unique(database_f2['Compartment']))
                    for compartment in compartment_list:
                        database_f3 = database_f2[
                            database_f2['Compartment'] == compartment]
                        database_f3 = database_f3.drop_duplicates(subset=[
                            'Subregion', 'FuelCategory', 'PrimaryFuel',
                            'eGRID_ID', 'Electricity', 'FlowName',
                            'Compartment', 'Year', 'Unit'
                        ])
                        sources = list(pd.unique(database_f3['Source']))
                        # if len(sources) > 1:
                        #     print('Error occurred. Duplicate emissions from different sources. Writing an error file error.csv')
                        #     database_f3.to_csv(output_dir + 'error' + reg + fuelname + exchange + '.csv')

                        # Get electricity relevant for this exchange, used as
                        # the denominator in the emission factor calcs
                        electricity_source_by_facility_for_region_fuel = database_f1[
                            ['eGRID_ID', 'Electricity', 'Source']
                        ].drop_duplicates()
                        total_gen, mean, total_facility_considered = total_generation_calculator(
                            sources,
                            electricity_source_by_facility_for_region_fuel)
                        # Add data quality scores
                        database_f3 = add_flow_representativeness_data_quality_scores(
                            database_f3, total_gen)
                        # Can now drop this
                        database_f3 = database_f3.drop(
                            columns='Ref_Electricity_Subregion_FuelCategory')
                        # Add scores for regions
                        sources_str = join_with_underscore(sources)
                        exchange_total_gen = pd.DataFrame(
                            [[reg, fuelname, exchange, compartment,
                              sources_str, total_gen]],
                            columns=[
                                'Subregion', 'FuelCategory', 'FlowName',
                                'Compartment', 'Source', 'Total Generation'
                            ])
                        total_gen_database = total_gen_database.append(
                            exchange_total_gen, ignore_index=True)
                        if exchange == 'Heat' and str(fuelheat) != 'nan':
                            # Getting emission factor
                            database_f3['Emission_factor'] = compilation(
                                database_f3[['Electricity', 'FlowAmount']],
                                total_gen) / fuelheat
                            database_f3['Unit'] = 'kg'
                        else:
                            database_f3['Emission_factor'] = compilation(
                                database_f3[['Electricity', 'FlowAmount']],
                                total_gen)
                        # Data quality scores
                        database_f3['GeographicalCorrelation'] = 1
                        # If flow amount sum = 0, then do not average
                        if sum(database_f3['FlowAmount']) != 0:
                            database_f3['Reliability_Score'] = np.average(
                                database_f3['ReliabilityScore'],
                                weights=database_f3['FlowAmount'])
                            database_f3['TemporalCorrelation'] = np.average(
                                database_f3['TemporalCorrelation'],
                                weights=database_f3['FlowAmount'])
                            database_f3['TechnologicalCorrelation'] = np.average(
                                database_f3['TechnologicalCorrelation'],
                                weights=database_f3['FlowAmount'])
                            database_f3['DataCollection'] = np.average(
                                database_f3['DataCollection'],
                                weights=database_f3['FlowAmount'])
                        # Uncertainty calcs
                        uncertainty_info = uncertainty_creation(
                            database_f3[['Electricity', 'FlowAmount']],
                            exchange, fuelheat, mean, total_gen,
                            total_facility_considered)
                        database_f3['GeomMean'] = uncertainty_info['geomMean']
                        database_f3['GeomSD'] = uncertainty_info['geomSd']
                        database_f3['Maximum'] = uncertainty_info['maximum']
                        database_f3['Minimum'] = uncertainty_info['minimum']
                        database_f3['Source'] = sources_str
                        # Optionally write out electricity
                        # database_f3['Electricity'] = total_gen
                        frames = [result_database, database_f3]
                        result_database = pd.concat(frames)

    if subregion == 'all':
        result_database = result_database.drop(columns=[
            'eGRID_ID', 'FlowAmount', 'Electricity', 'ReliabilityScore',
            'PrimaryFuel', 'NERC', 'Balancing Authority Name',
            'Balancing Authority Code'
        ])
    elif subregion == 'NERC':
        result_database = result_database.drop(columns=[
            'eGRID_ID', 'FlowAmount', 'Electricity', 'ReliabilityScore',
            'PrimaryFuel', 'Balancing Authority Name',
            'Balancing Authority Code', 'Subregion'
        ])
    elif subregion == 'BA':
        result_database = result_database.drop(columns=[
            'eGRID_ID', 'FlowAmount', 'Electricity', 'ReliabilityScore',
            'PrimaryFuel', 'NERC', 'Balancing Authority Code', 'Subregion'
        ])
    elif subregion == 'US':
        result_database = result_database.drop(columns=[
            'eGRID_ID', 'FlowAmount', 'Electricity', 'ReliabilityScore',
            'PrimaryFuel', 'NERC', 'Balancing Authority Name',
            'Balancing Authority Code', 'Subregion'
        ])
    result_database = result_database.drop_duplicates()
    # Drop duplicates in the total gen database
    # total_gen_database = total_gen_database.drop_duplicates()

    print("Generation process database for " + subregion + " complete.")
    return result_database
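# Hedged sketch (not part of the original module): the legacy function above
# collapses per-facility data quality scores into one value per exchange using
# a FlowAmount-weighted average. A minimal, self-contained illustration of
# that aggregation with invented numbers; the helper name is hypothetical and
# never called.
def _example_weighted_dqi():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "FlowAmount": [100.0, 300.0],   # kg emitted by two facilities
        "ReliabilityScore": [1, 3],     # per-facility DQI scores
    })
    # Facilities that emit more carry more weight in the aggregated score,
    # mirroring the np.average(..., weights=FlowAmount) calls above.
    if df["FlowAmount"].sum() != 0:
        weighted_score = np.average(df["ReliabilityScore"],
                                    weights=df["FlowAmount"])
    else:
        weighted_score = np.nan
    return weighted_score  # 2.5 for the numbers above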