def eia_facility_fuel_region(year):
    """
    Combine EIA-923 primary fuel assignments with EIA-860 balancing
    authority/NERC information for the given data year.

    Parameters
    ----------
    year : int
        EIA data year to pull for both the 923 and 860 datasets.

    Returns
    -------
    dataframe
        One row per plant with columns renamed to FacilityID, FuelCategory,
        PercentGenerationfromDesignatedFuelCategory (as a 0-1 fraction),
        and NERC.
    """
    fuel_df = eia923_primary_fuel(year=year)
    authority_df = eia860_balancing_authority(year)
    # Normalize the join key to int on both sides before merging.
    for frame in (fuel_df, authority_df):
        frame["Plant Id"] = frame["Plant Id"].astype(int)
    merged = fuel_df.merge(authority_df, on='Plant Id')
    # Convert the percentage to a fraction for downstream use.
    merged['primary fuel percent gen'] = (
        merged['primary fuel percent gen'] / 100
    )
    column_map = {
        'primary fuel percent gen':
            'PercentGenerationfromDesignatedFuelCategory',
        'Plant Id': 'FacilityID',
        'fuel category': 'FuelCategory',
        'NERC Region': 'NERC',
    }
    merged.rename(columns=column_map, inplace=True)
    return merged
def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by created by
        create_generation_process_df
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.

    Returns
    -------
    dataframe
        The dataframe provides the emissions aggregated to the specified
        subregion for each technology and stage in the input total_db. This
        dataframe includes an average emission factor and, when applicable,
        uncertainty distributions.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        """Return (geometric mean, 0, upper 90% CI bound) for a series of
        facility emission factors, or None when the series is too short,
        non-positive at the median, or any step of the calculation fails."""
        # Alternatively we can use scipy.stats.lognorm to fit a distribution
        # and provide the parameters
        if (len(p_series) > 3) & (p_series.quantile(0.5) > 0):
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for"
                f"{df.loc[p_series.index[0],groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            # Promote numpy floating-point warnings to exceptions so each
            # stage below can bail out with a debug message instead of
            # silently propagating inf/nan.
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                n_obs = len(data)
                try:
                    # Standard error of the log-space mean.
                    sd = np.std(log_data) / np.sqrt(n_obs)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    # FIX: pass the confidence level positionally; the old
                    # `alpha=` keyword was renamed to `confidence` in SciPy
                    # 1.9 and later removed, so a positional call works on
                    # both old and new SciPy.
                    _, pi2 = t.interval(0.90, n_obs - 2, loc=mean, scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    upper_interval = np.max([
                        mean + sd2 / 2
                        + pi2 * np.sqrt(sd2 / n_obs
                                        + sd2**2 / (2 * (n_obs - 1))),
                        mean + sd2 / 2
                        - pi2 * np.sqrt(sd2 / n_obs
                                        + sd2**2 / (2 * (n_obs - 1))),
                    ])
                except Exception:
                    # FIX: was a bare `except:`, which also swallowed
                    # SystemExit/KeyboardInterrupt; `Exception` preserves
                    # the best-effort behavior without hiding those.
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Unable to calculate geometric_mean")
                    return None
                if result is not None:
                    return result
                module_logger.debug(
                    f"Problem generating uncertainty parameters \n"
                    f"{df.loc[p_series.index[0],groupby_cols].values}\n"
                    f"{p_series.values}"
                    f"{p_series.values+1}")
                return None
        else:
            return None

    def calc_geom_std(df):
        """For one aggregated row, back-calculate a lognormal geometric
        mean/standard deviation from the emission factor and the upper
        confidence bound stored in uncertaintyLognormParams. Returns a
        (geomean, geostd) pair of strings, or (None, None) when the
        parameters are unusable."""
        if region_agg is not None:
            debug_string = (
                f"{df[region_agg]}-{df['FuelCategory']}-{df['FlowName']}")
        else:
            debug_string = f"{df['FuelCategory']}-{df['FlowName']}"
        module_logger.debug(debug_string)
        if df["uncertaintyLognormParams"] is None:
            return None, None
        params = df["uncertaintyLognormParams"]
        # FIX: parse string-encoded params and validate the *parsed* value.
        # The original measured len() of the raw string, so a string-encoded
        # 3-tuple always failed the length check, and `params` was unbound
        # (NameError in the log message) whenever the value was not a string.
        if isinstance(params, str):
            params = ast.literal_eval(params)
        try:
            length = len(params)
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams"
                f"{df['uncertaintyLognormParams']}")
            return None, None
        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {length}")
            # FIX: return an explicit pair; falling off the end returned a
            # bare None, which the caller's zip(*...) unpacking cannot
            # handle.
            return None, None
        # In some cases, the final emission factor is far different than the
        # geometric mean of the individual emission factor. Depending on the
        # severity, this could be a clear sign of outliers having a large
        # impact on the final emission factor. When the uncertainty is
        # generated for these cases, the results can be nonsensical - hence
        # we skip them. A more agressive approach would be to re-assign the
        # emission factor as well.
        if df["Emission_factor"] > params[2]:
            return None, None
        # Solve 0.5*sd**2 + b*sd + c = 0 for the log-space standard
        # deviation implied by the upper bound being the 95th percentile.
        c = np.log(params[2]) - np.log(df["Emission_factor"])
        b = -2**0.5 * erfinv(2 * 0.95 - 1)
        a = 0.5
        sd1 = (-b + (b**2 - 4 * a * c)**0.5) / (2 * a)
        sd2 = (-b - (b**2 - 4 * a * c)**0.5) / (2 * a)
        # FIX: the original tests used `is not float("nan")`, an identity
        # comparison against a fresh object that is always True; np.isnan
        # performs the intended value comparison.
        if not np.isnan(sd1) and not np.isnan(sd2):
            # Both roots usable: keep the smaller standard deviation.
            if sd1 < sd2:
                geostd = np.exp(sd1)
                geomean = np.exp(
                    np.log(df["Emission_factor"]) - 0.5 * sd1**2)
            else:
                geostd = np.exp(sd2)
                geomean = np.exp(
                    np.log(df["Emission_factor"]) - 0.5 * sd2**2)
        elif not np.isnan(sd1):
            geostd = np.exp(sd1)
            geomean = np.exp(
                np.log(df["Emission_factor"]) - 0.5 * sd1**2)
        elif not np.isnan(sd2):
            geostd = np.exp(sd2)
            geomean = np.exp(
                np.log(df["Emission_factor"]) - 0.5 * sd2**2)
        else:
            return None, None
        # FIX: `geostd is np.inf` / `is np.NINF` / `is np.nan` were identity
        # checks that freshly computed floats never satisfy (and np.NINF is
        # removed in NumPy 2.0); test the value instead.
        if np.isinf(geostd) or np.isnan(geostd) or geostd == 0:
            return None, None
        return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg
            + fuel_agg
            + ["stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"])
        elec_df_groupby_cols = (
            region_agg + fuel_agg + ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    if model_specs.replace_egrid:
        # Re-key the EIA-923 primary fuel assignments by plant id and use
        # them to overwrite FuelCategory for non-Canadian ("ALL") rows.
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(
            columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (
            primary_fuel_df[["eGRID_ID", "FuelCategory"]]
            .dropna()
            .drop_duplicates(subset="eGRID_ID")
            .set_index("eGRID_ID"))
        total_db.loc[
            total_db["FuelCategory"] != "ALL", "FuelCategory"
        ] = total_db["eGRID_ID"].map(key_df["FuelCategory"])
    # Placeholder UUID so groupby does not drop rows with missing FlowUUID;
    # restored to NaN after aggregation below.
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    # Avoid zero flow amounts so weighted means and logs stay defined.
    total_db["FlowAmount"].replace(to_replace=0, value=1E-15, inplace=True)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (
        total_db["FlowAmount"] / total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        """Flow-amount-weighted mean of a grouped series; falls back to the
        unweighted mean, then NaN, when the weights are unusable."""
        try:
            wts = total_db.loc[pdser.index, "FlowAmount"]
            result = np.average(pdser, weights=wts)
        except Exception:
            # FIX: was a bare `except:`; `Exception` keeps the fallback
            # behavior (e.g. all-zero weights) without swallowing
            # interpreter-level signals.
            module_logger.debug(
                f"Error calculating weighted mean for {pdser.name}-"
                f"likely from 0 FlowAmounts"
                # f"{total_db.loc[pdser.index[0],cols]}"
            )
            try:
                with np.errstate(all='raise'):
                    result = np.average(pdser)
            except (ArithmeticError, ValueError, FloatingPointError):
                # FIX: `except A or B or C` evaluates the `or` first and
                # only ever caught ArithmeticError; a tuple catches all
                # three as intended.
                result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating uncertainty"
    )
    database_f3 = total_db.groupby(
        groupby_cols + ["Year", "source_string"],
        as_index=False).agg({
            "FlowAmount": ["sum", "count"],
            "TemporalCorrelation": wm,
            "TechnologicalCorrelation": wm,
            "GeographicalCorrelation": wm,
            "DataCollection": wm,
            "ReliabilityScore": wm,
            "facility_emission_factor": ["min", "max", geo_mean],
        })
    # Flatten the MultiIndex produced by the multi-function agg above.
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]
    # Input flows get no uncertainty distribution.
    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )
    # Canadian mixes carry FuelCategory "ALL"; their electricity totals come
    # from total_db rather than electricity_df.
    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(
            by=groupby_cols, as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    # Undo the placeholder UUID introduced before aggregation.
    database_f3.loc[
        database_f3["FlowUUID"] == "dummy-uuid", "FlowUUID"
    ] = float("nan")
    database_f3.loc[
        canada_db.index, "electricity_sum"
    ] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (
        database_f3["FlowAmount"] / database_f3["electricity_sum"])
    # Infinite values generally coming from places with 0 generation. This
    # happens particularly with the Canadian mixes.
    database_f3["Emission_factor"].replace(
        to_replace=float("inf"), value=0, inplace=True)
    database_f3["Emission_factor"].replace(
        to_replace=float("-inf"), value=0, inplace=True)
    if region_agg is not None:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor",
            "uncertaintyLognormParams",
            "uncertaintyMin",
            "uncertaintyMax",
            "FuelCategory",
            "FlowName",
        ] + region_agg].apply(calc_geom_std, axis=1))
    else:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor",
            "uncertaintyLognormParams",
            "uncertaintyMin",
            "uncertaintyMax",
            "FuelCategory",
            "FlowName",
        ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
def create_generation_process_df():
    """
    Reads emissions and generation data from different sources to provide
    facility-level emissions. Most important inputs to this process come
    from the model configuration file.

    Parameters
    ----------
    None

    Returns
    ----------
    dataframe
        Dataframe includes all facility-level emissions
    """
    # Deferred project imports keep module import time down and avoid
    # circular-import problems with the egrid_filter / combinator modules.
    from electricitylci.eia923_generation import (build_generation_data,
                                                  eia923_primary_fuel)
    from electricitylci.egrid_filter import (
        egrid_facilities_to_include,
        emissions_and_waste_for_selected_egrid_facilities,
    )
    from electricitylci.generation import (
        egrid_facilities_w_fuel_region,
        add_technological_correlation_score,
        add_temporal_correlation_score,
    )
    import electricitylci.emissions_other_sources as em_other
    import electricitylci.ampd_plant_emissions as ampd
    from electricitylci.combinator import ba_codes
    import electricitylci.manual_edits as edits

    # Maps the Compartment_path strings found in the emissions data to the
    # short compartment names used downstream.
    COMPARTMENT_DICT = {
        "emission/air": "air",
        "emission/water": "water",
        "emission/ground": "ground",
        "input": "input",
        "output": "output",
        "waste": "waste",
        "air": "air",
        "water": "water",
        "ground": "ground",
    }
    if model_specs.replace_egrid:
        # eGRID replaced by EIA-923 generation plus AMPD (CEMS) emissions.
        generation_data = build_generation_data().drop_duplicates()
        cems_df = ampd.generate_plant_emissions(model_specs.eia_gen_year)
        # FlowUUID is dropped so the CEMS rows are re-mapped later by
        # map_emissions_to_fedelemflows.
        cems_df.drop(columns=["FlowUUID"], inplace=True)
        emissions_and_waste_for_selected_egrid_facilities = em_other.integrate_replace_emissions(
            cems_df, emissions_and_waste_for_selected_egrid_facilities)
    else:
        from electricitylci.egrid_filter import electricity_for_selected_egrid_facilities
        generation_data = electricity_for_selected_egrid_facilities
        generation_data["Year"] = model_specs.egrid_year
        generation_data["FacilityID"] = generation_data["FacilityID"].astype(
            int)
    # generation_data = build_generation_data(
    #     egrid_facilities_to_include=egrid_facilities_to_include
    # )
    # NOTE(review): this .drop() is neither assigned back nor inplace, so it
    # is a no-op; the later drop of "FacilityID_x"/"FacilityID_y" appears to
    # rely on the FacilityID column surviving this point — confirm before
    # "fixing".
    emissions_and_waste_for_selected_egrid_facilities.drop(
        columns=["FacilityID"])
    emissions_and_waste_for_selected_egrid_facilities[
        "eGRID_ID"] = emissions_and_waste_for_selected_egrid_facilities[
            "eGRID_ID"].astype(int)
    # Attach generation (MWh) to each facility's emissions rows.
    final_database = pd.merge(
        left=emissions_and_waste_for_selected_egrid_facilities,
        right=generation_data,
        right_on=["FacilityID", "Year"],
        left_on=["eGRID_ID", "Year"],
        how="left",
    )
    egrid_facilities_w_fuel_region[
        "FacilityID"] = egrid_facilities_w_fuel_region["FacilityID"].astype(
            int)
    # Attach fuel category / region attributes; suffixes keep the left-hand
    # columns unsuffixed when names collide.
    final_database = pd.merge(
        left=final_database,
        right=egrid_facilities_w_fuel_region,
        left_on="eGRID_ID",
        right_on="FacilityID",
        how="left",
        suffixes=["", "_right"],
    )
    if model_specs.replace_egrid:
        # With eGRID replaced, fuel categories come from EIA-923 primary
        # fuel, keyed by plant id.
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (primary_fuel_df[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        final_database["FuelCategory"] = final_database["eGRID_ID"].map(
            key_df["FuelCategory"])
    else:
        # Otherwise only backfill rows whose FuelCategory is missing, using
        # other rows for the same facility.
        key_df = (final_database[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        final_database.loc[final_database["FuelCategory"].isnull(),
                           "FuelCategory"] = final_database.loc[
                               final_database["FuelCategory"].isnull(),
                               "eGRID_ID"].map(key_df["FuelCategory"])
    # if replace_egrid:
    #     final_database["FuelCategory"].fillna(
    #         final_database["FuelCategory_right"], inplace=True
    #     )
    final_database["Final_fuel_agg"] = final_database["FuelCategory"]
    # if model_specs.use_primaryfuel_for_coal:
    #     final_database.loc[
    #         final_database["FuelCategory"] == "COAL", ["Final_fuel_agg"]
    #     ] = final_database.loc[
    #         final_database["FuelCategory"] == "COAL", "PrimaryFuel"
    #     ]
    # Keep only rows where the emissions year matches the generation year;
    # the KeyError path covers merges that produced no Year_x/Year_y split.
    try:
        year_filter = final_database["Year_x"] == final_database["Year_y"]
        final_database = final_database.loc[year_filter, :]
        final_database.drop(columns="Year_y", inplace=True)
    except KeyError:
        pass
    final_database.rename(columns={"Year_x": "Year"}, inplace=True)
    # Map flows to the federal elementary flow list (adds TargetFlowUUID).
    final_database = map_emissions_to_fedelemflows(final_database)
    dup_cols_check = [
        "FacilityID",
        "FuelCategory",
        "FlowName",
        "FlowAmount",
        "Compartment",
    ]
    # Remove duplicated columns from the merges, then duplicated rows.
    final_database = final_database.loc[:, ~final_database.columns.duplicated()]
    final_database = final_database.drop_duplicates(subset=dup_cols_check)
    final_database.drop(
        columns=["FuelCategory", "FacilityID_x", "FacilityID_y"],
        inplace=True)
    final_database.rename(
        columns={
            "Final_fuel_agg": "FuelCategory",
            "TargetFlowUUID": "FlowUUID",
        },
        inplace=True,
    )
    # Data-quality indicator scores: temporal/technological are computed,
    # DataCollection and GeographicalCorrelation are fixed defaults here.
    final_database = add_temporal_correlation_score(
        final_database, model_specs.electricity_lci_target_year)
    final_database = add_technological_correlation_score(final_database)
    final_database["DataCollection"] = 5
    final_database["GeographicalCorrelation"] = 1
    final_database["eGRID_ID"] = final_database["eGRID_ID"].astype(int)
    final_database.sort_values(by=["eGRID_ID", "Compartment", "FlowName"],
                               inplace=True)
    final_database["stage_code"] = "Power plant"
    # Preserve the full compartment path, then collapse to short names.
    final_database["Compartment_path"] = final_database["Compartment"]
    final_database["Compartment"] = final_database["Compartment_path"].map(
        COMPARTMENT_DICT)
    # Region lookups keyed on Balancing Authority Code.
    final_database["Balancing Authority Name"] = final_database[
        "Balancing Authority Code"].map(ba_codes["BA_Name"])
    final_database["EIA_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["EIA_Region"])
    final_database["FERC_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["FERC_Region"])
    # Apply any configured manual edits for this module/function.
    final_database = edits.check_for_edits(final_database, "generation.py",
                                           "create_generation_process_df")
    return final_database