Example #1
def add_data_collection_score(db, elec_df, subregion="BA"):
    """
    Adds the data collection score, which is a function of how much of the
    total electricity generated in a subregion is captured by the denominator
    used in the final emission factor.

    Parameters
    ----------
    db : dataframe
        Dataframe containing facility-level emissions as generated by
        create_generation_process_df.
    elec_df : dataframe
        Dataframe containing the totals for various subregion/source
        combinations. These are used as the denominators in the emission
        factors.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.
    """
    from electricitylci.dqi import data_collection_lower_bound_to_dqi
    from electricitylci.aggregation_selector import subregion_col

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = region_agg + fuel_agg + ["Year"]
    else:
        groupby_cols = fuel_agg + ["Year"]
    temp_df = db.merge(
        right=elec_df,
        left_on=groupby_cols + ["source_string"],
        right_on=groupby_cols + ["source_string"],
        how="left",
    )
    reduced_db = db.drop_duplicates(subset=groupby_cols + ["eGRID_ID"])
    region_elec = reduced_db.groupby(groupby_cols,
                                     as_index=False)["Electricity"].sum()
    region_elec.rename(columns={"Electricity": "region_fuel_electricity"},
                       inplace=True)
    temp_df = temp_df.merge(
        right=region_elec,
        left_on=groupby_cols,
        right_on=groupby_cols,
        how="left",
    )
    db["Percent_of_Gen_in_EF_Denominator"] = (
        temp_df["electricity_sum"] / temp_df["region_fuel_electricity"])
    db["DataCollection"] = db["Percent_of_Gen_in_EF_Denominator"].apply(
        lambda x: lookup_score_with_bound_key(
            x, data_collection_lower_bound_to_dqi))
    db = db.drop(columns="Percent_of_Gen_in_EF_Denominator")
    return db
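
# The DQI lookup above relies on lookup_score_with_bound_key and the
# data_collection_lower_bound_to_dqi mapping from electricitylci.dqi. The
# following is a rough, standalone sketch of that lookup pattern; the bounds
# and scores are invented for illustration, not the package's actual values.
import math

example_lower_bound_to_dqi = {0.8: 1, 0.6: 2, 0.4: 3, 0.2: 4, 0: 5}

def example_lookup_score_with_bound_key(value, bound_to_dqi):
    """Return the score for the largest lower bound that the value meets."""
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return None
    for lower_bound in sorted(bound_to_dqi, reverse=True):
        if value >= lower_bound:
            return bound_to_dqi[lower_bound]
    return max(bound_to_dqi.values())

# A region/fuel combination with 75% of its generation captured in the EF
# denominator maps to a data collection score of 2 under these assumed bounds.
print(example_lookup_score_with_bound_key(0.75, example_lower_bound_to_dqi))
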
Example #2
def olcaschema_genprocess(database, upstream_dict={}, subregion="BA"):
    """Turns the give database containing generator facility emissions
    into dictionaries that contain the required data for insertion into
    an openLCA-compatible json-ld. Additionally, default providers
    for fuel inputs are mapped, using the information contained in the dictionary
    containing openLCA-formatted data for the fuels.

    Parameters
    ----------
    database : dataframe
        Dataframe containing aggregated emissions to be turned into openLCA
        unit processes
    upstream_dict : dictionary, optional
        Dictionary as created by upstream_dict.py, containing the openLCA
        formatted data for all of the fuel inputs. This function will use the
        names and UUIDs from the entries to assign them as default providers.
    subregion : str, optional
        The subregion level of the aggregated data, by default "BA". See
        aggregation_selector.py for available subregions.

    Returns
    -------
    dictionary: dictionary containing openLCA-formatted data
    """
    from electricitylci.process_dictionary_writer import (
        unit,
        flow_table_creation,
        ref_exchange_creator,
        uncertainty_table_creation,
        process_doc_creation,
    )

    from electricitylci.aggregation_selector import subregion_col

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        base_cols = region_agg + fuel_agg
    else:
        base_cols = fuel_agg
    non_agg_cols = [
        "stage_code",
        "FlowName",
        "FlowUUID",
        "Compartment",
        "Unit",
        "Year",
        "source_string",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
        "Emission_factor",
        "GeomMean",
        "GeomSD",
    ]

    def turn_data_to_dict(data, upstream_dict):

        module_logger.debug(
            f"Turning flows from {data.name} into dictionaries")
        cols_for_exchange_dict = [
            "internalId",
            "@type",
            "avoidedProduct",
            "flow",
            "flowProperty",
            "input",
            "quantitativeReference",
            "baseUncertainty",
            "provider",
            "amount",
            "amountFormula",
            "unit",
            "pedigreeUncertainty",
            "dqEntry",
            "uncertainty",
            "comment",
        ]
        year = ",".join(data["Year"].astype(str).unique())
        datasources = ",".join(data["source_string"].astype(str).unique())
        data["Maximum"] = data["uncertaintyMax"]
        data["Minimum"] = data["uncertaintyMin"]
        data["uncertainty"] = ""
        data["internalId"] = ""
        data["@type"] = "Exchange"
        data["avoidedProduct"] = False
        data["flowProperty"] = ""
        data["input"] = False
        input_filter = (
            (data["Compartment"].str.lower().str.contains("input"))
            | (data["Compartment"].str.lower().str.contains("resource"))
            | (data["Compartment"].str.lower().str.contains("technosphere")))
        data.loc[input_filter, "input"] = True
        data["baseUncertainty"] = ""
        data["provider"] = ""
        data["unit"] = data["Unit"]
        #        data["ElementaryFlowPrimeContext"] = data["Compartment"]
        #        default_unit = unit("kg")
        #        data["unit"] = [default_unit] * len(data)
        data["FlowType"] = "ELEMENTARY_FLOW"
        product_filter = (
            (data["Compartment"].str.lower().str.contains("technosphere"))
            | (data["Compartment"].str.lower().str.contains("valuable")))
        data.loc[product_filter, "FlowType"] = "PRODUCT_FLOW"
        waste_filter = ((
            data["Compartment"].str.lower().str.contains("technosphere")))
        data.loc[waste_filter, "FlowType"] = "WASTE_FLOW"
        data["flow"] = ""
        provider_filter = data["stage_code"].isin(upstream_dict.keys())
        for index, row in data.loc[provider_filter, :].iterrows():
            provider_dict = {
                "name":
                upstream_dict[getattr(row, "stage_code")]["name"],
                "categoryPath":
                upstream_dict[getattr(row, "stage_code")]["category"],
                "processType":
                "UNIT_PROCESS",
                "@id":
                upstream_dict[getattr(row, "stage_code")]["uuid"],
            }
            data.at[index, "provider"] = provider_dict
            data.at[index, "unit"] = unit(upstream_dict[getattr(
                row, "stage_code")]["q_reference_unit"])
            data.at[index, "FlowType"] = "PRODUCT_FLOW"
        for index, row in data.iterrows():
            data.at[index, "uncertainty"] = uncertainty_table_creation(
                data.loc[index:index, :])
            data.at[index,
                    "flow"] = flow_table_creation(data.loc[index:index, :])
        data["amount"] = data["Emission_factor"]
        data["amountFormula"] = ""
        data["quantitativeReference"] = False
        data["dqEntry"] = (
            "(" + str(round(data["ReliabilityScore"].iloc[0], 1)) + ";" +
            str(round(data["TemporalCorrelation"].iloc[0], 1)) + ";" +
            str(round(data["GeographicalCorrelation"].iloc[0], 1)) + ";" +
            str(round(data["TechnologicalCorrelation"].iloc[0], 1)) + ";" +
            str(round(data["DataCollection"].iloc[0], 1)) + ")")
        data["pedigreeUncertainty"] = ""
        data["comment"] = f"{datasources} - {year}"
        data_for_dict = data[cols_for_exchange_dict]
        data_for_dict = data_for_dict.append(ref_exchange_creator(),
                                             ignore_index=True)
        data_dict = data_for_dict.to_dict("records")
        return data_dict

    database_groupby = database.groupby(by=base_cols)
    process_df = pd.DataFrame(database_groupby[non_agg_cols].apply(
        turn_data_to_dict, (upstream_dict)))
    process_df.columns = ["exchanges"]
    process_df.reset_index(inplace=True)
    process_df["@type"] = "Process"
    process_df["allocationFactors"] = ""
    process_df["defaultAllocationMethod"] = ""
    process_df["location"] = ""
    process_df["parameters"] = ""
    #    process_doc_dict = process_doc_creation(process_type)
    #    process_df["processDocumentation"] = [process_doc_dict]*len(process_df)
    process_df["processType"] = "UNIT_PROCESS"
    process_df["category"] = (
        "22: Utilities/2211: Electric Power Generation, Transmission and Distribution/"
        + process_df[fuel_agg].values)
    if region_agg is None:
        process_df["description"] = (
            "Electricity from " + process_df[fuel_agg].values +
            " produced at generating facilities in the US.")
        process_df["name"] = ("Electricity - " + process_df[fuel_agg].values +
                              " - US")
    else:
        process_df["description"] = (
            "Electricity from " + process_df[fuel_agg].values +
            " produced at generating facilities in the " +
            process_df[region_agg].values + " region.")
        process_df["name"] = ("Electricity - " + process_df[fuel_agg].values +
                              " - " + process_df[region_agg].values)
    process_df["description"] = (
        process_df["description"] +
        " This process was created with ElectricityLCI " +
        "(https://github.com/USEPA/ElectricityLCI) version " + elci_version +
        " using the " + model_specs.model_name + " configuration.")
    process_df["version"] = make_valid_version_num(elci_version)
    process_df["processDocumentation"] = [
        process_doc_creation(x)
        for x in list(process_df["FuelCategory"].str.lower())
    ]
    process_cols = [
        "@type",
        "allocationFactors",
        "defaultAllocationMethod",
        "exchanges",
        "location",
        "parameters",
        "processDocumentation",
        "processType",
        "name",
        "version",
        "category",
        "description",
    ]
    result = process_df[process_cols].to_dict("index")
    return result
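
# For reference, the "dqEntry" pedigree string built inside turn_data_to_dict
# is just the five rounded DQI scores joined with semicolons. A self-contained
# toy illustration follows; the score values are invented.
import pandas as pd

toy = pd.DataFrame({
    "ReliabilityScore": [1.234],
    "TemporalCorrelation": [2.0],
    "GeographicalCorrelation": [1.0],
    "TechnologicalCorrelation": [3.456],
    "DataCollection": [2.0],
})
dq_entry = (
    "(" + str(round(toy["ReliabilityScore"].iloc[0], 1)) + ";"
    + str(round(toy["TemporalCorrelation"].iloc[0], 1)) + ";"
    + str(round(toy["GeographicalCorrelation"].iloc[0], 1)) + ";"
    + str(round(toy["TechnologicalCorrelation"].iloc[0], 1)) + ";"
    + str(round(toy["DataCollection"].iloc[0], 1)) + ")"
)
print(dq_entry)  # "(1.2;2.0;1.0;3.5;2.0)"
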
Example #3
def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.

    Returns
    -------
    dataframe
        The dataframe provides the emissions aggregated to the specified
        subregion for each technology and stage in the input total_db. This
        dataframe includes an average emission factor and, when applicable,
        uncertainty distributions.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        # Alternatively we can use scipy.stats.lognorm to fit a distribution
        # and provide the parameters
        if (len(p_series) > 3) & (p_series.quantile(0.5) > 0):
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for"
                f"{df.loc[p_series.index[0],groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                l = len(data)
                try:
                    sd = np.std(log_data) / np.sqrt(l)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    pi1, pi2 = t.interval(alpha=0.90,
                                          df=l - 2,
                                          loc=mean,
                                          scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    upper_interval = np.max([
                        mean + sd2 / 2 + pi2 * np.sqrt(sd2 / l + sd2**2 /
                                                       (2 * (l - 1))),
                        mean + sd2 / 2 - pi2 * np.sqrt(sd2 / l + sd2**2 /
                                                       (2 * (l - 1))),
                    ])
                except:
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Unable to calculate geometric_mean")
                    return None
                if result is not None:
                    return result
                else:
                    module_logger.debug(
                        f"Problem generating uncertainty parameters \n"
                        f"{df.loc[p_series.index[0],groupby_cols].values}\n"
                        f"{p_series.values}"
                        f"{p_series.values+1}")
                    return None
        else:
            return None

    def calc_geom_std(df):
        if region_agg is not None:
            debug_string = f"{df[region_agg]}-{df['FuelCategory']}-{df['FlowName']}"
        else:
            debug_string = f"{df['FuelCategory']}-{df['FlowName']}"
        module_logger.debug(debug_string)
        if df["uncertaintyLognormParams"] is None:
            return None, None
        if isinstance(df["uncertaintyLognormParams"], str):
            params = ast.literal_eval(df["uncertaintyLognormParams"])
        try:
            length = len(df["uncertaintyLognormParams"])
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams"
                f"{df['uncertaintyLognormParams']}")
            return None, None

        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {len(params)}")
        else:
            # In some cases, the final emission factor is far different from the
            # geometric mean of the individual emission factors. Depending on the
            # severity, this could be a clear sign of outliers having a large impact
            # on the final emission factor. When the uncertainty is generated for
            # these cases, the results can be nonsensical - hence we skip them. A more
            # aggressive approach would be to re-assign the emission factor as well.
            if df["Emission_factor"] > df["uncertaintyLognormParams"][2]:
                return None, None
            else:
                c = np.log(df["uncertaintyLognormParams"][2]) - np.log(
                    df["Emission_factor"])
                b = -2**0.5 * erfinv(2 * 0.95 - 1)
                a = 0.5
                sd1 = (-b + (b**2 - 4 * a * c)**0.5) / (2 * a)
                sd2 = (-b - (b**2 - 4 * a * c)**0.5) / (2 * a)
                if not np.isnan(sd1) and not np.isnan(sd2):
                    if sd1 < sd2:
                        geostd = np.exp(sd1)
                        geomean = np.exp(
                            np.log(df["Emission_factor"]) - 0.5 * sd1**2)
                    else:
                        geostd = np.exp(sd2)
                        geomean = np.exp(
                            np.log(df["Emission_factor"]) - 0.5 * sd2**2)
                elif not np.isnan(sd1):
                    geostd = np.exp(sd1)
                    geomean = np.exp(
                        np.log(df["Emission_factor"]) - 0.5 * sd1**2)
                elif not np.isnan(sd2):
                    geostd = np.exp(sd2)
                    geomean = np.exp(
                        np.log(df["Emission_factor"]) - 0.5 * sd2**2)
                else:
                    return None, None
                # Screen out non-finite or zero geometric standard deviations
                if not np.isfinite(geostd) or geostd == 0:
                    return None, None
                return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg + fuel_agg +
            ["stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"])
        elec_df_groupby_cols = (region_agg + fuel_agg +
                                ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    if model_specs.replace_egrid:
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (primary_fuel_df[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        total_db.loc[total_db["FuelCategory"] != "ALL",
                     "FuelCategory"] = total_db["eGRID_ID"].map(
                         key_df["FuelCategory"])
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    total_db["FlowAmount"].replace(to_replace=0, value=1E-15, inplace=True)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (total_db["FlowAmount"] /
                                            total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        try:
            wts = total_db.loc[pdser.index, "FlowAmount"]
            result = np.average(pdser, weights=wts)
        except:
            module_logger.debug(
                f"Error calculating weighted mean for {pdser.name}-"
                f"likely from 0 FlowAmounts"
                # f"{total_db.loc[pdser.index[0],cols]}"
            )
            try:
                with np.errstate(all='raise'):
                    result = np.average(pdser)
            except (ArithmeticError, ValueError, FloatingPointError):
                result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating uncertainty"
    )

    database_f3 = total_db.groupby(groupby_cols + ["Year", "source_string"],
                                   as_index=False).agg({
                                       "FlowAmount": ["sum", "count"],
                                       "TemporalCorrelation":
                                       wm,
                                       "TechnologicalCorrelation":
                                       wm,
                                       "GeographicalCorrelation":
                                       wm,
                                       "DataCollection":
                                       wm,
                                       "ReliabilityScore":
                                       wm,
                                       "facility_emission_factor":
                                       ["min", "max", geo_mean],
                                   })
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]

    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )

    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(by=groupby_cols,
                                         as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    database_f3.loc[database_f3["FlowUUID"] == "dummy-uuid",
                    "FlowUUID"] = float("nan")
    database_f3.loc[canada_db.index,
                    "electricity_sum"] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (database_f3["FlowAmount"] /
                                      database_f3["electricity_sum"])
    # Infinite values generally come from places with 0 generation. This happens
    # particularly with the Canadian mixes.
    database_f3["Emission_factor"].replace(to_replace=float("inf"),
                                           value=0,
                                           inplace=True)
    database_f3["Emission_factor"].replace(to_replace=float("-inf"),
                                           value=0,
                                           inplace=True)
    if region_agg is not None:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor", "uncertaintyLognormParams", "uncertaintyMin",
            "uncertaintyMax", "FuelCategory", "FlowName"
        ] + region_agg].apply(calc_geom_std, axis=1))
    else:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor", "uncertaintyLognormParams", "uncertaintyMin",
            "uncertaintyMax", "FuelCategory", "FlowName"
        ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
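
# The quadratic solved in calc_geom_std appears to follow from treating
# Emission_factor as the arithmetic mean of a lognormal, exp(mu + sigma**2/2),
# and the third uncertaintyLognormParams entry as an upper point of the form
# exp(mu + z*sigma), where z = sqrt(2)*erfinv(2*0.95 - 1) ~ 1.645. Taking logs
# gives 0.5*sigma**2 - z*sigma + (ln U - ln EF) = 0, i.e. the a, b, c used
# above. A standalone check with made-up parameters:
import numpy as np
from scipy.special import erfinv

mu, sigma = -2.0, 0.6                      # invented lognormal parameters
ef = np.exp(mu + sigma**2 / 2)             # arithmetic mean of the lognormal
z = 2**0.5 * erfinv(2 * 0.95 - 1)          # ~1.645, the 95% normal quantile
upper = np.exp(mu + z * sigma)             # upper-interval analogue

a, b, c = 0.5, -z, np.log(upper) - np.log(ef)
print(np.roots([a, b, c]), sigma)          # one root recovers sigma (0.6)
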
Example #4
def calculate_electricity_by_source(db, subregion="BA"):
    """
    This function calculates the electricity totals by region and source
    using the same approach as the original generation.py, with attempts made
    to speed it up. That is, each flow will have a source associated with it
    (eGRID, NEI, TRI, RCRAInfo). To develop an emission factor, the FlowAmount
    will need to be divided by electricity generation. This routine sums all
    electricity generation for all source/subregion combinations. So if
    a subregion aggregates FlowAmounts sourced from NEI and TRI, then the
    denominator will be all production from plants that reported to NEI or
    TRI for that subregion.

    Parameters
    ----------
    db : dataframe
        Dataframe containing facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.
    """

    from electricitylci.aggregation_selector import subregion_col
    all_sources = '_'.join(sorted(list(db["Source"].unique())))
    power_plant_criteria = db["stage_code"] == "Power plant"
    db_powerplant = db.loc[power_plant_criteria, :]
    db_nonpower = db.loc[~power_plant_criteria, :]
    region_agg = subregion_col(subregion)

    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (region_agg + fuel_agg +
                        ["Year", "stage_code", "FlowName", "Compartment"])
        elec_groupby_cols = region_agg + fuel_agg + ["Year"]
    else:
        groupby_cols = fuel_agg + [
            "Year",
            "stage_code",
            "FlowName",
            "Compartment",
        ]
        elec_groupby_cols = fuel_agg + ["Year"]

    combine_source_by_flow = lambda x: _combine_sources(
        x, db, ["FlowName", "Compartment"], 1)
    combine_source_lambda = lambda x: _combine_sources(x, db_multiple_sources,
                                                       groupby_cols)
    # power_db = db.loc[db["stage_code"]=='Power plant',:]

    # This is a pretty expensive process when we have to start looking at each
    # flow generated in each compartment for each balancing authority area.
    # To hopefully speed this up, we'll group by FlowName and Compartment and
    # try to eliminate flows where all sources are single entities.
    source_df = pd.DataFrame()
    source_df = pd.DataFrame(
        db_powerplant.groupby(["FlowName", "Compartment"
                               ])[["Source"]].apply(combine_source_by_flow),
        columns=["source_list"],
    )
    source_df[["source_list", "source_string"
               ]] = pd.DataFrame(source_df["source_list"].values.tolist(),
                                 index=source_df.index)
    source_df.reset_index(inplace=True)
    old_index = db_powerplant.index
    db_powerplant = db_powerplant.merge(
        right=source_df,
        left_on=["FlowName", "Compartment"],
        right_on=["FlowName", "Compartment"],
        how="left",
    )
    db_powerplant.index = old_index
    db_multiple_sources = db_powerplant.loc[
        db_powerplant["source_string"].isna(), :]
    if len(db_multiple_sources) > 0:
        source_df = pd.DataFrame(
            db_multiple_sources.groupby(groupby_cols)[[
                "Source"
            ]].apply(combine_source_lambda),
            columns=["source_list"],
        )
        source_df[["source_list", "source_string"
                   ]] = pd.DataFrame(source_df["source_list"].values.tolist(),
                                     index=source_df.index)
        source_df.reset_index(inplace=True)
        db_multiple_sources.drop(columns=["source_list", "source_string"],
                                 inplace=True)
        old_index = db_multiple_sources.index
        db_multiple_sources = db_multiple_sources.merge(
            right=source_df,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
        db_multiple_sources.index = old_index
        # db[["source_string","source_list"]].fillna(db_multiple_sources[["source_string","source_list"]],inplace=True)
        db_powerplant.loc[
            db_powerplant["source_string"].isna(),
            ["source_string", "source_list"]] = db_multiple_sources[[
                "source_string", "source_list"
            ]]
    unique_source_lists = list(db_powerplant["source_string"].unique())
    # unique_source_lists = [x for x in unique_source_lists if ((str(x) != "nan")&(str(x)!="netl"))]
    unique_source_lists = [
        x for x in unique_source_lists if ((str(x) != "nan"))
    ]
    # One set of emissions passed into this routine may be life cycle emissions
    # used as proxies for Canadian generation. In those cases the electricity
    # generation will be equal to the Electricity already in the dataframe.

    elec_sum_lists = list()

    unique_source_lists = unique_source_lists + [all_sources]
    for src in unique_source_lists:
        module_logger.info(f"Calculating electricity for {src}")
        # src_filter = db.apply(lambda x: x["Source"] in src, axis=1)
        db["temp_src"] = src
        src_filter = [
            a in b for a, b in zip(db["Source"].values.tolist(),
                                   db["temp_src"].values.tolist())
        ]
        #        total_filter = ~fuelcat_all & src_filter
        sub_db = db.loc[src_filter, :]
        sub_db.drop_duplicates(subset=fuel_agg + ["eGRID_ID"], inplace=True)
        sub_db_group = sub_db.groupby(elec_groupby_cols, as_index=False).agg({
            "Electricity": [np.sum, np.mean],
            "eGRID_ID":
            "count"
        })
        sub_db_group.columns = elec_groupby_cols + [
            "electricity_sum",
            "electricity_mean",
            "facility_count",
        ]
        #        zero_elec_filter = sub_db_group["electricity_sum"]==0
        sub_db_group["source_string"] = src
        elec_sum_lists.append(sub_db_group)
    db_nonpower["source_string"] = all_sources
    db_nonpower["source_list"] = [all_sources] * len(db_nonpower)
    elec_sums = pd.concat(elec_sum_lists, ignore_index=True)
    elec_sums.sort_values(by=elec_groupby_cols, inplace=True)
    db = pd.concat([db_powerplant, db_nonpower])
    return db, elec_sums
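
# A toy illustration of the denominator logic above (names and numbers are
# invented): a plant counts toward a source string's electricity total when
# its own Source appears as a substring of that string.
import pandas as pd

plants = pd.DataFrame({
    "eGRID_ID": [1, 2, 3],
    "FuelCategory": ["GAS", "GAS", "COAL"],
    "Source": ["eGRID", "NEI", "eGRID"],
    "Electricity": [100.0, 50.0, 200.0],
})
src = "eGRID_NEI"
in_src = [s in src for s in plants["Source"]]
denominator = plants.loc[in_src].groupby("FuelCategory")["Electricity"].sum()
print(denominator)  # COAL 200.0, GAS 150.0 for the eGRID_NEI source string
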
Example #5
def generate_regional_grid_loss(final_database, year, subregion="all"):
    """This function generates transmission and distribution losses for the
    provided generation data and given year, aggregated by subregion.

    Arguments:
        final_database: dataframe
            The database containing plant-level emissions.
        year: int
            Analysis year for the transmission and distribution loss data.
            Ideally this should match the year of your final_database.
    Returns:
        td_by_region: dataframe
            A dataframe of transmission and distribution loss rates as a
            fraction. This dataframe can be used to generate unit processes
            for transmission and distribution to match the regionally-
            aggregated emissions unit processes.
    """
    print("Generating factors for transmission and distribution losses")
    from electricitylci.eia923_generation import build_generation_data
    from electricitylci.combinator import ba_codes
    from electricitylci.egrid_facilities import egrid_facilities
    td_calc_columns = [
        "State",
        "NERC",
        "FuelCategory",
        "PrimaryFuel",
        "NERC",
        "Balancing Authority Name",
        "Electricity",
        "Year",
        "Subregion",
        "FRS_ID",
        "eGRID_ID",
    ]
    #    plant_generation = final_database[td_calc_columns].drop_duplicates()
    egrid_facilities_w_fuel_region = egrid_facilities[[
        "FacilityID", "Subregion", "PrimaryFuel", "FuelCategory", "NERC",
        "PercentGenerationfromDesignatedFuelCategory",
        "Balancing Authority Name", "Balancing Authority Code", "State"
    ]]
    egrid_facilities_w_fuel_region[
        "FacilityID"] = egrid_facilities_w_fuel_region["FacilityID"].astype(
            int)
    plant_generation = build_generation_data(generation_years=[year])
    plant_generation["FacilityID"] = plant_generation["FacilityID"].astype(int)
    plant_generation = plant_generation.merge(egrid_facilities_w_fuel_region,
                                              on=["FacilityID"],
                                              how="left")
    plant_generation["Balancing Authority Name"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["BA_Name"])
    plant_generation["FERC_Region"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["FERC_Region"])
    plant_generation["EIA_Region"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["EIA_Region"])
    td_rates = eia_trans_dist_download_extract(f"{year}")
    td_by_plant = pd.merge(
        left=plant_generation,
        right=td_rates,
        left_on="State",
        right_index=True,
        how="left",
    )
    td_by_plant.dropna(subset=["t_d_losses"], inplace=True)
    td_by_plant["t_d_losses"] = td_by_plant["t_d_losses"].astype(float)

    from electricitylci.aggregation_selector import subregion_col
    aggregation_column = subregion_col(subregion)
    wm = lambda x: np.average(x,
                              weights=td_by_plant.loc[x.index, "Electricity"])
    if aggregation_column is not None:
        td_by_region = td_by_plant.groupby(
            aggregation_column, as_index=False).agg({"t_d_losses": wm})
    else:
        td_by_region = pd.DataFrame(td_by_plant.agg({"t_d_losses": wm}),
                                    columns=["t_d_losses"])
        td_by_region["Region"] = "US"
    return td_by_region
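
# A toy illustration of the generation-weighted loss aggregation above; the
# rates and generation values are invented (the real loss rates come from
# eia_trans_dist_download_extract).
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "Balancing Authority Name": ["BA-A", "BA-A", "BA-B"],
    "Electricity": [300.0, 100.0, 500.0],
    "t_d_losses": [0.04, 0.08, 0.05],
})
td_by_region = toy.groupby("Balancing Authority Name").apply(
    lambda g: np.average(g["t_d_losses"], weights=g["Electricity"]))
print(td_by_region)  # BA-A: (300*0.04 + 100*0.08)/400 = 0.05; BA-B: 0.05
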
Example #6
def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        # I think I actually need to replace this with the function contained in
        # process_exchange_aggregator_uncertainty.py. The approach of adding 1 will
        # also lead to some large errors when dealing with small numbers.
        # Alternatively we can use scipy.stats.lognorm to fit a distribution
        # and provide the parameters
        if (len(p_series) > 3) & (p_series.quantile(0.5) > 0):
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for"
                f"{df.loc[p_series.index[0],groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                l = len(data)
                try:
                    sd = np.std(log_data)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    pi1, pi2 = t.interval(alpha=0.90,
                                          df=l - 2,
                                          loc=mean,
                                          scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    upper_interval = np.max([
                        mean + sd2 / 2 + pi2 * np.sqrt(sd2 / l + sd2**2 /
                                                       (2 * (l - 1))),
                        mean + sd2 / 2 - pi2 * np.sqrt(sd2 / l + sd2**2 /
                                                       (2 * (l - 1))),
                    ])
                except:
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    print("Prolem with result")
                    return None
                if result is not None:
                    return result
                else:
                    module_logger.debug(
                        f"Problem generating uncertainty parameters \n"
                        f"{df.loc[p_series.index[0],groupby_cols].values}\n"
                        f"{p_series.values}"
                        f"{p_series.values+1}")
                    return None
        else:
            return None

    def calc_geom_std(df):

        if df["uncertaintyLognormParams"] is None:
            return None, None
        if isinstance(df["uncertaintyLognormParams"], str):
            params = ast.literal_eval(df["uncertaintyLognormParams"])
        try:
            length = len(df["uncertaintyLognormParams"])
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams"
                f"{df['uncertaintyLognormParams']}")
            return None, None
        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {len(params)}")
        try:
            geomean = df["Emission_factor"]
            geostd = np.exp((np.log(df["uncertaintyLognormParams"][2]) -
                             np.log(df["Emission_factor"])) / norm.ppf(0.95))
        except ArithmeticError:
            module_logger.info("Error estimating standard deviation")
            return None, None
        # Screen out non-finite geometric standard deviations
        if not np.isfinite(geostd):
            return None, None
        if geostd * geomean > df["uncertaintyMax"]:
            return None, None
        return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (region_agg + fuel_agg +
                        ["stage_code", "FlowName", "Compartment", "FlowUUID"])
        elec_df_groupby_cols = (region_agg + fuel_agg +
                                ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code",
            "FlowName",
            "Compartment",
            "FlowUUID",
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (total_db["FlowAmount"] /
                                            total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        try:
            wts = total_db.loc[pdser.index, "Electricity"]
            result = np.average(pdser, weights=wts)
        except:
            module_logger.info(
                f"Error calculating weighted mean for {pdser.name}-"
                f"{total_db.loc[pdser.index[0],cols]}")
            result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating uncertainty"
    )

    database_f3 = total_db.groupby(groupby_cols + ["Year", "source_string"],
                                   as_index=False).agg({
                                       "FlowAmount": ["sum", "count"],
                                       "TemporalCorrelation":
                                       wm,
                                       "TechnologicalCorrelation":
                                       wm,
                                       "GeographicalCorrelation":
                                       wm,
                                       "DataCollection":
                                       wm,
                                       "ReliabilityScore":
                                       wm,
                                       "facility_emission_factor":
                                       ["min", "max", geo_mean],
                                   })
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]

    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )

    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(by=groupby_cols,
                                         as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    database_f3.loc[database_f3["FlowUUID"] == "dummy-uuid",
                    "FlowUUID"] = float("nan")
    database_f3.loc[canada_db.index,
                    "electricity_sum"] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (database_f3["FlowAmount"] /
                                      database_f3["electricity_sum"])
    database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
        "Emission_factor",
        "uncertaintyLognormParams",
        "uncertaintyMin",
        "uncertaintyMax",
    ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
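
# This version of calc_geom_std is consistent with treating Emission_factor as
# the geometric mean (median) of a lognormal and the third
# uncertaintyLognormParams entry as its 95th percentile, so
# sigma = (ln U - ln EF) / norm.ppf(0.95) and the geometric standard deviation
# is exp(sigma). A standalone check with made-up parameters:
import numpy as np
from scipy.stats import norm

mu, sigma = -2.0, 0.6                            # invented lognormal parameters
ef = np.exp(mu)                                  # geometric mean (median)
upper = np.exp(mu + norm.ppf(0.95) * sigma)      # 95th percentile
geostd = np.exp((np.log(upper) - np.log(ef)) / norm.ppf(0.95))
print(geostd, np.exp(sigma))                     # both ~1.822
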