Exemplo n.º 1
0
def gaussian(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using gaussian distributions of U & V. For each
    missing entry, sample U & V based on mean and covariance of non-missing
    entries that have the same location, same month, and same hour.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms. Only its
        length is used here, as the number of plants per time step.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
    :return: (*pandas.DataFrame*) -- data frame with missing entries imputed
        when ``inplace`` is False (a plain copy of ``data`` if nothing is
        missing), otherwise None.
    """

    # NOTE(review): ``curve`` is only validated here; the power lookup below
    # always passes "IEC class 2" regardless of its value -- confirm intent.
    _check_curve(curve)
    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        # Nothing to impute: still honour the documented return value.
        return None if inplace else data_impute

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, hour in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The similar-entries selection is refreshed once every ``n_target``
        # missing entries and reused in between.
        # NOTE(review): presumably missing entries come in runs of one per
        # plant for the same time step -- verify against _find_to_impute.
        if i % n_target == 0:
            select = _select_similar(data, dates, hour)

        plant_id = data.loc[hour].plant_id
        select_plant = select[select.plant_id == plant_id]

        # Rows of uv_data are the U and V samples of the selected entries.
        uv_data = np.array(
            [select_plant["U"].to_numpy(), select_plant["V"].to_numpy()])
        cov = np.cov(uv_data)
        mean = np.mean(uv_data, axis=1)
        sample = np.random.multivariate_normal(mean=mean, cov=cov, size=1)
        data_impute.at[hour, "U"] = sample[0][0]
        data_impute.at[hour, "V"] = sample[0][1]

        # Read the wind components back from the frame that was just written
        # to. Reading from ``data`` would use the still-missing values when
        # ``inplace`` is False.
        u_imputed = data_impute.at[hour, "U"]
        v_imputed = data_impute.at[hour, "V"]
        wspd = np.sqrt(u_imputed**2 + v_imputed**2)
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[hour, "Pout"] = normalized_power

    if not inplace:
        return data_impute
Exemplo n.º 2
0
def simple(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using a simple procedure. For each missing entry,
    the extrema of the U and V components of the wind speed of all non missing
    entries that have the same location, same month, same hour are first found.
    Then, a U and a V value are randomly generated within the respective
    ranges.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms. Only its
        length is used here, as the number of plants per time step.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
    :return: (*pandas.DataFrame*) -- data frame with missing entries imputed
        when ``inplace`` is False (a plain copy of ``data`` if nothing is
        missing), otherwise None.
    """

    # NOTE(review): ``curve`` is only validated here; the power lookup below
    # always passes "IEC class 2" regardless of its value -- confirm intent.
    _check_curve(curve)
    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        # Nothing to impute: still honour the documented return value.
        return None if inplace else data_impute

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, hour in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The similar-entries selection is refreshed once every ``n_target``
        # missing entries and reused in between.
        # NOTE(review): presumably missing entries come in runs of one per
        # plant for the same time step -- verify against _find_to_impute.
        if i % n_target == 0:
            select = _select_similar(data, dates, hour)

        plant_id = data.loc[hour].plant_id
        select_plant = select[select.plant_id == plant_id]

        # Draw U and V uniformly between the observed extrema for this plant.
        min_u, max_u = select_plant["U"].min(), select_plant["U"].max()
        min_v, max_v = select_plant["V"].min(), select_plant["V"].max()
        data_impute.at[hour, "U"] = min_u + (max_u - min_u) * np.random.random()
        data_impute.at[hour, "V"] = min_v + (max_v - min_v) * np.random.random()

        # Read the wind components back from the frame that was just written
        # to. Reading from ``data`` would use the still-missing values when
        # ``inplace`` is False.
        u_imputed = data_impute.at[hour, "U"]
        v_imputed = data_impute.at[hour, "V"]
        wspd = np.sqrt(u_imputed**2 + v_imputed**2)
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[hour, "Pout"] = normalized_power

    if not inplace:
        return data_impute
Exemplo n.º 3
0
 def test_get_state_power_curves(self):
     # The state curves must load as a data frame indexed by wind speed bin.
     curves = get_state_power_curves()
     index_label = curves.index.name
     self.assertIsInstance(curves, pd.DataFrame)
     self.assertEqual(index_label, "Speed bin (m/s)")
Exemplo n.º 4
0
 def setUp(self):
     # Load both power-curve tables once per test and keep them on the case.
     turbine_curves = get_turbine_power_curves()
     self.tpc = turbine_curves
     state_curves = get_state_power_curves()
     self.spc = state_curves
Exemplo n.º 5
0
def retrieve_data(wind_farm, start_date="2016-01-01", end_date="2016-12-31"):
    """Retrieve wind speed data from NOAA's server.

    One request is issued per entry yielded by ``NoaaApi.get_hourly_data``.
    For each response, the U and V wind components at the grid point closest
    (smallest angular distance) to every wind farm are extracted and converted
    to a power output. Responses that are not HTTP 200, or that fail to parse,
    are recorded as missing and their U, V and Pout values are set to NaN.

    :param pandas.DataFrame wind_farm: plant data frame. The columns *'lat'*,
        *'lon'*, *'type'* and *'zone_id'* are read and the index is used as
        plant id.
    :param str start_date: start date, formatted as ``%Y-%m-%d``.
    :param str end_date: end date (inclusive), formatted as ``%Y-%m-%d``.
    :return: (*tuple*) -- First element is a pandas data frame with
        *'plant_id'*, *'U'*, *'V'*, *'Pout'*, *'ts'* and *'ts_id'* as columns.
        The power output is given for a 1MW generator and the U and V component of
        the wind speed 80-m above ground level are in m/s. Second element is a list
        of missing files (the URLs of the failed requests).
    """

    # Define the query box from the northernmost, southernmost, easternmost
    # and westernmost wind farms, padded by 1 degree in each direction.
    north_box = wind_farm.lat.max() + 1
    south_box = wind_farm.lat.min() - 1
    west_box = wind_farm.lon.min() - 1
    east_box = wind_farm.lon.max() + 1

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Information on wind farms
    n_target = len(wind_farm)

    lon_target = wind_farm.lon.values
    lat_target = wind_farm.lat.values
    id_target = wind_farm.index.values
    # Power curve key per plant: "Offshore" for offshore plants, otherwise the
    # abbreviation looked up from the plant's zone id.
    state_target = [
        "Offshore" if wind_farm.loc[i].type == "wind_offshore" else
        id2abv[wind_farm.loc[i].zone_id] for i in id_target
    ]

    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")

    box = {
        "north": north_box,
        "south": south_box,
        "west": west_box,
        "east": east_box
    }
    noaa = NoaaApi(box)
    url_count = len(noaa.get_path_list(start, end))

    # URLs of the requests whose data could not be used.
    missing = []
    # Plant id -> index into the flattened grid of the closest grid point.
    target2grid = OrderedDict()
    # Pre-allocate one row per (request, plant) pair; rows are filled by
    # position inside the request loop below.
    size = url_count * n_target
    data = pd.DataFrame({
        "plant_id": [0] * size,
        "ts": [np.nan] * size,
        "ts_id": [0] * size,
        "U": [0] * size,
        "V": [0] * size,
        "Pout": [0] * size,
    })

    # Timestamp attached to the current response; advanced by ``step`` after
    # each one.
    # NOTE(review): assumes responses arrive one per hour starting at
    # start_date -- confirm against NoaaApi.get_hourly_data.
    dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    step = datetime.timedelta(hours=1)

    # Fill ``target2grid``: for each plant, store the position of the grid
    # point with the smallest angular distance to the plant.
    def calc_angular_dist(lon_grid, lat_grid):
        n_grid = len(lon_grid)
        for j in range(n_target):
            uv_target = ll2uv(lon_target[j], lat_target[j])
            angle = [
                angular_distance(uv_target, ll2uv(lon_grid[k], lat_grid[k]))
                for k in range(n_grid)
            ]
            target2grid[id_target[j]] = np.argmin(angle)

    # Record the URL of an unusable response and blank out its values.
    def handle_missing(response, data_tmp):
        missing.append(response.url)

        # missing data are set to NaN.
        data_tmp["U"] = [np.nan] * n_target
        data_tmp["V"] = [np.nan] * n_target
        data_tmp["Pout"] = [np.nan] * n_target

    first = True
    request_iter = enumerate(noaa.get_hourly_data(start, end))
    for i, response in tqdm(request_iter, total=url_count):

        # One row per plant for this response; ts_id is 1-based.
        data_tmp = pd.DataFrame({
            "plant_id": id_target,
            "ts": [dt] * n_target,
            "ts_id": [i + 1] * n_target
        })

        if response.status_code == 200:
            try:
                # see demo notebook to understand file structure
                # The response body is parsed from memory; "tmp.nc" is only
                # the name given to the in-memory dataset.
                tmp = Dataset("tmp.nc", "r", memory=response.content)
                lon_grid = tmp.variables["lon"][:].flatten()
                lat_grid = tmp.variables["lat"][:].flatten()
                # NOTE(review): index [0, 1] presumably selects the first time
                # step and the 80-m level -- confirm against the demo notebook.
                u_wsp = tmp.variables[NoaaApi.var_u][0, 1, :, :].flatten()
                v_wsp = tmp.variables[NoaaApi.var_v][0, 1, :, :].flatten()

                if first:
                    # The angular distance is calculated once. The target to grid
                    # correspondence is stored in a dictionary.
                    calc_angular_dist(lon_grid, lat_grid)
                    first = False

                data_tmp["U"] = [
                    u_wsp[target2grid[id_target[j]]] for j in range(n_target)
                ]
                data_tmp["V"] = [
                    v_wsp[target2grid[id_target[j]]] for j in range(n_target)
                ]
                # Wind speed magnitude from its two components.
                wspd_target = np.sqrt(
                    pow(data_tmp["U"], 2) + pow(data_tmp["V"], 2))
                power = [
                    get_power(tpc, spc, wspd_target[j], state_target[j])
                    for j in range(n_target)
                ]
                data_tmp["Pout"] = power
            except Exception:
                # Any parsing failure is treated like a missing file: U, V and
                # Pout (including values already set above) become NaN.
                print(f"Failed to parse response from url={response.url}")
                handle_missing(response, data_tmp)
        else:
            handle_missing(response, data_tmp)

        # Positional write into the pre-allocated block for this response;
        # relies on data_tmp having the same column order as ``data``.
        data.iloc[i * n_target:(i + 1) * n_target, :] = data_tmp.values
        dt += step

    # Format data frame
    data["plant_id"] = data["plant_id"].astype(np.int32)
    data["ts_id"] = data["ts_id"].astype(np.int32)
    data["U"] = data["U"].astype(np.float32)
    data["V"] = data["V"].astype(np.float32)
    data["Pout"] = data["Pout"].astype(np.float32)

    data.sort_values(by=["ts_id", "plant_id"], inplace=True)
    data.reset_index(inplace=True, drop=True)
    return data, missing
Exemplo n.º 6
0
def calculate_pout(wind_farms, start_dt, end_dt, directory):
    """Calculate power output for wind farms based on hrrr data.
    Function assumes that user has already called
    :meth:`prereise.gather.winddata.hrrr.hrrr.retrieve_data` with the same
    start_dt, end_dt, and directory.

    :param pandas.DataFrame wind_farms: plant data frame. The columns
        *'type'* and *'zone_id'* are read and the index labels the output
        columns.
    :param str start_dt: start date.
    :param str end_dt: end date (inclusive).
    :param str directory: directory where hrrr data is contained. Every hourly
        file is opened from this directory.
    :return: (*pandas.Dataframe*) -- Pandas containing power out per wind farm
        on a per hourly basis between start_dt and end_dt inclusive. Structure of
        dataframe is:
            wind_farm1  wind_farm2
        dt1    POUT        POUT
        dt2    POUT        POUT
    """
    # Local import so the block does not depend on the file's import section.
    import os

    # Power curve key per plant: "Offshore" for offshore plants, otherwise the
    # abbreviation looked up from the plant's zone id.
    turbine_types = wind_farms.apply(
        lambda x: "Offshore" if x["type"] == "wind_offshore" else id2abv[x["zone_id"]],
        axis=1,
    )

    turbine_power_curves = get_turbine_power_curves()
    state_power_curves = get_state_power_curves()
    wind_data_lat_long = get_wind_data_lat_long(start_dt, directory)
    wind_farm_to_closest_wind_grid_indices = find_closest_wind_grids(
        wind_farms, wind_data_lat_long
    )
    dts = pd.date_range(start=start_dt, end=end_dt, freq="H").to_pydatetime()
    # Fetch wind speed data for each wind farm (or store NaN as applicable)
    wind_speed_data = pd.DataFrame(index=dts, columns=wind_farms.index, dtype=float)
    for dt in tqdm(dts):
        # Resolve the hourly file under ``directory`` rather than relative to
        # the current working directory.
        gribs = pygrib.open(os.path.join(directory, formatted_filename(dt)))
        try:
            u_component = gribs.select(name=U_COMPONENT_SELECTOR)[0].values.flatten()
            v_component = gribs.select(name=V_COMPONENT_SELECTOR)[0].values.flatten()
            wind_farm_specific_u_component = u_component[
                wind_farm_to_closest_wind_grid_indices
            ]
            wind_farm_specific_v_component = v_component[
                wind_farm_to_closest_wind_grid_indices
            ]
            # Wind speed magnitude from its two components.
            wind_speed_data.loc[dt] = np.sqrt(
                pow(wind_farm_specific_u_component, 2)
                + pow(wind_farm_specific_v_component, 2)
            )
        except ValueError:
            # If the GRIB file is empty, no wind speed values can be selected
            wind_speed_data.loc[dt] = np.nan
        finally:
            # One file is opened per hour; release the handle right away so
            # long date ranges do not exhaust file descriptors.
            gribs.close()

    # For each column, linearly interpolate any NaN values
    linear(wind_speed_data)
    # Then calculate wind power based on wind speed
    wind_power_data = [
        [
            get_power(
                turbine_power_curves,
                state_power_curves,
                wind_speed_data.loc[dt, w],
                turbine_types.loc[w],
            )
            for w in wind_farms.index
        ]
        for dt in tqdm(dts)
    ]
    df = pd.DataFrame(data=wind_power_data, index=dts, columns=wind_farms.index)

    return df