def gaussian(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using gaussian distributions of U & V.

    For each missing entry, sample U & V based on mean and covariance of
    non-missing entries that have the same location, same month, and same
    hour.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
        The value is validated by ``_check_curve``.
    :return: (*pandas.DataFrame*) -- data frame with missing entries imputed
        when *inplace* is False. *None* is returned when *inplace* is True or
        when ``_find_to_impute`` returns None.
    """
    _check_curve(curve)

    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        return

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, hour in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The selection of similar entries is refreshed once every n_target
        # missing entries and reused for the entries in between.
        # NOTE(review): presumably missing entries come in runs of n_target
        # (one per wind farm) -- confirm against _find_to_impute.
        if i % n_target == 0:
            select = _select_similar(data, dates, hour)

        plant_id = data.loc[hour].plant_id
        select_plant = select[select.plant_id == plant_id]

        # 2 x N array: first row holds U, second row holds V
        uv_data = np.array(
            [select_plant["U"].to_numpy(), select_plant["V"].to_numpy()]
        )
        cov = np.cov(uv_data)
        mean = np.mean(uv_data, axis=1)
        sample = np.random.multivariate_normal(mean=mean, cov=cov, size=1)
        data_impute.at[hour, "U"] = sample[0][0]
        data_impute.at[hour, "V"] = sample[0][1]

        # Read the imputed components back from data_impute. When inplace is
        # False, ``data`` still holds the missing values for this entry, so
        # using it here would not reflect the values sampled above.
        imputed = data_impute.loc[hour]
        wspd = np.sqrt(imputed.U**2 + imputed.V**2)

        # NOTE(review): the curve name passed to get_power is always
        # "IEC class 2", regardless of the *curve* argument -- confirm intent.
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[hour, "Pout"] = normalized_power

    if not inplace:
        return data_impute
def simple(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using a simple procedure.

    For each missing entry, the extrema of the U and V components of the wind
    speed of all non missing entries that have the same location, same month,
    same hour are first found. Then, a U and V value are randomly generated
    between the respective derived ranges.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
        The value is validated by ``_check_curve``.
    :return: (*pandas.DataFrame*) -- data frame with missing entries imputed
        when *inplace* is False. *None* is returned when *inplace* is True or
        when ``_find_to_impute`` returns None.
    """
    _check_curve(curve)

    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        return

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, hour in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The selection of similar entries is refreshed once every n_target
        # missing entries and reused for the entries in between.
        # NOTE(review): presumably missing entries come in runs of n_target
        # (one per wind farm) -- confirm against _find_to_impute.
        if i % n_target == 0:
            select = _select_similar(data, dates, hour)

        plant_id = data.loc[hour].plant_id
        select_plant = select[select.plant_id == plant_id]

        min_u, max_u = select_plant["U"].min(), select_plant["U"].max()
        min_v, max_v = select_plant["V"].min(), select_plant["V"].max()

        # Draw U then V uniformly within the observed [min, max] ranges
        data_impute.at[hour, "U"] = min_u + (max_u - min_u) * np.random.random()
        data_impute.at[hour, "V"] = min_v + (max_v - min_v) * np.random.random()

        # Read the imputed components back from data_impute. When inplace is
        # False, ``data`` still holds the missing values for this entry, so
        # using it here would not reflect the values drawn above.
        imputed = data_impute.loc[hour]
        wspd = np.sqrt(imputed.U**2 + imputed.V**2)

        # NOTE(review): the curve name passed to get_power is always
        # "IEC class 2", regardless of the *curve* argument -- confirm intent.
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[hour, "Pout"] = normalized_power

    if not inplace:
        return data_impute
def test_get_state_power_curves(self):
    """Check the type and index name of the state power curves table."""
    curves = get_state_power_curves()

    # The loader must hand back a pandas data frame ...
    self.assertIsInstance(curves, pd.DataFrame)

    # ... whose index carries the expected name.
    index_name = curves.index.name
    self.assertEqual(index_name, "Speed bin (m/s)")
def setUp(self):
    """Load the turbine and state power curves onto the test instance."""
    # Evaluated left to right: turbine curves first, then state curves.
    self.tpc, self.spc = get_turbine_power_curves(), get_state_power_curves()
def retrieve_data(wind_farm, start_date="2016-01-01", end_date="2016-12-31"):
    """Retrieve wind speed data from NOAA's server.

    :param pandas.DataFrame wind_farm: plant data frame.
    :param str start_date: start date.
    :param str end_date: end date (inclusive).
    :return: (*tuple*) -- First element is a pandas data frame with
        *'plant_id'*, *'U'*, *'V'*, *'Pout'*, *'ts'* and *'ts_id'* as columns.
        The power output is given for a 1MW generator and the U and V component
        of the wind speed 80-m above ground level are in m/s. Second element is
        a list of missing files.
    """
    # Define query box boundaries using the most northern, southern, eastern
    # and western. Add 1deg in each direction
    north_box = wind_farm.lat.max() + 1
    south_box = wind_farm.lat.min() - 1
    west_box = wind_farm.lon.min() - 1
    east_box = wind_farm.lon.max() + 1

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Information on wind farms
    n_target = len(wind_farm)
    lon_target = wind_farm.lon.values
    lat_target = wind_farm.lat.values
    id_target = wind_farm.index.values
    # Curve name per plant: "Offshore" for offshore plants, otherwise the
    # abbreviation looked up from the plant's zone id.
    state_target = [
        "Offshore"
        if wind_farm.loc[i].type == "wind_offshore"
        else id2abv[wind_farm.loc[i].zone_id]
        for i in id_target
    ]

    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")

    box = {
        "north": north_box,
        "south": south_box,
        "west": west_box,
        "east": east_box,
    }
    noaa = NoaaApi(box)
    url_count = len(noaa.get_path_list(start, end))

    missing = []
    target2grid = OrderedDict()

    # Pre-allocate one row per (file, plant) pair; filled block by block below
    size = url_count * n_target
    data = pd.DataFrame(
        {
            "plant_id": [0] * size,
            "ts": [np.nan] * size,
            "ts_id": [0] * size,
            "U": [0] * size,
            "V": [0] * size,
            "Pout": [0] * size,
        }
    )

    # Timestamp assigned to the current response; advanced by one hour per
    # iteration of the request loop.
    dt = start
    step = datetime.timedelta(hours=1)

    def calc_angular_dist(lon_grid, lat_grid):
        """Store, for each plant id, the index of the closest grid point."""
        n_grid = len(lon_grid)
        for plant, lon, lat in zip(id_target, lon_target, lat_target):
            uv_target = ll2uv(lon, lat)
            angle = [
                angular_distance(uv_target, ll2uv(lon_grid[k], lat_grid[k]))
                for k in range(n_grid)
            ]
            target2grid[plant] = np.argmin(angle)

    def handle_missing(response, data_tmp):
        """Record the url as missing and blank out the wind/power columns."""
        missing.append(response.url)
        # missing data are set to NaN.
        data_tmp["U"] = [np.nan] * n_target
        data_tmp["V"] = [np.nan] * n_target
        data_tmp["Pout"] = [np.nan] * n_target

    first = True
    request_iter = enumerate(noaa.get_hourly_data(start, end))
    for i, response in tqdm(request_iter, total=url_count):
        data_tmp = pd.DataFrame(
            {
                "plant_id": id_target,
                "ts": [dt] * n_target,
                "ts_id": [i + 1] * n_target,
            }
        )

        if response.status_code == 200:
            try:
                # see demo notebook to understand file structure
                # The dataset is closed as soon as the arrays are extracted so
                # that one handle is not left open per downloaded file.
                with Dataset("tmp.nc", "r", memory=response.content) as tmp:
                    lon_grid = tmp.variables["lon"][:].flatten()
                    lat_grid = tmp.variables["lat"][:].flatten()
                    u_wsp = tmp.variables[NoaaApi.var_u][0, 1, :, :].flatten()
                    v_wsp = tmp.variables[NoaaApi.var_v][0, 1, :, :].flatten()

                if first:
                    # The angular distance is calculated once. The target to
                    # grid correspondence is stored in a dictionary.
                    calc_angular_dist(lon_grid, lat_grid)
                    first = False

                data_tmp["U"] = [u_wsp[target2grid[plant]] for plant in id_target]
                data_tmp["V"] = [v_wsp[target2grid[plant]] for plant in id_target]
                wspd_target = np.sqrt(
                    pow(data_tmp["U"], 2) + pow(data_tmp["V"], 2)
                )
                power = [
                    get_power(tpc, spc, wspd_target[j], state)
                    for j, state in enumerate(state_target)
                ]
                data_tmp["Pout"] = power
            except Exception:
                # Best effort: a response that cannot be parsed is treated
                # like a missing file rather than aborting the whole download.
                print(f"Failed to parse response from url={response.url}")
                handle_missing(response, data_tmp)
        else:
            handle_missing(response, data_tmp)

        # Copy this hour's rows into the pre-allocated frame (column order of
        # data_tmp matches that of data).
        data.iloc[i * n_target : (i + 1) * n_target, :] = data_tmp.values
        dt += step

    # Format data frame
    data["plant_id"] = data["plant_id"].astype(np.int32)
    data["ts_id"] = data["ts_id"].astype(np.int32)
    data["U"] = data["U"].astype(np.float32)
    data["V"] = data["V"].astype(np.float32)
    data["Pout"] = data["Pout"].astype(np.float32)

    data.sort_values(by=["ts_id", "plant_id"], inplace=True)
    data.reset_index(inplace=True, drop=True)

    return data, missing
def calculate_pout(wind_farms, start_dt, end_dt, directory):
    """Calculate power output for wind farms based on hrrr data.

    Function assumes that user has already called
    :meth:`prereise.gather.winddata.hrrr.hrrr.retrieve_data` with the same
    start_dt, end_dt, and directory.

    :param pandas.DataFrame wind_farms: plant data frame.
    :param str start_dt: start date.
    :param str end_dt: end date (inclusive).
    :param str directory: directory where hrrr data is contained.
    :return: (*pandas.Dataframe*) -- Pandas containing power out per wind farm
        on a per hourly basis between start_dt and end_dt inclusive. Structure
        of dataframe is:

        ====  ==========  ==========
        ..    wind_farm1  wind_farm2
        ====  ==========  ==========
        dt1   POUT        POUT
        dt2   POUT        POUT
        ====  ==========  ==========
    """
    # Imported locally so this function does not rely on a module-level import
    import os

    # Curve name per wind farm: "Offshore" for offshore plants, otherwise the
    # abbreviation looked up from the plant's zone id.
    turbine_types = wind_farms.apply(
        lambda x: "Offshore"
        if x["type"] == "wind_offshore"
        else id2abv[x["zone_id"]],
        axis=1,
    )
    turbine_power_curves = get_turbine_power_curves()
    state_power_curves = get_state_power_curves()

    wind_data_lat_long = get_wind_data_lat_long(start_dt, directory)
    wind_farm_to_closest_wind_grid_indices = find_closest_wind_grids(
        wind_farms, wind_data_lat_long
    )
    dts = pd.date_range(start=start_dt, end=end_dt, freq="H").to_pydatetime()

    # Fetch wind speed data for each wind farm (or store NaN as applicable)
    wind_speed_data = pd.DataFrame(
        index=dts, columns=wind_farms.index, dtype=float
    )
    for dt in tqdm(dts):
        # Look the hourly file up inside ``directory`` instead of the current
        # working directory.
        gribs = pygrib.open(os.path.join(directory, formatted_filename(dt)))
        try:
            u_component = gribs.select(name=U_COMPONENT_SELECTOR)[0].values.flatten()
            v_component = gribs.select(name=V_COMPONENT_SELECTOR)[0].values.flatten()
            wind_farm_specific_u_component = u_component[
                wind_farm_to_closest_wind_grid_indices
            ]
            wind_farm_specific_v_component = v_component[
                wind_farm_to_closest_wind_grid_indices
            ]
            wind_speed_data.loc[dt] = np.sqrt(
                pow(wind_farm_specific_u_component, 2)
                + pow(wind_farm_specific_v_component, 2)
            )
        except ValueError:
            # If the GRIB file is empty, no wind speed values can be selected
            wind_speed_data.loc[dt] = np.nan
        finally:
            # Release the file handle before moving on to the next hour
            gribs.close()

    # For each column, linearly interpolate any NaN values
    linear(wind_speed_data)

    # Then calculate wind power based on wind speed
    wind_power_data = [
        [
            get_power(
                turbine_power_curves,
                state_power_curves,
                wind_speed_data.loc[dt, w],
                turbine_types.loc[w],
            )
            for w in wind_farms.index
        ]
        for dt in tqdm(dts)
    ]
    df = pd.DataFrame(data=wind_power_data, index=dts, columns=wind_farms.index)
    return df