def get_previous_dataset(self, dataset: xr.Dataset) -> xr.Dataset:
    """Utility method to retrieve the previous set of data for the same datastream
    as the provided dataset from the DatastreamStorage.

    :param dataset: The reference dataset that will be used to search the
        DatastreamStorage for prior data.
    :type dataset: xr.Dataset
    :return: The previous dataset from the DatastreamStorage if it exists,
        otherwise None.
    :rtype: xr.Dataset
    """
    prev_dataset = None
    start_date, start_time = DSUtil.get_start_time(dataset)
    datastream_name = DSUtil.get_datastream_name(dataset, self.config)

    with self.storage.fetch_previous_file(
        datastream_name, f"{start_date}.{start_time}"
    ) as netcdf_file:
        if netcdf_file:
            prev_dataset = self.storage.handlers.read(
                file=netcdf_file, name=netcdf_file
            )

    return prev_dataset
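# Hedged usage sketch (not part of the library): one way get_previous_dataset might be
# called from a pipeline hook so that later QC checks can compare across file
# boundaries. The hook name and the gap calculation below are illustrative assumptions.
def hook_customize_dataset(self, dataset: xr.Dataset, raw_mapping: dict) -> xr.Dataset:
    previous = self.get_previous_dataset(dataset)
    if previous is not None:
        # e.g., inspect the time gap between the previous file and this one
        gap = dataset["time"].data[0] - previous["time"].data[-1]
        print(f"Time gap since previous file: {gap}")
    return dataset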
def test_corrections_are_recorded(dataset):
    DSUtil.record_corrections_applied(
        ds=dataset,
        variable="uninitialized_var",
        correction="Variable was initialized to _FillValue",
    )
    assert ATTS.CORRECTIONS_APPLIED in dataset["uninitialized_var"].attrs
def _replace_invalid_values(self, fill_value, variable_name: str):
    valid_min = DSUtil.get_valid_min(self.ds, variable_name)
    valid_max = DSUtil.get_valid_max(self.ds, variable_name)

    if valid_min is not None and valid_max is not None:
        values = self.ds[variable_name].values
        keep_array = np.logical_not((values < valid_min) | (values > valid_max))
        replaced_values = np.where(keep_array, values, fill_value)
        self.ds[variable_name].data = replaced_values
def save_local_path(self, local_path: str, new_filename: str = None) -> Any:
    # TODO: we should perform a REGEX check to make sure that the filename is valid
    filename = os.path.basename(local_path) if not new_filename else new_filename
    datastream_name = DSUtil.get_datastream_name_from_filename(filename)

    dest_dir = DSUtil.get_datastream_directory(datastream_name=datastream_name, root=self._root)
    os.makedirs(dest_dir, exist_ok=True)  # make sure the dest folder exists

    dest_path = os.path.join(dest_dir, filename)
    shutil.copy(local_path, dest_path)
    return dest_path
def save_local_path(self, local_path: str, new_filename: str = None):
    # TODO: we should perform a REGEX check to make sure that the filename is valid
    filename = os.path.basename(local_path) if not new_filename else new_filename
    datastream_name = DSUtil.get_datastream_name_from_filename(filename)

    subpath = DSUtil.get_datastream_directory(datastream_name=datastream_name)
    s3_path = self.root.join(subpath, filename)

    self.tmp.upload(local_path, s3_path)
    return s3_path
def __init__(
    self,
    ds: xr.Dataset,
    config: Config,
    definition: QualityManagerDefinition,
    previous_data: xr.Dataset,
):
    # Get the variables this quality manager applies to
    variable_names = definition.variables

    # Convert the list to upper case in case the user made a typo in the yaml
    variable_names_upper = [x.upper() for x in variable_names]

    # Add variables where a keyword was used
    if VARS.COORDS in variable_names_upper:
        variable_names.remove(VARS.COORDS)
        variable_names.extend(DSUtil.get_coordinate_variable_names(ds))

    if VARS.DATA_VARS in variable_names_upper:
        variable_names.remove(VARS.DATA_VARS)
        variable_names.extend(DSUtil.get_non_qc_variable_names(ds))

    if VARS.ALL in variable_names_upper:
        variable_names.remove(VARS.ALL)
        variable_names.extend(DSUtil.get_coordinate_variable_names(ds))
        variable_names.extend(DSUtil.get_non_qc_variable_names(ds))

    # Remove any duplicates while preserving insertion order
    variable_names = list(dict.fromkeys(variable_names))

    # Exclude any excludes
    excludes = definition.exclude
    for exclude in excludes:
        variable_names.remove(exclude)

    # Get the quality checker
    quality_checker = instantiate_handler(
        ds, previous_data, definition, handler_desc=definition.checker
    )

    # Get the quality handlers
    handlers = definition.handlers

    self.ds = ds
    self.config = config
    self.variable_names = variable_names
    self.checker = quality_checker
    self.handlers = handlers
    self.definition: QualityManagerDefinition = definition
    self.previous_data = previous_data
def record_correction(self, variable_name: str):
    """If a correction was made to variable data to fix invalid values as detected by
    a quality check, this method will record the fix to the appropriate variable
    attribute. The correction description will come from the handler params which get
    set in the pipeline config file.

    :param variable_name: Name of the variable that was corrected.
    :type variable_name: str
    """
    correction = self.params.get("correction", None)
    if correction is not None:
        DSUtil.record_corrections_applied(self.ds, variable_name, correction)
def create_and_persist_plots(self, dataset: xr.Dataset):
    ds = dataset
    filename = DSUtil.get_plot_filename(dataset, "Three Phase Voltage", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Get the date of the first timestamp for the figure title
        date = pd.to_datetime(ds.time.data[0]).strftime('%d-%b-%Y')

        # Create figure and axes objects
        fig, ax = plt.subplots(figsize=(16, 8), constrained_layout=True)
        fig.suptitle(f"Three Phase Voltage from {ds.attrs['title']} on {date}")

        ds.MODAQ_Va[:100].plot(ax=ax)
        ds.MODAQ_Vb[:100].plot(ax=ax)
        ds.MODAQ_Vc[:100].plot(ax=ax)

        # Save the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    return
def run(self, variable_name: str) -> Optional[np.ndarray]:

    # If this is a time variable, we check for 'NaT'
    if self.ds[variable_name].data.dtype.type == np.datetime64:
        results_array = np.isnat(self.ds[variable_name].data)

    else:
        fill_value = DSUtil.get_fill_value(self.ds, variable_name)

        # If the variable has no _FillValue attribute, then
        # we select a default value to use
        if fill_value is None:
            fill_value = -9999

        # Make sure fill value has same data type as the variable
        fill_value = np.array(fill_value, dtype=self.ds[variable_name].data.dtype.type)

        # First check if any values are assigned to _FillValue
        results_array = np.equal(self.ds[variable_name].data, fill_value)

        # Then, if the value is numeric, we should also check if any values are assigned to NaN
        if self.ds[variable_name].data.dtype.type in (
            type(0.0),
            np.float16,
            np.float32,
            np.float64,
        ):
            results_array |= np.isnan(self.ds[variable_name].data)

    return results_array
def read_and_persist_raw_files(
    self, filepaths: Union[str, List[str]]
) -> Dict[str, xr.Dataset]:
    """------------------------------------------------------------------------------------
    Renames the provided raw files according to our naming conventions and returns a
    mapping of the renamed filepaths to raw `xr.Dataset` objects.

    Args:
        filepaths (Union[str, List[str]]): The path(s) to the raw file(s).

    Returns:
        Dict[str, xr.Dataset]: The mapping of raw filepaths to raw xr.Dataset objects.

    ------------------------------------------------------------------------------------"""
    raw_mapping: Dict[str, xr.Dataset] = dict()

    if isinstance(filepaths, str):
        filepaths = [filepaths]

    for filepath in filepaths:
        extracted = self.storage.handlers.read(file=filepath, name=filepath)
        if not extracted:
            warnings.warn(f"Couldn't use extracted raw file: {filepath}")
            continue

        new_filename = DSUtil.get_raw_filename(extracted, filepath, self.config)
        self.storage.save(filepath, new_filename=new_filename)

        if isinstance(extracted, xr.Dataset):
            extracted = {new_filename: extracted}
        raw_mapping.update(extracted)

    return raw_mapping
def fetch_previous_file(self, datastream_name: str, start_time: str) -> DisposableLocalTempFile:
    # fetch files one day previous and one day after start date (since find is exclusive)
    date = datetime.datetime.strptime(start_time, "%Y%m%d.%H%M%S")
    prev_date = (date - datetime.timedelta(days=1)).strftime("%Y%m%d.%H%M%S")
    next_date = (date + datetime.timedelta(days=1)).strftime("%Y%m%d.%H%M%S")
    files = self.find(
        datastream_name,
        prev_date,
        next_date,
        filetype=DatastreamStorage.default_file_type,
    )

    dates = [DSUtil.get_date_from_filename(_file) for _file in files]

    previous_filepath = None
    if dates:
        i = bisect.bisect_left(dates, start_time)
        if i > 0:
            previous_filepath = files[i - 1]

    if previous_filepath is None:
        return DisposableLocalTempFile(previous_filepath)

    return self._tmp.fetch(previous_filepath)
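# Hedged usage sketch (assumes `storage` is a configured DatastreamStorage instance and
# that the datastream name below is illustrative): fetch_previous_file always wraps its
# result in a DisposableLocalTempFile, so it can be used as a context manager whether or
# not an earlier file exists; the yielded value is the local path, or None if nothing
# was found.
with storage.fetch_previous_file("abc.buoy.a1", "20211002.000000") as prev_path:
    if prev_path:
        print(f"Previous file fetched to {prev_path}")
    else:
        print("No previous file found for this datastream")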
def run(self, variable_name: str) -> Optional[np.ndarray]:

    # If this is a time variable, we check for 'NaT'
    if self.ds[variable_name].values.dtype.type == np.datetime64:
        results_array = np.isnat(self.ds[variable_name].values)

    else:
        fill_value = DSUtil.get_fill_value(self.ds, variable_name)

        # If the variable has no _FillValue attribute, then
        # we select a default value to use
        if fill_value is None:
            fill_value = -9999

        # Make sure fill value has same data type as the variable
        fill_value = np.array(fill_value, dtype=self.ds[variable_name].values.dtype.type)

        # First replace any values that are outside valid_range with fill_value so
        # they will get flagged as missing
        self._replace_invalid_values(fill_value, variable_name)

        # Then check if any values are assigned to _FillValue
        results_array = np.equal(self.ds[variable_name].values, fill_value)

        # Finally, if the value is numeric, we should also check if any values are
        # assigned to NaN
        if self.ds[variable_name].values.dtype.type in (
            type(0.0),
            np.float16,
            np.float32,
            np.float64,
        ):
            results_array |= np.isnan(self.ds[variable_name].values)

    return results_array
def write(self, ds: xr.Dataset, filename: str, config: Config = None, **kwargs) -> None:
    """Saves the given dataset to a csv file.

    :param ds: The dataset to save.
    :type ds: xr.Dataset
    :param filename: The path where the file should be written.
    :type filename: str
    :param config: Optional Config object, defaults to None
    :type config: Config, optional
    """
    if len(ds.dims) > 1:
        raise TypeError(
            "Dataset has more than one dimension, so it can't be saved to csv. "
            "Try netcdf instead."
        )

    write_params = self.parameters.get('write', {})
    to_dataframe_kwargs = write_params.get('to_dataframe', {})
    to_csv_kwargs = dict(index=False)
    to_csv_kwargs.update(write_params.get('to_csv', {}))

    df = ds.to_dataframe(**to_dataframe_kwargs)
    df.to_csv(filename, **to_csv_kwargs)

    yaml_filename = f"{filename}.yaml"
    with open(yaml_filename, 'w') as file:
        metadata = DSUtil.get_metadata(ds)
        yaml.dump(metadata, file)
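# Hedged usage sketch: writing a one-dimensional dataset with this handler produces a
# .csv file plus a .csv.yaml sidecar containing the dataset metadata. The CsvHandler
# import path, constructor, and `parameters` argument below are assumptions used only
# for illustration.
import xarray as xr
from tsdat.io.filehandlers import CsvHandler  # assumed import path

handler = CsvHandler(parameters={"write": {"to_csv": {"sep": ","}}})
ds = xr.Dataset({"temperature": ("time", [21.3, 21.4])}, coords={"time": [0, 1]})
handler.write(ds, "example.a1.20211001.000000.csv")
# Produces example.a1.20211001.000000.csv and example.a1.20211001.000000.csv.yaml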
def find(self, datastream_name: str, start_time: str, end_time: str, filetype: str = None) -> List[str]:
    # TODO: think about refactoring so that you don't need both start and end time
    # TODO: if times don't include hours/min/sec, then add .000000 to the string
    dir_to_check = DSUtil.get_datastream_directory(datastream_name=datastream_name, root=self._root)
    storage_paths = []

    if os.path.isdir(dir_to_check):
        for file in os.listdir(dir_to_check):
            if start_time <= DSUtil.get_date_from_filename(file) < end_time:
                storage_paths.append(os.path.join(dir_to_check, file))

        if filetype is not None:
            filter_func = DatastreamStorage.file_filters[filetype]
            storage_paths = list(filter(filter_func, storage_paths))

    return sorted(storage_paths)
def run(self, variable_name: str, results_array: np.ndarray):
    if results_array.any():
        fill_value = DSUtil.get_fill_value(self.ds, variable_name)

        keep_array = np.logical_not(results_array)
        var_values = self.ds[variable_name].data
        replaced_values = np.where(keep_array, var_values, fill_value)
        self.ds[variable_name].data = replaced_values

        self.record_correction(variable_name)
def run(self, variable_name: str) -> Optional[np.ndarray]:
    valid_delta = self.ds[variable_name].attrs.get(ATTS.VALID_DELTA, None)

    # If no valid_delta is available, then we just skip this definition
    results_array = None
    if valid_delta is not None:
        # We need to get the dim to diff on from the parameters.
        # If dim is not specified, then we use the first dim for the variable.
        dim = self.params.get('dim', None)

        if dim is None and len(self.ds[variable_name].dims) > 0:
            dim = self.ds[variable_name].dims[0]

        if dim is not None:
            # If previous data exists, then we must add the last row of
            # previous data as the first row of the variable's data array.
            # This is so that the diff function can compare the first value
            # of the file to make sure it is consistent with the previous file.

            # convert to np array
            variable_data = self.ds[variable_name].data
            axis = self.ds[variable_name].get_axis_num(dim)
            previous_row = None

            # Load the previous row from the other dataset
            if self.previous_data is not None:
                previous_variable_data = self.previous_data.get(variable_name, None)
                if previous_variable_data is not None:
                    # convert to np array
                    previous_variable_data = previous_variable_data.data

                    # Get the last value from the first axis
                    previous_row = previous_variable_data[-1]

                    # Insert that value as the first value of the first axis
                    variable_data = np.insert(variable_data, 0, previous_row, axis=axis)

            # If the variable is a time variable, then we convert to nanoseconds before doing our check
            if self.ds[variable_name].values.dtype.type == np.datetime64:
                variable_data = DSUtil.datetime64_to_timestamp(variable_data)

            # Compute the difference between each two numbers and check if it exceeds valid_delta
            diff = np.absolute(np.diff(variable_data, axis=axis))
            results_array = np.greater(diff, valid_delta)

            if previous_row is None:
                # This means our results array is missing one value for the first row,
                # which is not included in the diff computation. We need to add False
                # for the first row of results, since it won't fail the check.
                first_row = np.zeros(results_array[0].size, dtype=bool)
                results_array = np.insert(results_array, 0, first_row, axis=axis)

    return results_array
def run(self, variable_name: str) -> Optional[np.ndarray]:
    results_array = None

    # We need to get the dim to diff on from the parameters.
    # If dim is not specified, then we use the first dim for the variable.
    dim = self.params.get("dim", None)

    if dim is None and len(self.ds[variable_name].dims) > 0:
        dim = self.ds[variable_name].dims[0]

    if dim is not None:
        # If previous data exists, then we must add the last row of
        # previous data as the first row of the variable's data array.
        # This is so that the diff function can compare the first value
        # of the file to make sure it is consistent with the previous file.

        # convert to np array
        variable_data = self.ds[variable_name].data
        axis = self.ds[variable_name].get_axis_num(dim)
        previous_row = None

        # Load the previous row from the other dataset
        if self.previous_data is not None and dim == "time":
            previous_variable_data = self.previous_data.get(variable_name, None)
            if previous_variable_data is not None:
                # convert to np array
                previous_variable_data = previous_variable_data.data

                # Get the last value from the first axis
                previous_row = previous_variable_data[-1]

                # Insert that value as the first value of the first axis
                variable_data = np.insert(variable_data, 0, previous_row, axis=axis)

        # If the variable is a time variable, then we convert to nanoseconds before doing our check
        if self.ds[variable_name].data.dtype.type == np.datetime64:
            variable_data = DSUtil.datetime64_to_timestamp(variable_data)

        # Compute the difference between each two numbers and check if they are either all
        # increasing or all decreasing
        diff = np.diff(variable_data, axis=axis)
        is_monotonic = np.all(diff > 0) | np.all(diff < 0)  # this returns a scalar

        # Create a results array, with all values set to the results of the is_monotonic check
        results_array = np.full(variable_data.shape, not is_monotonic, dtype=bool)

    return results_array
def find(self, datastream_name: str, start_time: str, end_time: str, filetype: str = None) -> List[S3Path]:
    # TODO: think about refactoring so that you don't need both start and end time
    # TODO: if times don't include hours/min/sec, then add .000000 to the string
    subpath = DSUtil.get_datastream_directory(datastream_name=datastream_name)
    dir_to_check = self.root.join(subpath)
    storage_paths = []

    for file in self.tmp.listdir(dir_to_check):
        if start_time <= DSUtil.get_date_from_filename(file.bucket_path) < end_time:
            storage_paths.append(file)

    if filetype is not None:
        filter_func = DatastreamStorage.file_filters[filetype]
        storage_paths = list(filter(filter_func, storage_paths))

    return sorted(storage_paths)
def test_plotting_utilities(dataset):
    expected_filename = "test.SortedDataset.a1.20211001.000000.height.png"
    filename = DSUtil.get_plot_filename(dataset, "height", "png")
    filepath = os.path.join(STORAGE_PATH, "test.SortedDataset.a1", filename)
    assert filename == expected_filename
    assert DSUtil.get_date_from_filename(filepath) == "20211001.000000"
    DSUtil.plot_qc(dataset, "height_out", filepath)
    assert DSUtil.is_image(filepath)
    assert not DSUtil.is_image(PROCESSED_NC)
def save(self, dataset_or_path: Union[str, xr.Dataset], new_filename: str = None) -> List[Any]:
    """Saves a local file to the datastream store.

    :param dataset_or_path: The dataset or local path to the file to save. The file
        should be named according to ME Data Standards naming conventions so that this
        method can automatically parse the datastream, date, and time from the file
        name.
    :type dataset_or_path: Union[str, xr.Dataset]
    :param new_filename: If provided, the new filename to save as. This parameter
        should ONLY be provided if using a local path for dataset_or_path. Must also
        follow ME Data Standards naming conventions. Defaults to None.
    :type new_filename: str, optional
    :return: A list of paths where the saved files were stored in storage. Path type
        is dependent upon the specific storage subclass.
    :rtype: List[Any]
    """
    saved_paths = []

    if isinstance(dataset_or_path, xr.Dataset):
        dataset = dataset_or_path

        # Save file for every registered output file type
        for file_extension in DatastreamStorage.output_file_extensions.values():
            dataset_filename = DSUtil.get_dataset_filename(dataset, file_extension=file_extension)
            with self.tmp.get_temp_filepath(dataset_filename) as tmp_path:
                FileHandler.write(dataset, tmp_path)
                saved_paths.append(self.save_local_path(tmp_path, new_filename))

    else:
        local_path = dataset_or_path
        saved_paths.append(self.save_local_path(local_path, new_filename))

    return saved_paths
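# Hedged usage sketch (assumes `storage` is an already-configured DatastreamStorage
# subclass instance and `dataset` is an xr.Dataset whose attributes follow ME Data
# Standards): saving a dataset writes one file per registered output file extension,
# while saving a local path copies a single file into the store. The filenames below
# are illustrative only.
saved = storage.save(dataset)
print(saved)  # one stored path per registered output file extension

raw_saved = storage.save("/tmp/raw_data.csv", new_filename="abc.buoy.00.20211001.000000.raw.csv")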
def read_and_persist_raw_files(self, file_paths: List[str]) -> Dict[str, xr.Dataset]:
    """Renames the provided raw files according to ME Data Standards file naming
    conventions for raw data files, and returns a mapping of the renamed file names to
    the raw datasets that were read from them.

    :param file_paths: A list of paths to the original raw files.
    :type file_paths: List[str]
    :return: The mapping of renamed file names to raw xr.Dataset objects.
    :rtype: Dict[str, xr.Dataset]
    """
    raw_dataset_mapping = {}

    if isinstance(file_paths, str):
        file_paths = [file_paths]

    for file_path in file_paths:
        # read the raw file into a dataset
        with self.storage.tmp.fetch(file_path) as tmp_path:
            dataset = FileHandler.read(tmp_path)

            # Don't use dataset if no FileHandler is registered for it
            if dataset is not None:
                # create the standardized name for raw file
                new_filename = DSUtil.get_raw_filename(dataset, tmp_path, self.config)

                # add the raw dataset to our dictionary
                raw_dataset_mapping[new_filename] = dataset

                # save the raw data to storage
                self.storage.save(tmp_path, new_filename)
            else:
                warnings.warn(f"Couldn't use extracted raw file: {tmp_path}")

    return raw_dataset_mapping
def add_colorbar(ax, plot, label):
    cb = plt.colorbar(plot, ax=ax, pad=0.01)
    cb.ax.set_ylabel(label, fontsize=12)
    cb.outline.set_linewidth(1)
    cb.ax.tick_params(size=0)
    cb.ax.minorticks_off()
    return cb

# Useful variables
ds = dataset
date = pd.to_datetime(ds.time.data[0]).strftime('%d-%b-%Y')
cmap = sns.color_palette("viridis", as_cmap=True)
colors = [cmap(0.00), cmap(0.60)]

# Create the first plot -- Surface Met Parameters
filename = DSUtil.get_plot_filename(dataset, "surface_met_parameters", "png")
with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

    # Define data and metadata
    data = [[ds.wind_speed, ds.wind_direction],
            [ds.pressure, ds.rh],
            [ds.air_temperature, ds.CTD_SST]]
    var_labels = [[r"$\overline{\mathrm{U}}$ Cup",
                   r"$\overline{\mathrm{\theta}}$ Cup"],
                  ["Pressure", "Relative Humidity"],
                  ["Air Temperature", "Sea Surface Temperature"]]
    ax_labels = [[r"$\overline{\mathrm{U}}$ (ms$^{-1}$)",
                  r"$\bar{\mathrm{\theta}}$ (degrees)"], [
def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
    """-------------------------------------------------------------------
    Hook to allow users to create plots from the xarray dataset after
    processing and QC have been applied and just before the dataset is
    saved to disk.

    To save on filesystem space (which is limited when running on the
    cloud via a lambda function), this method should only write one plot
    to local storage at a time. An example of how this could be done is
    below:

    ```
    filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(dataset["time"].data, dataset["sea_level"].data)
        fig.savefig(tmp_path)
        self.storage.save(tmp_path)
        plt.close()

    filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        DSUtil.plot_qc(dataset, "sea_level", tmp_path)
        self.storage.save(tmp_path)
    ```

    Args:
        dataset (xr.Dataset): The xarray dataset with customizations and
            QC applied.
    -------------------------------------------------------------------"""

    def format_time_xticks(ax, start=4, stop=21, step=4, date_format="%H-%M"):
        ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=range(start, stop, step)))
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha='center')

    # Useful variables
    ds = dataset
    date = pd.to_datetime(ds.time.data[0]).strftime('%d-%b-%Y')

    # Create wave statistics plot
    filename = DSUtil.get_plot_filename(dataset, "wave_statistics", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Create figure and axes objects
        fig, axs = plt.subplots(nrows=3, figsize=(14, 8), constrained_layout=True)
        fig.suptitle(f"Wave Statistics at {ds.attrs['location_meaning']} on {date}")

        # Plot wave heights
        cmap = cmocean.cm.amp_r
        ds.average_wave_height.plot(ax=axs[0], c=cmap(0.10), linewidth=2, label=r"H$_{avg}$")
        ds.significant_wave_height.plot(ax=axs[0], c=cmap(0.5), linewidth=2, label=r"H$_{sig}$")
        ds.max_wave_height.plot(ax=axs[0], c=cmap(0.85), linewidth=2, label=r"H$_{max}$")
        axs[0].set_ylabel("Wave Height (m)")
        axs[0].legend(bbox_to_anchor=(1, -0.10), ncol=3)

        # Plot wave periods
        cmap = cmocean.cm.dense
        ds.average_wave_period.plot(ax=axs[1], c=cmap(0.15), linewidth=2, label=r"T$_{avg}$")
        ds.significant_wave_period.plot(ax=axs[1], c=cmap(0.5), linewidth=2, label=r"T$_{sig}$")
        ds.mean_wave_period.plot(ax=axs[1], c=cmap(0.8), linewidth=2, label=r"$\overline{T}_{mean}$")
        axs[1].set_ylabel("Wave Period (s)")
        axs[1].legend(bbox_to_anchor=(1, -0.10), ncol=3)

        # Plot mean direction
        cmap = cmocean.cm.haline
        ds.mean_wave_direction.plot(ax=axs[2], c=cmap(0.4), linewidth=2, label=r"$\overline{\phi}_{mean}$")
        axs[2].set_ylabel(r"Wave $\overline{\phi}$ (deg)")
        axs[2].legend(bbox_to_anchor=(1, -0.10))

        # Set xlabels and ticks
        for i in range(3):
            axs[i].set_xlabel("Time (UTC)")
            format_time_xticks(axs[i])

        # Save figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    return
def _is_image(x):
    return bool(DSUtil.is_image(str(x)))
def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
    """-------------------------------------------------------------------
    Hook to allow users to create plots from the xarray dataset after
    processing and QC have been applied and just before the dataset is
    saved to disk.

    To save on filesystem space (which is limited when running on the
    cloud via a lambda function), this method should only write one plot
    to local storage at a time. An example of how this could be done is
    below:

    ```
    filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(dataset["time"].data, dataset["sea_level"].data)
        fig.savefig(tmp_path)
        self.storage.save(tmp_path)
        plt.close()

    filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        DSUtil.plot_qc(dataset, "sea_level", tmp_path)
        self.storage.save(tmp_path)
    ```

    Args:
        dataset (xr.Dataset): The xarray dataset with customizations and
            QC applied.
    -------------------------------------------------------------------"""

    def format_time_xticks(ax, start=4, stop=21, step=4, date_format="%H-%M"):
        ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=range(start, stop, step)))
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha="center")

    def double_plot(ax, twin, data, colors, var_labels, ax_labels, **kwargs):
        def _add_lineplot(_ax, _data, _c, _label, _ax_label, _spine):
            _data.plot(ax=_ax, c=_c, label=_label, linewidth=2, **kwargs)
            _ax.tick_params(axis="y", which="both", colors=_c)
            _ax.set_ylabel(_ax_label, color=_c)
            _ax.spines[_spine].set_color(_c)

        _add_lineplot(ax, data[0], colors[0], var_labels[0], ax_labels[0], "left")
        _add_lineplot(twin, data[1], colors[1], var_labels[1], ax_labels[1], "right")
        twin.spines["left"].set_color(colors[0])  # twin overwrites ax, so set color here

    def add_colorbar(ax, plot, label):
        cb = plt.colorbar(plot, ax=ax, pad=0.01)
        cb.ax.set_ylabel(label, fontsize=12)
        cb.outline.set_linewidth(1)
        cb.ax.tick_params(size=0)
        cb.ax.minorticks_off()
        return cb

    # Useful variables
    ds = dataset
    date = pd.to_datetime(ds.time.data[0]).strftime("%d-%b-%Y")
    cmap = sns.color_palette("viridis", as_cmap=True)
    colors = [cmap(0.00), cmap(0.60)]

    # Create the first plot -- Surface Met Parameters
    filename = DSUtil.get_plot_filename(dataset, "surface_met_parameters", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Define data and metadata
        data = [
            [ds.wind_speed, ds.wind_direction],
            [ds.pressure, ds.rh],
            [ds.air_temperature, ds.CTD_SST],
        ]
        var_labels = [
            [r"$\overline{\mathrm{U}}$ Cup", r"$\overline{\mathrm{\theta}}$ Cup"],
            ["Pressure", "Relative Humidity"],
            ["Air Temperature", "Sea Surface Temperature"],
        ]
        ax_labels = [
            [r"$\overline{\mathrm{U}}$ (ms$^{-1}$)", r"$\bar{\mathrm{\theta}}$ (degrees)"],
            [r"$\overline{\mathrm{P}}$ (bar)", r"$\overline{\mathrm{RH}}$ (%)"],
            [r"$\overline{\mathrm{T}}_{air}$ ($\degree$C)", r"$\overline{\mathrm{SST}}$ ($\degree$C)"],
        ]

        # Create figure and axes objects
        fig, axs = plt.subplots(nrows=3, figsize=(14, 8), constrained_layout=True)
        twins = [ax.twinx() for ax in axs]
        fig.suptitle(f"Surface Met Parameters at {ds.attrs['location_meaning']} on {date}")

        # Create the plots
        gill_data = [ds.gill_wind_speed, ds.gill_wind_direction]
        gill_labels = [
            r"$\overline{\mathrm{U}}$ Gill",
            r"$\overline{\mathrm{\theta}}$ Gill",
        ]
        double_plot(
            axs[0],
            twins[0],
            data=gill_data,
            colors=colors,
            var_labels=gill_labels,
            linestyle="--",
            ax_labels=["", ""],
        )
        for i in range(3):
            double_plot(
                axs[i],
                twins[i],
                data=data[i],
                colors=colors,
                var_labels=var_labels[i],
                ax_labels=ax_labels[i],
            )
            axs[i].grid(which="both", color="lightgray", linewidth=0.5)
            lines = axs[i].lines + twins[i].lines
            labels = [line.get_label() for line in lines]
            axs[i].legend(lines, labels, ncol=len(labels), bbox_to_anchor=(1, -0.15))
            format_time_xticks(axs[i])
            axs[i].set_xlabel("Time (UTC)")
        twins[0].set_ylim(0, 360)

        # Save and close the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    # Create the second plot -- Conductivity and Sea Surface Temperature
    filename = DSUtil.get_plot_filename(dataset, "conductivity", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Define data and metadata
        data = [ds.conductivity, ds.CTD_SST]
        var_labels = [
            r"Conductivity (S m$^{-1}$)",
            r"$\overline{\mathrm{SST}}$ ($\degree$C)",
        ]
        ax_labels = [
            r"Conductivity (S m$^{-1}$)",
            r"$\overline{\mathrm{SST}}$ ($\degree$C)",
        ]

        # Create the figure and axes objects
        fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)
        fig.suptitle(f"Conductivity and Sea Surface Temperature at {ds.attrs['location_meaning']} on {date}")
        twin = ax.twinx()

        # Make the plot
        double_plot(
            ax,
            twin,
            data=data,
            colors=colors,
            var_labels=var_labels,
            ax_labels=ax_labels,
        )

        # Set the labels and ticks
        ax.grid(which="both", color="lightgray", linewidth=0.5)
        lines = ax.lines + twin.lines
        labels = [line.get_label() for line in lines]
        ax.legend(lines, labels, ncol=len(labels), bbox_to_anchor=(1, -0.03))
        format_time_xticks(ax)
        ax.set_xlabel("Time (UTC)")

        # Save and close the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    # Create the third plot - current speed and direction
    filename = DSUtil.get_plot_filename(dataset, "current_velocity", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Reduce dimensionality of dataset for plotting
        ds_1H: xr.Dataset = ds.reindex({"depth": ds.depth.data[::2]})
        ds_1H: xr.Dataset = ds_1H.resample(time="1H").nearest()

        # Calculations for contour plots
        levels = 30

        # Calculations for quiver plot
        qv_slice = slice(1, None)  # Skip first to prevent weird overlap with axes borders
        qv_degrees = ds_1H.current_direction.data[qv_slice, qv_slice].transpose()
        qv_theta = (qv_degrees + 90) * (np.pi / 180)
        X, Y = ds_1H.time.data[qv_slice], ds_1H.depth.data[qv_slice]
        U, V = np.cos(-qv_theta), np.sin(-qv_theta)

        # Create figure and axes objects
        fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)
        fig.suptitle(f"Current Speed and Direction at {ds.attrs['location_meaning']} on {date}")

        # Make the plots
        csf = ds.current_speed.plot.contourf(
            ax=ax,
            x="time",
            yincrease=False,
            levels=levels,
            cmap=cmocean.cm.deep_r,
            add_colorbar=False,
        )
        # ds.current_speed.plot.contour(ax=ax, x="time", yincrease=False, levels=levels, colors="lightgray", linewidths=0.5)
        ax.quiver(
            X,
            Y,
            U,
            V,
            width=0.002,
            scale=60,
            color="white",
            pivot="middle",
            zorder=10,
        )
        add_colorbar(ax, csf, r"Current Speed (mm s$^{-1}$)")

        # Set the labels and ticks
        format_time_xticks(ax)
        ax.set_xlabel("Time (UTC)")
        ax.set_ylabel("Depth (m)")

        # Save the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    return
def hook_generate_and_persist_plots(self, dataset: xr.Dataset):

    def format_time_xticks(ax, start=4, stop=21, step=4, date_format="%H-%M"):
        ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=range(start, stop, step)))
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha="center")

    def add_colorbar(ax, plot, label):
        cb = plt.colorbar(plot, ax=ax, pad=0.01)
        cb.ax.set_ylabel(label, fontsize=12)
        cb.outline.set_linewidth(1)
        cb.ax.tick_params(size=0)
        cb.ax.minorticks_off()
        return cb

    ds = dataset
    date = pd.to_datetime(ds.time.data[0]).strftime("%d-%b-%Y")

    # Colormaps to use
    wind_cmap = cmocean.cm.deep_r
    avail_cmap = cmocean.cm.amp_r

    # Create the first plot - Lidar Wind Speeds at several elevations
    filename = DSUtil.get_plot_filename(dataset, "wind_speeds", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Create the figure and axes objects
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 8), constrained_layout=True)
        fig.suptitle(f"Wind Speed Time Series at {ds.attrs['location_meaning']} on {date}")

        # Select heights to plot
        heights = [40, 90, 140, 200]

        # Plot the data
        for i, height in enumerate(heights):
            velocity = ds.wind_speed.sel(height=height)
            velocity.plot(
                ax=ax,
                linewidth=2,
                c=wind_cmap(i / len(heights)),
                label=f"{height} m",
            )

        # Set the labels and ticks
        format_time_xticks(ax)
        ax.legend(facecolor="white", ncol=len(heights), bbox_to_anchor=(1, -0.05))
        ax.set_title("")  # Remove bogus title created by xarray
        ax.set_xlabel("Time (UTC)")
        ax.set_ylabel(r"Wind Speed (ms$^{-1}$)")

        # Save the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    filename = DSUtil.get_plot_filename(dataset, "wind_speed_and_direction", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

        # Reduce dimensionality of dataset for plotting
        ds_1H: xr.Dataset = ds.resample(time="1H").nearest()

        # Calculations for contour plots
        levels = 30

        # Calculations for quiver plot
        qv_slice = slice(1, None)  # Skip first to prevent weird overlap with axes borders
        qv_degrees = ds_1H.wind_direction.data[qv_slice].transpose()
        qv_theta = (qv_degrees + 90) * (np.pi / 180)
        X, Y = ds_1H.time.data[qv_slice], ds_1H.height.data
        U, V = np.cos(-qv_theta), np.sin(-qv_theta)

        # Create figure and axes objects
        fig, axs = plt.subplots(nrows=2, figsize=(14, 8), constrained_layout=True)
        fig.suptitle(f"Wind Speed and Direction at {ds.attrs['location_meaning']} on {date}")

        # Make top subplot -- contours and quiver plots for wind speed and direction
        csf = ds.wind_speed.plot.contourf(
            ax=axs[0],
            x="time",
            levels=levels,
            cmap=wind_cmap,
            add_colorbar=False,
        )
        # ds.wind_speed.plot.contour(ax=axs[0], x="time", levels=levels, colors="lightgray", linewidths=0.5)
        axs[0].quiver(
            X,
            Y,
            U,
            V,
            width=0.002,
            scale=60,
            color="white",
            pivot="middle",
            zorder=10,
        )
        add_colorbar(axs[0], csf, r"Wind Speed (ms$^{-1}$)")

        # Make bottom subplot -- heatmap for data availability
        da = ds.data_availability.plot(
            ax=axs[1],
            x="time",
            cmap=avail_cmap,
            add_colorbar=False,
            vmin=0,
            vmax=100,
        )
        add_colorbar(axs[1], da, "Availability (%)")

        # Set the labels and ticks
        for i in range(2):
            format_time_xticks(axs[i])
            axs[i].set_xlabel("Time (UTC)")
            axs[i].set_ylabel("Height ASL (m)")

        # Save the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    return
def test_end_time_is_correct(raw_dataset, dataset, config):
    expected = ("20211001", "000002")
    time_definition = config.dataset_definition.get_variable("time")
    assert DSUtil.get_raw_end_time(raw_dataset, time_definition) == expected
    assert DSUtil.get_end_time(dataset) == expected
def test_datastream_name_retrieved_from_config(dataset, config):
    datastream_name = dataset.attrs.get("datastream_name")
    assert DSUtil.get_datastream_name(config=config) == datastream_name
def test_datetime64_is_converted_to_string():
    time_str = "2020-01-01 00:00:00"
    datetime64 = np.datetime64(time_str)
    assert DSUtil.datetime64_to_string(datetime64) == ("20200101", "000000")
def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
    """-------------------------------------------------------------------
    Hook to allow users to create plots from the xarray dataset after
    processing and QC have been applied and just before the dataset is
    saved to disk.

    To save on filesystem space (which is limited when running on the
    cloud via a lambda function), this method should only write one plot
    to local storage at a time. An example of how this could be done is
    below:

    ```
    filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(dataset["time"].data, dataset["sea_level"].data)
        fig.savefig(tmp_path)
        self.storage.save(tmp_path)
        plt.close()

    filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(10,5))
        DSUtil.plot_qc(dataset, "sea_level", tmp_path)
        self.storage.save(tmp_path)
    ```

    Args:
        dataset (xr.Dataset): The xarray dataset with customizations and
            QC applied.
    -------------------------------------------------------------------"""
    ds = dataset

    # Useful values
    location = ds.attrs["location_meaning"]
    date1, date2 = pd.to_datetime(ds.time.data[0]), pd.to_datetime(ds.time.data[-1])
    hhmm1, hhmm2 = date1.strftime("%H:%M"), date2.strftime("%H:%M")
    date = date1.strftime("%d-%b-%Y")

    filename = DSUtil.get_plot_filename(dataset, "buoy_motion_histogram", "png")
    with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
        fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)

        # Create plot labels including mean roll/pitch
        mean_roll, mean_pitch = ds["roll"].mean().data, ds["pitch"].mean().data
        roll_label = r"$\.{\theta}_{roll}$ [$\overline{\theta}_r$ =" + f"{mean_roll:.3f} deg]"
        pitch_label = r"$\.{\theta}_{pitch}$ [$\overline{\theta}_p$ =" + f"{mean_pitch:.3f} deg]"

        # Plot the stepped histograms of roll and pitch
        ds["roll"].plot.hist(ax=ax, linewidth=2, edgecolor="black", histtype="step", label=roll_label)
        ds["pitch"].plot.hist(ax=ax, linewidth=2, edgecolor="red", histtype="step", label=pitch_label)

        # Set axes and figure labels
        fig.suptitle(f"Buoy Motion Histogram at {location} on {date} from {hhmm1} to {hhmm2}")
        ax.set_xlabel("Buoy Motion (deg)")
        ax.set_ylabel("Frequency")
        ax.set_title("")
        ax.legend(ncol=2, bbox_to_anchor=(1, -0.04))

        # Save the figure
        fig.savefig(tmp_path, dpi=100)
        self.storage.save(tmp_path)
        plt.close()

    return