Example #1
    def get_previous_dataset(self, dataset: xr.Dataset) -> Optional[xr.Dataset]:
        """Utility method to retrieve the previous set of data for the same
        datastream as the provided dataset from the DatastreamStorage.

        :param dataset:
            The reference dataset that will be used to search the
            DatastreamStorage for prior data.
        :type dataset: xr.Dataset
        :return:
            The previous dataset from the DatastreamStorage if it exists,
            otherwise None.
        :rtype: Optional[xr.Dataset]
        """
        prev_dataset = None
        start_date, start_time = DSUtil.get_start_time(dataset)
        datastream_name = DSUtil.get_datastream_name(dataset, self.config)

        with self.storage.fetch_previous_file(
            datastream_name, f"{start_date}.{start_time}"
        ) as netcdf_file:
            if netcdf_file:
                prev_dataset = self.storage.handlers.read(
                    file=netcdf_file, name=netcdf_file
                )

        return prev_dataset
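
Note: the storage key passed to fetch_previous_file combines the start date and
time as "YYYYMMDD.HHMMSS" (the format parsed by fetch_previous_file in Example
#11). A minimal standalone sketch of building that key, with a made-up
timestamp:

from datetime import datetime

# Hypothetical timestamp; DSUtil.get_start_time returns these two strings
dt = datetime(2021, 10, 1, 0, 0, 0)
start_date, start_time = dt.strftime("%Y%m%d"), dt.strftime("%H%M%S")
print(f"{start_date}.{start_time}")  # 20211001.000000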
Example #2
def test_corrections_are_recorded(dataset):
    DSUtil.record_corrections_applied(
        ds=dataset,
        variable="uninitialized_var",
        correction="Variable was initialized to _FillValue",
    )
    assert ATTS.CORRECTIONS_APPLIED in dataset["uninitialized_var"].attrs
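
Note: the test only asserts that a corrections attribute shows up on the
variable. A rough standalone approximation of that effect — the attribute name
and list format here are assumptions, not tsdat's exact implementation:

import numpy as np
import xarray as xr

ds = xr.Dataset({"uninitialized_var": ("time", np.full(3, -9999.0))})

# Assumed attribute name; ATTS.CORRECTIONS_APPLIED may resolve differently
corrections = ds["uninitialized_var"].attrs.setdefault("corrections_applied", [])
corrections.append("Variable was initialized to _FillValue")
assert "corrections_applied" in ds["uninitialized_var"].attrs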
Example #3
    def _replace_invalid_values(self, fill_value, variable_name: str):
        valid_min = DSUtil.get_valid_min(self.ds, variable_name)
        valid_max = DSUtil.get_valid_max(self.ds, variable_name)

        if valid_min is not None and valid_max is not None:
            values = self.ds[variable_name].values
            keep_array = np.logical_not((values < valid_min) | (values > valid_max))
            replaced_values = np.where(keep_array, values, fill_value)
            self.ds[variable_name].data = replaced_values
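
Note: the masking logic can be exercised with plain numpy; the values and
bounds below are made up:

import numpy as np

values = np.array([1.0, 5.0, 120.0, -3.0])
valid_min, valid_max, fill_value = 0.0, 100.0, -9999.0

# Keep in-range values and replace out-of-range values with the fill value
keep_array = np.logical_not((values < valid_min) | (values > valid_max))
print(np.where(keep_array, values, fill_value))  # [1. 5. -9999. -9999.]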
Example #4
    def save_local_path(self, local_path: str, new_filename: str = None) -> Any:
        # TODO: we should perform a REGEX check to make sure that the filename is valid
        filename = os.path.basename(local_path) if not new_filename else new_filename
        datastream_name = DSUtil.get_datastream_name_from_filename(filename)

        dest_dir = DSUtil.get_datastream_directory(datastream_name=datastream_name, root=self._root)
        os.makedirs(dest_dir, exist_ok=True)  # make sure the dest folder exists
        dest_path = os.path.join(dest_dir, filename)

        shutil.copy(local_path, dest_path)
        return dest_path
Example #5
    def save_local_path(self, local_path: str, new_filename: str = None):
        # TODO: we should perform a REGEX check to make sure that the filename is valid
        filename = os.path.basename(
            local_path) if not new_filename else new_filename
        datastream_name = DSUtil.get_datastream_name_from_filename(filename)

        subpath = DSUtil.get_datastream_directory(
            datastream_name=datastream_name)
        s3_path = self.root.join(subpath, filename)

        self.tmp.upload(local_path, s3_path)
        return s3_path
Example #6
File: qc.py Project: tsdat/tsdat
    def __init__(
        self,
        ds: xr.Dataset,
        config: Config,
        definition: QualityManagerDefinition,
        previous_data: xr.Dataset,
    ):

        # Get the variables this quality manager applies to
        variable_names = definition.variables

        # Compare names in upper case so the keyword matching is case-insensitive
        variable_names_upper = [x.upper() for x in variable_names]

        # Add variables where a keyword was used
        if VARS.COORDS in variable_names_upper:
            variable_names.remove(VARS.COORDS)
            variable_names.extend(DSUtil.get_coordinate_variable_names(ds))

        if VARS.DATA_VARS in variable_names_upper:
            variable_names.remove(VARS.DATA_VARS)
            variable_names.extend(DSUtil.get_non_qc_variable_names(ds))

        if VARS.ALL in variable_names_upper:
            variable_names.remove(VARS.ALL)
            variable_names.extend(DSUtil.get_coordinate_variable_names(ds))
            variable_names.extend(DSUtil.get_non_qc_variable_names(ds))

        # Remove any duplicates while preserving insertion order
        variable_names = list(dict.fromkeys(variable_names))

        # Remove any variables explicitly excluded in the definition
        excludes = definition.exclude
        for exclude in excludes:
            variable_names.remove(exclude)

        # Get the quality checker
        quality_checker = instantiate_handler(ds,
                                              previous_data,
                                              definition,
                                              handler_desc=definition.checker)

        # Get the quality handlers
        handlers = definition.handlers

        self.ds = ds
        self.config = config
        self.variable_names = variable_names
        self.checker = quality_checker
        self.handlers = handlers
        self.definition: QualityManagerDefinition = definition
        self.previous_data = previous_data
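
Note: the keyword expansion and order-preserving deduplication can be sketched
standalone; the keyword string and variable names below are placeholders:

COORDS = "COORDS"  # placeholder for VARS.COORDS
variable_names = ["COORDS", "temperature"]
coords = ["time", "height"]  # stand-in for DSUtil.get_coordinate_variable_names(ds)

if COORDS in [name.upper() for name in variable_names]:
    variable_names.remove(COORDS)
    variable_names.extend(coords)

# dict.fromkeys drops duplicates while preserving insertion order
print(list(dict.fromkeys(variable_names)))  # ['temperature', 'time', 'height']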
Example #7
    def record_correction(self, variable_name: str):
        """If a correction was made to variable data to fix invalid values
        as detected by a quality check, this method will record the fix
        to the appropriate variable attribute.  The correction description
        will come from the handler params which get set in the pipeline config
        file.

        :param variable_name: The name of the variable that was corrected.
        :type variable_name: str
        """
        correction = self.params.get("correction", None)
        if correction is not None:
            DSUtil.record_corrections_applied(self.ds, variable_name, correction)
Example #8
    def create_and_persist_plots(self, dataset: xr.Dataset):

        ds = dataset

        filename = DSUtil.get_plot_filename(dataset, "Three Phase Voltage",
                                            "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Format the date for the figure title
            date = pd.to_datetime(ds.time.data[0]).strftime('%d-%b-%Y')

            # Create figure and axes objects
            fig, ax = plt.subplots(figsize=(16, 8), constrained_layout=True)
            fig.suptitle(
                f"Three Phase Voltage from {ds.attrs['title']} on {date}")

            ds.MODAQ_Va[:100].plot(ax=ax)
            ds.MODAQ_Vb[:100].plot(ax=ax)
            ds.MODAQ_Vc[:100].plot(ax=ax)

            # Save the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        return
Example #9
    def run(self, variable_name: str) -> Optional[np.ndarray]:

        # If this is a time variable, we check for 'NaT'
        if self.ds[variable_name].data.dtype.type == np.datetime64:
            results_array = np.isnat(self.ds[variable_name].data)

        else:
            fill_value = DSUtil.get_fill_value(self.ds, variable_name)

            # If the variable has no _FillValue attribute, then
            # we select a default value to use
            if fill_value is None:
                fill_value = -9999

            # Make sure fill value has same data type as the variable
            fill_value = np.array(fill_value,
                                  dtype=self.ds[variable_name].data.dtype.type)

            # First check if any values are assigned to _FillValue
            results_array = np.equal(self.ds[variable_name].data, fill_value)

            # Then, if the variable is floating-point, we should also check
            # whether any values are NaN
            if np.issubdtype(self.ds[variable_name].data.dtype, np.floating):
                results_array |= np.isnan(self.ds[variable_name].data)

        return results_array
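
Note: the fill-value/NaN detection reduces to a few numpy calls; the data
below is invented to exercise both branches of the numeric check:

import numpy as np

data = np.array([1.5, -9999.0, np.nan, 3.2])
fill_value = np.array(-9999, dtype=data.dtype.type)

results_array = np.equal(data, fill_value)
if np.issubdtype(data.dtype, np.floating):
    results_array |= np.isnan(data)
print(results_array)  # [False  True  True False]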
Example #10
    def read_and_persist_raw_files(
        self, filepaths: Union[str, List[str]]
    ) -> Dict[str, xr.Dataset]:
        """------------------------------------------------------------------------------------
        Renames the provided raw files according to our naming conventions and returns a
        mapping of the renamed filepaths to raw `xr.Dataset` objects.

        Args:
            filepaths (Union[str, List[str]]): The path(s) to the raw file(s).

        Returns:
            Dict[str, xr.Dataset]: The mapping of raw filepaths to raw xr.Dataset objects.

        ------------------------------------------------------------------------------------"""
        raw_mapping: Dict[str, xr.Dataset] = dict()

        if isinstance(filepaths, str):
            filepaths = [filepaths]

        for filepath in filepaths:

            extracted = self.storage.handlers.read(file=filepath, name=filepath)
            if not extracted:
                warnings.warn(f"Couldn't use extracted raw file: {filepath}")
                continue

            new_filename = DSUtil.get_raw_filename(extracted, filepath, self.config)
            self.storage.save(filepath, new_filename=new_filename)

            if isinstance(extracted, xr.Dataset):
                extracted = {new_filename: extracted}

            raw_mapping.update(extracted)

        return raw_mapping
Example #11
    def fetch_previous_file(self, datastream_name: str,
                            start_time: str) -> DisposableLocalTempFile:
        # fetch files one day previous and one day after start date (since find is exclusive)
        date = datetime.datetime.strptime(start_time, "%Y%m%d.%H%M%S")
        prev_date = (date -
                     datetime.timedelta(days=1)).strftime("%Y%m%d.%H%M%S")
        next_date = (date +
                     datetime.timedelta(days=1)).strftime("%Y%m%d.%H%M%S")
        files = self.find(
            datastream_name,
            prev_date,
            next_date,
            filetype=DatastreamStorage.default_file_type,
        )
        dates = [DSUtil.get_date_from_filename(_file) for _file in files]

        previous_filepath = None
        if dates:
            i = bisect.bisect_left(dates, start_time)
            if i > 0:
                previous_filepath = files[i - 1]

        if previous_filepath is None:
            # No previous file exists; wrap None so the caller still gets a
            # disposable temp-file context manager
            return DisposableLocalTempFile(previous_filepath)

        return self._tmp.fetch(previous_filepath)
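
Note: the previous-file selection works because the zero-padded
"YYYYMMDD.HHMMSS" strings sort lexicographically. A standalone sketch with
made-up timestamps:

import bisect

dates = ["20210929.000000", "20210930.000000", "20211001.000000"]
start_time = "20211001.000000"

# bisect_left finds where start_time would insert; the entry just before it
# (if any) is the most recent earlier file
i = bisect.bisect_left(dates, start_time)
print(dates[i - 1] if i > 0 else None)  # 20210930.000000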
Example #12
    def run(self, variable_name: str) -> Optional[np.ndarray]:

        # If this is a time variable, we check for 'NaT'
        if self.ds[variable_name].values.dtype.type == np.datetime64:
            results_array = np.isnat(self.ds[variable_name].values)

        else:
            fill_value = DSUtil.get_fill_value(self.ds, variable_name)

            # If the variable has no _FillValue attribute, then
            # we select a default value to use
            if fill_value is None:
                fill_value = -9999

            # Make sure fill value has same data type as the variable
            fill_value = np.array(fill_value, dtype=self.ds[variable_name].values.dtype.type)

            # First, replace any values outside the valid_range with
            # fill_value so they are flagged as missing
            self._replace_invalid_values(fill_value, variable_name)

            # Next, check whether any values are assigned to _FillValue
            results_array = np.equal(self.ds[variable_name].values, fill_value)

            # Then, if the variable is floating-point, we should also check
            # whether any values are NaN
            if np.issubdtype(self.ds[variable_name].values.dtype, np.floating):
                results_array |= np.isnan(self.ds[variable_name].values)

        return results_array
Example #13
    def write(self, ds: xr.Dataset, filename: str, config: Config = None, **kwargs) -> None:
        """Saves the given dataset to a csv file. 

        :param ds: The dataset to save.
        :type ds: xr.Dataset
        :param filename: The path to where the file should be written to.
        :type filename: str
        :param config: Optional Config object, defaults to None
        :type config: Config, optional
        """
        if len(ds.dims) > 1:
            raise TypeError("Dataset has more than one dimension, so it can't be saved to csv.  Try netcdf instead.")

        write_params = self.parameters.get('write', {})
        to_dataframe_kwargs = write_params.get('to_dataframe', {})
        to_csv_kwargs = dict(index=False)
        to_csv_kwargs.update(write_params.get('to_csv', {}))

        df = ds.to_dataframe(**to_dataframe_kwargs)
        df.to_csv(filename, **to_csv_kwargs)

        yaml_filename = f"{filename}.yaml"
        with open(yaml_filename, 'w') as file:
            metadata = DSUtil.get_metadata(ds)
            yaml.dump(metadata, file)
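
Note: the kwargs merge gives to_csv a default of index=False that user
parameters can override; a standalone sketch with a hypothetical parameters
dict:

parameters = {"write": {"to_csv": {"sep": "\t", "index": True}}}

write_params = parameters.get("write", {})
to_csv_kwargs = dict(index=False)                      # default
to_csv_kwargs.update(write_params.get("to_csv", {}))   # user settings win
print(to_csv_kwargs)  # {'index': True, 'sep': '\t'}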
Example #14
    def find(self, datastream_name: str, start_time: str, end_time: str,
             filetype: str = None) -> List[str]:
        # TODO: think about refactoring so that you don't need both start and end time
        # TODO: if times don't include hours/min/sec, then add .000000 to the string
        dir_to_check = DSUtil.get_datastream_directory(datastream_name=datastream_name, root=self._root)
        storage_paths = []

        if os.path.isdir(dir_to_check):
            for file in os.listdir(dir_to_check):
                if start_time <= DSUtil.get_date_from_filename(file) < end_time:
                    storage_paths.append(os.path.join(dir_to_check, file))

            if filetype is not None:
                filter_func = DatastreamStorage.file_filters[filetype]
                storage_paths = list(filter(filter_func, storage_paths))

        return sorted(storage_paths)
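
Note: because the timestamps are zero-padded, the [start_time, end_time)
window check is a plain string comparison; the filename dates and bounds below
are made up:

start_time, end_time = "20211001.000000", "20211002.000000"
file_dates = ["20210930.120000", "20211001.060000", "20211002.000000"]

# Half-open interval: the start is included, the end is excluded
print([d for d in file_dates if start_time <= d < end_time])
# ['20211001.060000']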
Example #15
    def run(self, variable_name: str, results_array: np.ndarray):
        if results_array.any():
            fill_value = DSUtil.get_fill_value(self.ds, variable_name)
            keep_array = np.logical_not(results_array)

            var_values = self.ds[variable_name].data
            replaced_values = np.where(keep_array, var_values, fill_value)
            self.ds[variable_name].data = replaced_values

            self.record_correction(variable_name)
Example #16
    def run(self, variable_name: str) -> Optional[np.ndarray]:

        valid_delta = self.ds[variable_name].attrs.get(ATTS.VALID_DELTA, None)

        # If no valid_delta is available, then we just skip this definition
        results_array = None
        if valid_delta is not None:
            # We need to get the dim to diff on from the parameters
            # If dim is not specified, then we use the first dim for the variable
            dim = self.params.get('dim', None)

            if dim is None and len(self.ds[variable_name].dims) > 0:
                dim = self.ds[variable_name].dims[0]

            if dim is not None:
                # If previous data exists, then we must add the last row of
                # previous data as the first row of the variable's data array.
                # This is so that the diff function can compare the first value
                # of the file to make sure it is consistent with the previous file.

                # convert to np array
                variable_data = self.ds[variable_name].data
                axis = self.ds[variable_name].get_axis_num(dim)
                previous_row = None

                # Load the previous row from the other dataset
                if self.previous_data is not None:
                    previous_variable_data = self.previous_data.get(variable_name, None)
                    if previous_variable_data is not None:
                        # convert to np array
                        previous_variable_data = previous_variable_data.data

                        # Get the last value from the first axis
                        previous_row = previous_variable_data[-1]

                        # Insert that value as the first value of the first axis
                        variable_data = np.insert(variable_data, 0, previous_row, axis=axis)

                # If the variable is a time variable, then we convert to nanoseconds before doing our check
                if self.ds[variable_name].values.dtype.type == np.datetime64:
                    variable_data = DSUtil.datetime64_to_timestamp(variable_data)

                # Compute the difference between each two numbers and check if it exceeds valid_delta
                diff = np.absolute(np.diff(variable_data, axis=axis))
                results_array = np.greater(diff, valid_delta)

                if previous_row is None:
                    # This means our results array is missing one value for the first row, which is
                    # not included in the diff computation.
                    # We need to add False for the first row of results, since it won't fail
                    # the check.
                    first_row = np.zeros(results_array[0].size, dtype=bool)
                    results_array = np.insert(results_array, 0, first_row, axis=axis)

        return results_array
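
Note: the core of the valid_delta check, sketched standalone on an invented
1-D array with no previous data:

import numpy as np

variable_data = np.array([1.0, 1.5, 9.0, 9.2])
valid_delta = 2.0

diff = np.absolute(np.diff(variable_data))
results_array = np.greater(diff, valid_delta)

# With no previous row, the first sample has nothing to diff against,
# so it is marked as passing
results_array = np.insert(results_array, 0, False)
print(results_array)  # [False False  True False]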
Example #17
    def run(self, variable_name: str) -> Optional[np.ndarray]:

        results_array = None
        # We need to get the dim to diff on from the parameters
        # If dim is not specified, then we use the first dim for the variable
        dim = self.params.get("dim", None)

        if dim is None and len(self.ds[variable_name].dims) > 0:
            dim = self.ds[variable_name].dims[0]

        if dim is not None:
            # If previous data exists, then we must add the last row of
            # previous data as the first row of the variable's data array.
            # This is so that the diff function can compare the first value
            # of the file to make sure it is consistent with the previous file.

            # convert to np array
            variable_data = self.ds[variable_name].data
            axis = self.ds[variable_name].get_axis_num(dim)
            previous_row = None

            # Load the previous row from the other dataset
            if self.previous_data is not None and dim == "time":
                previous_variable_data = self.previous_data.get(
                    variable_name, None)
                if previous_variable_data is not None:
                    # convert to np array
                    previous_variable_data = previous_variable_data.data

                    # Get the last value from the first axis
                    previous_row = previous_variable_data[-1]

                    # Insert that value as the first value of the first axis
                    variable_data = np.insert(variable_data,
                                              0,
                                              previous_row,
                                              axis=axis)

            # If the variable is a time variable, then we convert to nanoseconds before doing our check
            if self.ds[variable_name].data.dtype.type == np.datetime64:
                variable_data = DSUtil.datetime64_to_timestamp(variable_data)

            # Compute the difference between each two numbers and check if they are either all
            # increasing or all decreasing
            diff = np.diff(variable_data, axis=axis)
            is_monotonic = np.all(diff > 0) | np.all(diff < 0)  # a scalar bool

            # Create a results array, with all values set to the results of the is_monotonic check
            results_array = np.full(variable_data.shape,
                                    not is_monotonic,
                                    dtype=bool)

        return results_array
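
Note: the monotonicity test itself, standalone on an invented 1-D array:

import numpy as np

variable_data = np.array([0, 1, 3, 2])

diff = np.diff(variable_data)
is_monotonic = np.all(diff > 0) | np.all(diff < 0)  # a single bool

# Every element is flagged when the array is not strictly monotonic
print(np.full(variable_data.shape, not is_monotonic, dtype=bool))
# [ True  True  True  True]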
Example #18
    def find(self,
             datastream_name: str,
             start_time: str,
             end_time: str,
             filetype: str = None) -> List[S3Path]:
        # TODO: think about refactoring so that you don't need both start and end time
        # TODO: if times don't include hours/min/sec, then add .000000 to the string
        subpath = DSUtil.get_datastream_directory(
            datastream_name=datastream_name)
        dir_to_check = self.root.join(subpath)
        storage_paths = []

        for file in self.tmp.listdir(dir_to_check):
            if start_time <= DSUtil.get_date_from_filename(
                    file.bucket_path) < end_time:
                storage_paths.append(file)

        if filetype is not None:
            filter_func = DatastreamStorage.file_filters[filetype]
            storage_paths = list(filter(filter_func, storage_paths))

        return sorted(storage_paths)
Example #19
def test_plotting_utilities(dataset):
    expected_filename = "test.SortedDataset.a1.20211001.000000.height.png"
    filename = DSUtil.get_plot_filename(dataset, "height", "png")
    filepath = os.path.join(STORAGE_PATH, "test.SortedDataset.a1", filename)
    assert filename == expected_filename

    assert DSUtil.get_date_from_filename(filepath) == "20211001.000000"

    DSUtil.plot_qc(dataset, "height_out", filepath)

    assert DSUtil.is_image(filepath)
    assert not DSUtil.is_image(PROCESSED_NC)
Example #20
    def save(self,
             dataset_or_path: Union[str, xr.Dataset],
             new_filename: str = None) -> List[Any]:
        """Saves a local file to the datastream store.

        :param dataset_or_path: The dataset or local path to the file
            to save.  The file should be named according
            to ME Data Standards naming conventions so that this
            method can automatically parse the datastream,
            date, and time from the file name.
        :type dataset_or_path: Union[str, xr.Dataset]
        :param new_filename: If provided, the new filename to save as.
            This parameter should ONLY be provided if using
            a local path for dataset_or_path.  Must also
            follow ME Data Standards naming conventions. Defaults to None.
        :type new_filename: str, optional
        :return: A list of paths where the saved files were stored in storage.
            Path type is dependent upon the specific storage subclass.
        :rtype: List[Any]
        """
        saved_paths = []

        if isinstance(dataset_or_path, xr.Dataset):
            dataset = dataset_or_path

            # Save file for every registered output file type
            file_extensions = DatastreamStorage.output_file_extensions.values()
            for file_extension in file_extensions:
                dataset_filename = DSUtil.get_dataset_filename(
                    dataset, file_extension=file_extension)
                with self.tmp.get_temp_filepath(dataset_filename) as tmp_path:
                    FileHandler.write(dataset, tmp_path)
                    saved_paths.append(
                        self.save_local_path(tmp_path, new_filename))

        else:
            local_path = dataset_or_path
            saved_paths.append(self.save_local_path(local_path, new_filename))

        return saved_paths
Example #21
    def read_and_persist_raw_files(
        self, file_paths: Union[str, List[str]]
    ) -> Dict[str, xr.Dataset]:
        """Renames the provided raw files according to ME Data Standards file
        naming conventions for raw data files, and returns a mapping of the
        standardized filenames to the raw datasets read from them.

        :param file_paths: The path(s) to the original raw file(s).
        :type file_paths: Union[str, List[str]]
        :return: A mapping of standardized raw filenames to raw datasets.
        :rtype: Dict[str, xr.Dataset]
        """
        raw_dataset_mapping = {}

        if isinstance(file_paths, str):
            file_paths = [file_paths]

        for file_path in file_paths:

            # read the raw file into a dataset
            with self.storage.tmp.fetch(file_path) as tmp_path:
                dataset = FileHandler.read(tmp_path)

                # Don't use dataset if no FileHandler is registered for it
                if dataset is not None:
                    # create the standardized name for raw file
                    new_filename = DSUtil.get_raw_filename(
                        dataset, tmp_path, self.config)

                    # add the raw dataset to our dictionary
                    raw_dataset_mapping[new_filename] = dataset

                    # save the raw data to storage
                    self.storage.save(tmp_path, new_filename)

                else:
                    warnings.warn(
                        f"Couldn't use extracted raw file: {tmp_path}")

        return raw_dataset_mapping
Example #22
    def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
        """-------------------------------------------------------------------
        Hook to allow users to create plots from the xarray dataset after
        processing and QC have been applied and just before the dataset is
        saved to disk.

        To save on filesystem space (which is limited when running on the
        cloud via a lambda function), this method should only
        write one plot to local storage at a time. An example of how this
        could be done is below:

        ```
        filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            ax.plot(dataset["time"].data, dataset["sea_level"].data)
            fig.savefig(tmp_path)
            self.storage.save(tmp_path)
            plt.close()

        filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            DSUtil.plot_qc(dataset, "sea_level", tmp_path)
            self.storage.save(tmp_path)
            plt.close()
        ```

        Args:
            dataset (xr.Dataset):   The xarray dataset with customizations and
                                    QC applied.
        -------------------------------------------------------------------"""
        def format_time_xticks(ax,
                               start=4,
                               stop=21,
                               step=4,
                               date_format="%H-%M"):
            ax.xaxis.set_major_locator(
                mpl.dates.HourLocator(byhour=range(start, stop, step)))
            ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha='center')

        # Useful variables
        ds = dataset
        date = pd.to_datetime(ds.time.data[0]).strftime('%d-%b-%Y')

        # Create wave statistics plot
        filename = DSUtil.get_plot_filename(dataset, "wave_statistics", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Create figure and axes objects
            fig, axs = plt.subplots(nrows=3,
                                    figsize=(14, 8),
                                    constrained_layout=True)
            fig.suptitle(
                f"Wave Statistics at {ds.attrs['location_meaning']} on {date}")

            # Plot wave heights
            cmap = cmocean.cm.amp_r
            ds.average_wave_height.plot(ax=axs[0],
                                        c=cmap(0.10),
                                        linewidth=2,
                                        label=r"H$_{avg}$")
            ds.significant_wave_height.plot(ax=axs[0],
                                            c=cmap(0.5),
                                            linewidth=2,
                                            label=r"H$_{sig}$")
            ds.max_wave_height.plot(ax=axs[0],
                                    c=cmap(0.85),
                                    linewidth=2,
                                    label=r"H$_{max}$")
            axs[0].set_ylabel("Wave Height (m)")
            axs[0].legend(bbox_to_anchor=(1, -0.10), ncol=3)

            # Plot wave periods
            cmap = cmocean.cm.dense
            ds.average_wave_period.plot(ax=axs[1],
                                        c=cmap(0.15),
                                        linewidth=2,
                                        label=r"T$_{avg}$")
            ds.significant_wave_period.plot(ax=axs[1],
                                            c=cmap(0.5),
                                            linewidth=2,
                                            label=r"T$_{sig}$")
            ds.mean_wave_period.plot(ax=axs[1],
                                     c=cmap(0.8),
                                     linewidth=2,
                                     label=r"$\overline{T}_{mean}$")
            axs[1].set_ylabel("Wave Period (s)")
            axs[1].legend(bbox_to_anchor=(1, -0.10), ncol=3)

            # Plot mean direction
            cmap = cmocean.cm.haline
            ds.mean_wave_direction.plot(ax=axs[2],
                                        c=cmap(0.4),
                                        linewidth=2,
                                        label=r"$\overline{\phi}_{mean}$")
            axs[2].set_ylabel(r"Wave $\overline{\phi}$ (deg)")
            axs[2].legend(bbox_to_anchor=(1, -0.10))

            # Set xlabels and ticks
            for i in range(3):
                axs[i].set_xlabel("Time (UTC)")
                format_time_xticks(axs[i])

            # Save figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        return
Example #23
def _is_image(x):
    return DSUtil.is_image(str(x))
Example #24
    def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
        """-------------------------------------------------------------------
        Hook to allow users to create plots from the xarray dataset after
        processing and QC have been applied and just before the dataset is
        saved to disk.

        To save on filesystem space (which is limited when running on the
        cloud via a lambda function), this method should only
        write one plot to local storage at a time. An example of how this
        could be done is below:

        ```
        filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            ax.plot(dataset["time"].data, dataset["sea_level"].data)
            fig.savefig(tmp_path)
            self.storage.save(tmp_path)
            plt.close()

        filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            DSUtil.plot_qc(dataset, "sea_level", tmp_path)
            self.storage.save(tmp_path)
            plt.close()
        ```

        Args:
            dataset (xr.Dataset):   The xarray dataset with customizations and
                                    QC applied.
        -------------------------------------------------------------------"""
        def format_time_xticks(ax,
                               start=4,
                               stop=21,
                               step=4,
                               date_format="%H-%M"):
            ax.xaxis.set_major_locator(
                mpl.dates.HourLocator(byhour=range(start, stop, step)))
            ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha="center")

        def double_plot(ax, twin, data, colors, var_labels, ax_labels,
                        **kwargs):
            def _add_lineplot(_ax, _data, _c, _label, _ax_label, _spine):
                _data.plot(ax=_ax, c=_c, label=_label, linewidth=2, **kwargs)
                _ax.tick_params(axis="y", which="both", colors=_c)
                _ax.set_ylabel(_ax_label, color=_c)
                _ax.spines[_spine].set_color(_c)

            _add_lineplot(ax, data[0], colors[0], var_labels[0], ax_labels[0],
                          "left")
            _add_lineplot(twin, data[1], colors[1], var_labels[1],
                          ax_labels[1], "right")
            twin.spines["left"].set_color(
                colors[0])  # twin overwrites ax, so set color here

        def add_colorbar(ax, plot, label):
            cb = plt.colorbar(plot, ax=ax, pad=0.01)
            cb.ax.set_ylabel(label, fontsize=12)
            cb.outline.set_linewidth(1)
            cb.ax.tick_params(size=0)
            cb.ax.minorticks_off()
            return cb

        # Useful variables
        ds = dataset
        date = pd.to_datetime(ds.time.data[0]).strftime("%d-%b-%Y")
        cmap = sns.color_palette("viridis", as_cmap=True)
        colors = [cmap(0.00), cmap(0.60)]

        # Create the first plot -- Surface Met Parameters
        filename = DSUtil.get_plot_filename(dataset, "surface_met_parameters",
                                            "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Define data and metadata
            data = [
                [ds.wind_speed, ds.wind_direction],
                [ds.pressure, ds.rh],
                [ds.air_temperature, ds.CTD_SST],
            ]
            var_labels = [
                [
                    r"$\overline{\mathrm{U}}$ Cup",
                    r"$\overline{\mathrm{\theta}}$ Cup"
                ],
                ["Pressure", "Relative Humidity"],
                ["Air Temperature", "Sea Surface Temperature"],
            ]
            ax_labels = [
                [
                    r"$\overline{\mathrm{U}}$ (ms$^{-1}$)",
                    r"$\bar{\mathrm{\theta}}$ (degrees)",
                ],
                [
                    r"$\overline{\mathrm{P}}$ (bar)",
                    r"$\overline{\mathrm{RH}}$ (%)"
                ],
                [
                    r"$\overline{\mathrm{T}}_{air}$ ($\degree$C)",
                    r"$\overline{\mathrm{SST}}$ ($\degree$C)",
                ],
            ]

            # Create figure and axes objects
            fig, axs = plt.subplots(nrows=3,
                                    figsize=(14, 8),
                                    constrained_layout=True)
            twins = [ax.twinx() for ax in axs]
            fig.suptitle(
                f"Surface Met Parameters at {ds.attrs['location_meaning']} on {date}"
            )

            # Create the plots
            gill_data = [ds.gill_wind_speed, ds.gill_wind_direction]
            gill_labels = [
                r"$\overline{\mathrm{U}}$ Gill",
                r"$\overline{\mathrm{\theta}}$ Gill",
            ]
            double_plot(
                axs[0],
                twins[0],
                data=gill_data,
                colors=colors,
                var_labels=gill_labels,
                linestyle="--",
                ax_labels=["", ""],
            )
            for i in range(3):
                double_plot(
                    axs[i],
                    twins[i],
                    data=data[i],
                    colors=colors,
                    var_labels=var_labels[i],
                    ax_labels=ax_labels[i],
                )
                axs[i].grid(which="both", color="lightgray", linewidth=0.5)
                lines = axs[i].lines + twins[i].lines
                labels = [line.get_label() for line in lines]
                axs[i].legend(lines,
                              labels,
                              ncol=len(labels),
                              bbox_to_anchor=(1, -0.15))
                format_time_xticks(axs[i])
                axs[i].set_xlabel("Time (UTC)")
            twins[0].set_ylim(0, 360)

            # Save and close the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        # Create the second plot -- Conductivity and Sea Surface Temperature
        filename = DSUtil.get_plot_filename(dataset, "conductivity", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Define data and metadata
            data = [ds.conductivity, ds.CTD_SST]
            var_labels = [
                r"Conductivity (S m$^{-1}$)",
                r"$\overline{\mathrm{SST}}$ ($\degree$C)",
            ]
            ax_labels = [
                r"Conductivity (S m$^{-1}$)",
                r"$\overline{\mathrm{SST}}$ ($\degree$C)",
            ]

            # Create the figure and axes objects
            fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)
            fig.suptitle(
                f"Conductivity and Sea Surface Temperature at {ds.attrs['location_meaning']} on {date}"
            )
            twin = ax.twinx()

            # Make the plot
            double_plot(
                ax,
                twin,
                data=data,
                colors=colors,
                var_labels=var_labels,
                ax_labels=ax_labels,
            )

            # Set the labels and ticks
            ax.grid(which="both", color="lightgray", linewidth=0.5)
            lines = ax.lines + twin.lines
            labels = [line.get_label() for line in lines]
            ax.legend(lines,
                      labels,
                      ncol=len(labels),
                      bbox_to_anchor=(1, -0.03))
            format_time_xticks(ax)
            ax.set_xlabel("Time (UTC)")

            # Save and close the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        # Create the third plot - current speed and direction
        filename = DSUtil.get_plot_filename(dataset, "current_velocity", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Reduce dimensionality of dataset for plotting
            ds_1H: xr.Dataset = ds.reindex({"depth": ds.depth.data[::2]})
            ds_1H: xr.Dataset = ds_1H.resample(time="1H").nearest()

            # Calculations for contour plots
            levels = 30

            # Calculations for quiver plot
            # Skip the first point to prevent overlap with the axes borders
            qv_slice = slice(1, None)
            qv_degrees = ds_1H.current_direction.data[qv_slice,
                                                      qv_slice].transpose()
            qv_theta = (qv_degrees + 90) * (np.pi / 180)
            X, Y = ds_1H.time.data[qv_slice], ds_1H.depth.data[qv_slice]
            U, V = np.cos(-qv_theta), np.sin(-qv_theta)

            # Create figure and axes objects
            fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)
            fig.suptitle(
                f"Current Speed and Direction at {ds.attrs['location_meaning']} on {date}"
            )

            # Make the plots
            csf = ds.current_speed.plot.contourf(
                ax=ax,
                x="time",
                yincrease=False,
                levels=levels,
                cmap=cmocean.cm.deep_r,
                add_colorbar=False,
            )
            # ds.current_speed.plot.contour(ax=ax, x="time", yincrease=False, levels=levels, colors="lightgray", linewidths=0.5)
            ax.quiver(
                X,
                Y,
                U,
                V,
                width=0.002,
                scale=60,
                color="white",
                pivot="middle",
                zorder=10,
            )
            add_colorbar(ax, csf, r"Current Speed (mm s$^{-1}$)")

            # Set the labels and ticks
            format_time_xticks(ax)
            ax.set_xlabel("Time (UTC)")
            ax.set_ylabel("Depth (m)")

            # Save the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        return
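
Note: the compass-to-quiver conversion used in these plots can be checked
standalone; the directions are made up, and the resulting orientation simply
follows the convention of the snippet above:

import numpy as np

qv_degrees = np.array([0.0, 90.0, 180.0])  # compass degrees, made-up values

# Shift into math-convention angles, then negate before taking components
qv_theta = (qv_degrees + 90) * (np.pi / 180)
U, V = np.cos(-qv_theta), np.sin(-qv_theta)
print(U.round(2), V.round(2))  # [ 0. -1. -0.] [-1. -0.  1.]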
Example #25
    def hook_generate_and_persist_plots(self, dataset: xr.Dataset):
        def format_time_xticks(ax,
                               start=4,
                               stop=21,
                               step=4,
                               date_format="%H-%M"):
            ax.xaxis.set_major_locator(
                mpl.dates.HourLocator(byhour=range(start, stop, step)))
            ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(date_format))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha="center")

        def add_colorbar(ax, plot, label):
            cb = plt.colorbar(plot, ax=ax, pad=0.01)
            cb.ax.set_ylabel(label, fontsize=12)
            cb.outline.set_linewidth(1)
            cb.ax.tick_params(size=0)
            cb.ax.minorticks_off()
            return cb

        ds = dataset
        date = pd.to_datetime(ds.time.data[0]).strftime("%d-%b-%Y")

        # Colormaps to use
        wind_cmap = cmocean.cm.deep_r
        avail_cmap = cmocean.cm.amp_r

        # Create the first plot - Lidar Wind Speeds at several elevations
        filename = DSUtil.get_plot_filename(dataset, "wind_speeds", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Create the figure and axes objects
            fig, ax = plt.subplots(nrows=1,
                                   ncols=1,
                                   figsize=(14, 8),
                                   constrained_layout=True)
            fig.suptitle(
                f"Wind Speed Time Series at {ds.attrs['location_meaning']} on {date}"
            )

            # Select heights to plot
            heights = [40, 90, 140, 200]

            # Plot the data
            for i, height in enumerate(heights):
                velocity = ds.wind_speed.sel(height=height)
                velocity.plot(
                    ax=ax,
                    linewidth=2,
                    c=wind_cmap(i / len(heights)),
                    label=f"{height} m",
                )

            # Set the labels and ticks
            format_time_xticks(ax)
            ax.legend(facecolor="white",
                      ncol=len(heights),
                      bbox_to_anchor=(1, -0.05))
            ax.set_title("")  # Remove bogus title created by xarray
            ax.set_xlabel("Time (UTC)")
            ax.set_ylabel(r"Wind Speed (ms$^{-1}$)")

            # Save the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        filename = DSUtil.get_plot_filename(dataset,
                                            "wind_speed_and_direction", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            # Reduce dimensionality of dataset for plotting
            ds_1H: xr.Dataset = ds.resample(time="1H").nearest()

            # Calculations for contour plots
            levels = 30

            # Calculations for quiver plot
            # Skip the first point to prevent overlap with the axes borders
            qv_slice = slice(1, None)
            qv_degrees = ds_1H.wind_direction.data[qv_slice].transpose()
            qv_theta = (qv_degrees + 90) * (np.pi / 180)
            X, Y = ds_1H.time.data[qv_slice], ds_1H.height.data
            U, V = np.cos(-qv_theta), np.sin(-qv_theta)

            # Create figure and axes objects
            fig, axs = plt.subplots(nrows=2,
                                    figsize=(14, 8),
                                    constrained_layout=True)
            fig.suptitle(
                f"Wind Speed and Direction at {ds.attrs['location_meaning']} on {date}"
            )

            # Make top subplot -- contours and quiver plots for wind speed and direction
            csf = ds.wind_speed.plot.contourf(ax=axs[0],
                                              x="time",
                                              levels=levels,
                                              cmap=wind_cmap,
                                              add_colorbar=False)
            # ds.wind_speed.plot.contour(ax=axs[0], x="time", levels=levels, colors="lightgray", linewidths=0.5)
            axs[0].quiver(
                X,
                Y,
                U,
                V,
                width=0.002,
                scale=60,
                color="white",
                pivot="middle",
                zorder=10,
            )
            add_colorbar(axs[0], csf, r"Wind Speed (ms$^{-1}$)")

            # Make bottom subplot -- heatmap for data availability
            da = ds.data_availability.plot(
                ax=axs[1],
                x="time",
                cmap=avail_cmap,
                add_colorbar=False,
                vmin=0,
                vmax=100,
            )
            add_colorbar(axs[1], da, "Availability (%)")

            # Set the labels and ticks
            for i in range(2):
                format_time_xticks(axs[i])
                axs[i].set_xlabel("Time (UTC)")
                axs[i].set_ylabel("Height ASL (m)")

            # Save the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        return
Example #26
def test_end_time_is_correct(raw_dataset, dataset, config):
    expected = ("20211001", "000002")
    time_definition = config.dataset_definition.get_variable("time")
    assert DSUtil.get_raw_end_time(raw_dataset, time_definition) == expected
    assert DSUtil.get_end_time(dataset) == expected
Example #27
def test_datastream_name_retrieved_from_config(dataset, config):
    datastream_name = dataset.attrs.get("datastream_name")
    assert DSUtil.get_datastream_name(config=config) == datastream_name
Example #28
def test_datetime64_is_converted_to_string():
    time_str = "2020-01-01 00:00:00"
    datetime64 = np.datetime64(time_str)
    assert DSUtil.datetime64_to_string(datetime64) == ("20200101", "000000")
Example #29
    def hook_generate_and_persist_plots(self, dataset: xr.Dataset) -> None:
        """-------------------------------------------------------------------
        Hook to allow users to create plots from the xarray dataset after
        processing and QC have been applied and just before the dataset is
        saved to disk.

        To save on filesystem space (which is limited when running on the
        cloud via a lambda function), this method should only
        write one plot to local storage at a time. An example of how this
        could be done is below:

        ```
        filename = DSUtil.get_plot_filename(dataset, "sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            ax.plot(dataset["time"].data, dataset["sea_level"].data)
            fig.savefig(tmp_path)
            self.storage.save(tmp_path)
            plt.close()

        filename = DSUtil.get_plot_filename(dataset, "qc_sea_level", "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:
            fig, ax = plt.subplots(figsize=(10,5))
            DSUtil.plot_qc(dataset, "sea_level", tmp_path)
            self.storage.save(tmp_path)
            plt.close()
        ```

        Args:
            dataset (xr.Dataset):   The xarray dataset with customizations and
                                    QC applied.
        -------------------------------------------------------------------"""
        ds = dataset

        # Useful values
        location = ds.attrs["location_meaning"]
        date1, date2 = pd.to_datetime(ds.time.data[0]), pd.to_datetime(
            ds.time.data[-1])
        hhmm1, hhmm2 = date1.strftime("%H:%M"), date2.strftime("%H:%M")
        date = date1.strftime("%d-%b-%Y")

        filename = DSUtil.get_plot_filename(dataset, "buoy_motion_histogram",
                                            "png")
        with self.storage._tmp.get_temp_filepath(filename) as tmp_path:

            fig, ax = plt.subplots(figsize=(14, 8), constrained_layout=True)

            # Create plot labels including mean roll/pitch
            mean_roll = ds["roll"].mean().data
            mean_pitch = ds["pitch"].mean().data
            roll_label = (r"$\.{\theta}_{roll}$ [$\overline{\theta}_r$ =" +
                          f"{mean_roll:.3f} deg]")
            pitch_label = (r"$\.{\theta}_{pitch}$ [$\overline{\theta}_p$ =" +
                           f"{mean_pitch:.3f} deg]")

            # Plot the stepped histograms of roll and pitch
            ds["roll"].plot.hist(ax=ax,
                                 linewidth=2,
                                 edgecolor="black",
                                 histtype="step",
                                 label=roll_label)
            ds["pitch"].plot.hist(ax=ax,
                                  linewidth=2,
                                  edgecolor="red",
                                  histtype="step",
                                  label=pitch_label)

            # Set axes and figure labels
            fig.suptitle(
                f"Buoy Motion Histogram at {location} on {date} from {hhmm1} to {hhmm2}"
            )
            ax.set_xlabel("Buoy Motion (deg)")
            ax.set_ylabel("Frequency")
            ax.set_title("")
            ax.legend(ncol=2, bbox_to_anchor=(1, -0.04))

            # Save the figure
            fig.savefig(tmp_path, dpi=100)
            self.storage.save(tmp_path)
            plt.close()

        return