Example #1
def get_aux_dset_slicing(dim_names, last_ind=None, is_spectroscopic=False):
    """
    Returns a dictionary of slice objects to help in creating region references in the position or spectroscopic
    indices and values datasets

    Parameters
    ----------
    dim_names : iterable
        List of strings denoting the names of the position axes or spectroscopic dimensions, arranged in the
        same order as the dimensions in the indices / values dataset
    last_ind : unsigned int, optional. Default = None
        Last pixel in the position or spectroscopic matrix. Useful in experiments where the
        parameters have changed (e.g. BEPS new data format) during the experiment.
    is_spectroscopic : bool, optional. Default = False
        Set to True for spectroscopic datasets and False for position datasets

    Returns
    -------
    slice_dict : dict
        Dictionary of tuples containing slice objects corresponding to
        each dimension
    """
    dim_names = validate_list_of_strings(dim_names, 'dim_names')
    if len(dim_names) == 0:
        raise ValueError('No valid dim_names provided')

    slice_dict = dict()
    for spat_ind, curr_dim_name in enumerate(dim_names):
        val = (slice(last_ind), slice(spat_ind, spat_ind + 1))
        if is_spectroscopic:
            val = val[::-1]
        slice_dict[str(curr_dim_name)] = val
    return slice_dict
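A minimal usage sketch; the dimension names and pixel count below are hypothetical:

# Slice objects for two hypothetical position dimensions spanning 128 pixels
pos_slices = get_aux_dset_slicing(['X', 'Y'], last_ind=128, is_spectroscopic=False)
# pos_slices == {'X': (slice(128), slice(0, 1)),
#                'Y': (slice(128), slice(1, 2))}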
Example #2
    def can_read(self, *args, **kwargs):
        """
        Checks whether the provided file can be read by this reader.

        This basic function compares the file extension against the
        ``extension`` keyword argument. If the extension matches, this function
        returns the absolute path to the file

        Parameters
        ----------
        extension : str or iterable of str, Optional. Default = None
            File extension(s) against which the input file will be validated

        Returns
        -------
        file_path : str or None
            Absolute path to the file, to be passed on to read(),
            if the file has a matching extension; None otherwise

        Raises
        ------
        NotImplementedError
            If this function is called for this or a child class that does
            not provide the ``extension`` keyword argument

        Notes
        -----
        It is recommended to add additional checks as necessary to ensure that
        the translator can indeed read the given file such as by validating the
        headers or similar metadata.
        """
        targ_ext = kwargs.get('extension', None)
        if not targ_ext:
            raise NotImplementedError('Either can_read() has not been '
                                      'implemented by this Reader or the '
                                      '"extension" keyword argument was '
                                      'missing')
        if isinstance(targ_ext, str):
            targ_ext = [targ_ext]
        targ_ext = validate_list_of_strings(targ_ext,
                                            parm_name='(keyword argument) '
                                                      '"extension"')

        # Get rid of any '.' separators that may be in the list of extensions
        # Also turn to lower case for case insensitive comparisons
        targ_ext = [item.replace('.', '').lower() for item in targ_ext]

        file_path = os.path.abspath(self._input_file_path)
        extension = os.path.splitext(file_path)[1][1:]

        # Ensure extension is lower case just like targets above
        extension = extension.lower()

        if extension in targ_ext:
            return file_path
        else:
            return None
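A sketch of how a concrete reader might delegate to this check; the ``Reader`` base class name and the '.txt' extension are assumptions for illustration:

class TextReader(Reader):  # hypothetical subclass of the assumed Reader base class
    def can_read(self):
        # Reuse the base implementation above for the extension comparison
        return super(TextReader, self).can_read(extension='txt')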
Example #3
    def is_valid_file(file_path, *args, **kwargs):
        """
        Checks whether the provided file can be read by this translator.

        This basic function compares the file extension against the "extension"
        keyword argument. If the extension matches, this function returns the
        absolute path to the file

        Parameters
        ----------
        file_path : str
            Path to raw data file
        extension : str or iterable of str, Optional. Default = None
            File extension(s) against which the input file will be validated
            (keyword argument)

        Returns
        -------
        file_path : str or None
            Absolute path to the file, to be passed on to translate(),
            if the file has a matching extension; None otherwise
        """
        file_path = validate_single_string_arg(file_path, 'file_path')

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path + ' does not exist')

        targ_ext = kwargs.get('extension', None)
        if not targ_ext:
            raise NotImplementedError('Either is_valid_file() has not been '
                                      'implemented by this translator or the '
                                      '"extension" keyword argument was '
                                      'missing')
        if isinstance(targ_ext, str):
            targ_ext = [targ_ext]
        targ_ext = validate_list_of_strings(targ_ext,
                                            parm_name='(keyword argument) '
                                            '"extension"')

        # Get rid of any '.' separators that may be in the list of extensions
        # Also turn to lower case for case insensitive comparisons
        targ_ext = [item.replace('.', '').lower() for item in targ_ext]

        file_path = os.path.abspath(file_path)
        extension = os.path.splitext(file_path)[1][1:]

        # Ensure extension is lower case just like targets above
        extension = extension.lower()

        if extension in targ_ext:
            return file_path
        else:
            return None
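A similar sketch for a concrete translator; the ``Translator`` base class name and the '.dat' extension are assumptions:

class ExampleTranslator(Translator):  # hypothetical subclass
    @staticmethod
    def is_valid_file(file_path, *args, **kwargs):
        # Reuse the base implementation above for the extension comparison
        return Translator.is_valid_file(file_path, extension='dat')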
Example #4
def get_attributes(h5_object, attr_names=None, strict=False):
    """
    Returns attributes associated with an h5py object.

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File`
        Object whose attributes will be read
    attr_names : str or :class:`list` of str, optional. Default = all
        Name(s) of the attribute(s) to return
    strict : bool, optional. Default = False
        If True, raises a KeyError if a desired key is not found.
        Otherwise, raises a warning instead.
        This is especially useful when attempting to read attributes with
        invalid names such as spaces on either side of the text

    Returns
    -------
    att_dict : dict
        Dictionary containing (name, value) pairs of attributes

    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError(
            'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object'
        )

    if attr_names is None:
        attr_names = h5_object.attrs.keys()
    else:
        attr_names = validate_list_of_strings(attr_names, 'attr_names')
        # Set strict to True since user is looking for specific attributes
        strict = True

    att_dict = {}

    for attr in attr_names:
        try:
            att_dict[attr] = get_attr(h5_object, attr)
        except KeyError:
            mesg = '"{}" is not an attribute of {}'.format(
                attr, h5_object.name)
            if strict:
                raise KeyError(mesg)
            else:
                warn(mesg)

    return att_dict
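A minimal usage sketch; the file, dataset, and attribute names are hypothetical:

import h5py

with h5py.File('example.h5', mode='r') as h5_f:
    # All attributes of the dataset as a plain dictionary
    all_attrs = get_attributes(h5_f['Raw_Data'])
    # Specific attributes only; strict is forced to True internally,
    # so a missing name raises a KeyError instead of a warning
    some_attrs = get_attributes(h5_f['Raw_Data'], attr_names=['quantity', 'units'])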
Example #5
def get_auxiliary_datasets(h5_object, aux_dset_name=None):
    """
    Returns auxiliary dataset objects associated with an h5py object through references stored in its attributes.
    Note - region references will be ignored.

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File`
        Object whose attributes store references to the auxiliary datasets.
    aux_dset_name : str or :class:`list` of str, optional. Default = all
        Name of auxiliary :class:`h5py.Dataset` objects to return.

    Returns
    -------
    data_list : list of :class:`h5py.Dataset`
        Auxiliary dataset objects resolved from the references stored in the attributes

    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError(
            'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object'
        )

    if aux_dset_name is None:
        aux_dset_name = h5_object.attrs.keys()
    else:
        aux_dset_name = validate_list_of_strings(aux_dset_name,
                                                 'aux_dset_name')

    data_list = list()
    curr_name = None
    try:
        h5_file = h5_object.file
        for curr_name in aux_dset_name:
            h5_ref = h5_object.attrs[curr_name]
            if isinstance(h5_ref, h5py.Reference) and isinstance(h5_file[h5_ref], h5py.Dataset) and not \
                    isinstance(h5_ref, h5py.RegionReference):
                data_list.append(h5_file[h5_ref])
    except KeyError:
        raise KeyError('%s is not an attribute of %s' %
                       (str(curr_name), h5_object.name))

    return data_list
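A usage sketch, assuming a main dataset whose attributes store references to ancillary datasets under the hypothetical names 'Position_Indices' and 'Position_Values':

# h5_main is a hypothetical h5py.Dataset with references in its attributes
aux_dsets = get_auxiliary_datasets(h5_main,
                                   aux_dset_name=['Position_Indices',
                                                  'Position_Values'])
h5_pos_inds, h5_pos_vals = aux_dsets  # actual h5py.Dataset objects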
Example #6
def get_h5_obj_refs(obj_names, h5_refs):
    """
    Given a list of h5py objects and a list of names, this method
    returns the objects whose base names match the provided names

    Parameters
    ----------
    obj_names : str or list of str
        Names of the target h5py objects
    h5_refs : h5py object or list of h5py objects
        Objects to search through

    Returns
    -------
    found_objects : list
        h5py objects whose base names matched the provided names

    """
    obj_names = validate_list_of_strings(obj_names, 'obj_names')

    if isinstance(h5_refs, (h5py.File, h5py.Group, h5py.Dataset)):
        h5_refs = [h5_refs]
    if not isinstance(h5_refs, (list, tuple)):
        raise TypeError(
            'h5_refs should be a single h5py.Dataset, h5py.Group or h5py.File '
            'object, or a list of such objects'
        )

    found_objects = []
    for target_name in obj_names:
        for h5_object in h5_refs:
            if not isinstance(h5_object,
                              (h5py.File, h5py.Group, h5py.Dataset)):
                continue
            if h5_object.name.split('/')[-1] == target_name:
                found_objects.append(h5_object)

    return found_objects
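A usage sketch with hypothetical object names:

# Pick out one object by its base name from a flat list of h5py objects
candidates = [h5_f['Raw_Data'], h5_f['Position_Indices'], h5_f['Position_Values']]
matches = get_h5_obj_refs(['Position_Indices'], candidates)
h5_pos_inds = matches[0]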
Example #7
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name, basename=None, is_spec=None,
                            verbose=False):
    """
    Creates new Ancillary Indices and Values datasets from the input datasets by dropping the specified dimensions

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Group under which the indices and values datasets will be created
    h5_inds : h5py.Dataset
        Spectroscopic or Position indices dataset
    h5_vals : h5py.Dataset
        Spectroscopic or Position values dataset
    dim_name : str or list of str
        Names of the dimension(s) to remove
    basename : str, optional
        String to which '_Indices' and '_Values' will be appended to get the names of the new datasets.
        Default = 'Position' or 'Spectroscopic'
    is_spec : bool, optional
        Set to True if the provided ancillary datasets are spectroscopic and False if they are
        position datasets. Users are recommended to supply this parameter whenever it is known.
        By default, this function will attempt to infer the answer from the shape of the datasets.
    verbose : bool, optional. Default = False
        Whether or not to print debugging statements

    Returns
    -------
    h5_inds_new : h5py.Dataset
        Reduced indices dataset
    h5_vals_new : h5py.Dataset
        Reduced values dataset

    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should either be a h5py.Group or h5py.File object')

    for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')
    if dim_name is None:
        raise ValueError('dim_name must be specified - at least one dimension must be removed')
    dim_name = validate_list_of_strings(dim_name, 'dim_name')

    all_dim_names = list(get_attr(h5_inds, 'labels'))
    for item in dim_name:
        if item not in all_dim_names:
            raise KeyError('Requested dimension: {} not in the list of labels: {}'.format(item, all_dim_names))

    ind_mat = h5_inds[()]
    val_mat = h5_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if ind_mat.shape[0] == ind_mat.shape[1]:
            raise ValueError('Unable to automatically determine whether the provided datasets are position or '
                             'spectroscopic. Please explicitly specify via the "is_spec" boolean kwarg')
        if ind_mat.shape[0] < ind_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))

    if basename is not None:
        basename = validate_single_string_arg(basename, 'basename')
        if basename.endswith('_'):
            basename = basename[:-1]
    else:
        if is_spec:
            basename = 'Spectroscopic'
        else:
            basename = 'Position'

    for sub_name in ['_Indices', '_Values']:
        if basename + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(basename + sub_name,
                                                                                     h5_parent_group.name))

    if set(dim_name) != set(all_dim_names):
        # At least one dimension will remain

        if verbose:
            print('All Dimensions: {}. Dimensions to be removed: {}'.format(all_dim_names, dim_name))

        if not is_spec:
            # Convert to spectral shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        # For all dimensions, find where the index = 0
        # basically, we are indexing all dimensions to 0
        first_indices = []
        keep_dim = np.ones(len(all_dim_names), dtype=bool)
        for cur_dim in dim_name:
            dim_ind = all_dim_names.index(cur_dim)
            keep_dim[dim_ind] = False
            # Check equality against the minimum value instead of 0 to account for cases when a
            # dimension does not start from 0 (it may already have been sliced) - think of
            # multi-dimensional slicing!
            first_indices.append(ind_mat[dim_ind] == np.min(ind_mat[dim_ind]))
        first_indices = np.vstack(first_indices)

        if verbose:
            print('Raw first_indices:')
            print(first_indices)
            print('Dimensions to keep: {}'.format(keep_dim))

        step_starts = np.all(first_indices, axis=0)

        if verbose:
            print('Columns in dataset to keep:')
            print(step_starts)

        '''
        Extract all rows that we want to keep from input indices and values
        '''
        # TODO: handle TypeError: Indexing elements must be in increasing order
        ind_mat = ind_mat[keep_dim, :][:, step_starts]
        val_mat = val_mat[keep_dim, :][:, step_starts]

        if not is_spec:
            # Convert back to position shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        '''
        Create new Datasets to hold the data
        Name them based on basename
        '''
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=ind_mat, dtype=h5_inds.dtype)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=val_mat, dtype=h5_vals.dtype)
        # Extracting the labels from the original spectroscopic data sets
        labels = h5_inds.attrs['labels'][keep_dim]

        # Adding the labels and units to the new spectroscopic data sets
        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': labels, 'units': h5_inds.attrs['units'][keep_dim]})

    else:
        # Remove all dimensions:
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=np.array([[0]]), dtype=INDICES_DTYPE)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=np.array([[0]]), dtype=VALUES_DTYPE)

        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': ['Single_Step'], 'units': ['a. u.']})

    return h5_inds_new, h5_vals_new
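A usage sketch, assuming spectroscopic ancillary datasets with a hypothetical 'Frequency' dimension that should be dropped:

# Writes 'Reduced_Spectroscopic_Indices' / '_Values' into h5_group
h5_spec_inds_red, h5_spec_vals_red = write_reduced_anc_dsets(
    h5_group, h5_spec_inds, h5_spec_vals, 'Frequency',
    basename='Reduced_Spectroscopic', is_spec=True)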
Example #8
def get_unit_values(ds_inds,
                    ds_vals,
                    dim_names=None,
                    all_dim_names=None,
                    is_spec=None,
                    verbose=False):
    """
    Gets the unit arrays of values that describe the spectroscopic dimensions

    Parameters
    ----------
    ds_inds : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Indices dataset
    ds_vals : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Values dataset
    dim_names : str, or list of str, Optional
        Names of the dimensions of interest. Default = all
    all_dim_names : list of str, Optional
        Names of all the dimensions in these datasets. Use this if supplying numpy arrays instead of
        h5py.Dataset objects for ds_inds and ds_vals, since there is no other way of getting the dimension names.
    is_spec : bool, optional
        Set to True if the provided ancillary datasets are spectroscopic and False if they are
        position datasets. Users are recommended to supply this parameter whenever it is known.
        By default, this function will attempt to infer the answer from the shape of the datasets.
    verbose : bool, optional. Default = False
        Whether or not to print debugging statements

    Returns
    -------
    unit_values : dict
        Dictionary containing the unit array for each dimension. The names of the dimensions are the keys.

    Notes
    -----
    This function can be extended / modified for ancillary position dimensions as well

    """
    if all_dim_names is None:
        allowed_types = h5py.Dataset
    else:
        all_dim_names = validate_list_of_strings(all_dim_names,
                                                 'all_dim_names')
        all_dim_names = np.array(all_dim_names)
        allowed_types = (h5py.Dataset, np.ndarray)

    for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
        if not isinstance(dset, allowed_types):
            raise TypeError(dset_name +
                            ' should be of type: {}'.format(allowed_types))

    # For now, we will throw an error if even a single dimension is listed as an incomplete dimension:
    if isinstance(ds_inds, h5py.Dataset):
        if np.any([
                'incomplete_dimensions' in dset.attrs.keys()
                for dset in [ds_inds, ds_vals]
        ]):
            try:
                incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_inds = None
            try:
                incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_vals = None
            if incomp_dims_inds is None and incomp_dims_vals is not None:
                incomp_dims = incomp_dims_vals
            elif incomp_dims_inds is not None and incomp_dims_vals is None:
                incomp_dims = incomp_dims_inds
            else:
                # ensure that both attributes are the same
                if incomp_dims_vals != incomp_dims_inds:
                    raise ValueError(
                        'Provided indices ({}) and values ({}) datasets were marked with different values '
                        'for incomplete_dimensions.'.format(
                            incomp_dims_inds, incomp_dims_vals))
                incomp_dims = incomp_dims_vals

            all_dim_names = get_attr(ds_inds, 'labels')
            raise ValueError(
                'Among all dimensions: {}, these dimensions were marked as '
                'incomplete: {}. You are recommended to find the unit values '
                'manually'.format(all_dim_names, incomp_dims))

    # Do we need to check that the provided inds and vals correspond to the same main dataset?
    if ds_inds.shape != ds_vals.shape:
        raise ValueError(
            'ds_inds: {} and ds_vals: {} should have the same shapes'.format(
                ds_inds.shape, ds_vals.shape))

    if all_dim_names is None:
        all_dim_names = get_attr(ds_inds, 'labels')
    if verbose:
        print('All dimensions: {}'.format(all_dim_names))

    # First load to memory
    inds_mat = ds_inds[()]
    vals_mat = ds_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if inds_mat.shape[0] < inds_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError(
                'is_spec should be a boolean. Provided object is of type: {}'.
                format(type(is_spec)))

    if verbose:
        print(
            'Ancillary matrices of shape: {}, hence determined to be Spectroscopic:{}'
            .format(inds_mat.shape, is_spec))

    if not is_spec:
        # Convert to spectral shape
        inds_mat = np.transpose(inds_mat)
        vals_mat = np.transpose(vals_mat)

    if len(all_dim_names) != inds_mat.shape[0]:
        raise ValueError(
            'Length of dimension names list: {} not matching with shape of dataset: {}'
            '.'.format(len(all_dim_names), inds_mat.shape[0]))

    if dim_names is None:
        dim_names = all_dim_names
        if verbose:
            print('Going to return unit values for all dimensions: {}'.format(
                all_dim_names))
    else:
        dim_names = validate_list_of_strings(dim_names, 'dim_names')

        if verbose:
            print(
                'Checking to make sure that the target dimension names: {} exist in the datasets attributes: {}'
                '.'.format(dim_names, all_dim_names))

        # check to make sure that the dimension names exist in the datasets:
        for dim_name in dim_names:
            if dim_name not in all_dim_names:
                raise KeyError(
                    'Dimension {} does not exist in the provided ancillary datasets'
                    .format(dim_name))

    unit_values = dict()
    for dim_name in all_dim_names:
        # Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
        if verbose:
            print('Looking for dimension: {} in {}'.format(
                dim_name, dim_names))
        desired_row_ind = np.where(all_dim_names == dim_name)[0][0]

        inds_for_dim = inds_mat[desired_row_ind]
        # Wherever this dimension goes to 0 - start of a new tile
        starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]
        if starts[0] != 0:
            raise ValueError('Spectroscopic Indices for dimension: "{}" not '
                             'starting with 0. Please fix this and try again'
                             '.'.format(dim_name))

        # There may be repetitions in addition to tiling. Find how the indices increase.
        # 1 = repetition, > 1 = start of a new tile
        step_sizes = np.hstack(([1], np.diff(starts)))
        # This array has the same length as ``starts``

        # We should expect only two values of step sizes for a regular dimension (tiles of the same size):
        # 1 for same value repeating and a big jump in indices when the next tile starts
        # If the repeats / tiles are of different lengths, then this is not a regular dimension.
        # What does a Unit Values vector even mean in this case? Just raise an error for now
        if np.where(np.unique(step_sizes) - 1)[0].size > 1:
            raise ValueError('Non-constant step sizes')

        # Finding Start of a new tile
        tile_starts = np.where(step_sizes > 1)[0]

        # Convert these indices into tile boundaries within the full indices vector
        if len(tile_starts) < 1:
            # Dimension(s) with no tiling at all
            # Make it look as though the next tile starts at the end of the whole indices vector
            tile_starts = np.array([0, len(inds_for_dim)])
        else:
            # Dimension with some form of repetition
            tile_starts = np.hstack(([0], starts[tile_starts]))

            # Verify that each tile is identical here
            # Last tile will not be checked unless we add the length of the indices vector as the start of next tile
            tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
            subsections = [
                inds_for_dim[tile_starts[ind]:tile_starts[ind + 1]]
                for ind in range(len(tile_starts) - 1)
            ]
            if np.max(np.diff(subsections, axis=0)) != 0:
                # Should get unit values for ALL dimensions regardless of expectations to catch such scenarios.
                raise ValueError(
                    'Values in each tile of dimension: {} are different'.
                    format(dim_name))

        # Now looking within the first tile:
        subsection = inds_for_dim[tile_starts[0]:tile_starts[1]]
        # remove all repetitions. ie - take indices only where jump == 1
        step_inds = np.hstack(
            ([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))
        # Finally, use these indices to get the values
        if dim_name in dim_names:
            # Only add this dimension to the dictionary if requested
            unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]

    return unit_values
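A usage sketch, assuming spectroscopic ancillary datasets that describe hypothetical 'Bias' and 'Cycle' dimensions:

# Unit vector for just the 'Bias' dimension
unit_vals = get_unit_values(h5_spec_inds, h5_spec_vals, dim_names='Bias')
bias_vec = unit_vals['Bias']  # 1D numpy array spanning a single cycle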