Example No. 1
# Assumed module context: ``hut`` refers to sidpy.hdf.hdf_utils and
# ``py_nsid_version`` holds pyNSID's version string
def write_book_keeping_attrs(h5_obj):
    """
    Writes basic book-keeping and posterity-related attributes, such as the
    machine ID, pyNSID version, and timestamp, to groups created in pyNSID.

    Parameters
    ----------
    h5_obj : :class:`h5py.Dataset`, :class:`h5py.Group`, or :class:`h5py.File`
        Object to which basic book-keeping attributes need to be written

    """
    hut.write_book_keeping_attrs(h5_obj)
    hut.write_simple_attrs(h5_obj, {'pyNSID_version': py_nsid_version})
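
A minimal usage sketch for this helper. The import path below is an assumption (the public layout of pyNSID may differ), and the file name is purely illustrative:

import h5py
from pyNSID.io.hdf_utils import write_book_keeping_attrs  # assumed import path

with h5py.File('bookkeeping_demo.h5', 'a') as h5_file:
    h5_grp = h5_file.create_group('Measurement_000')
    # Stamps attributes such as the machine ID, timestamp, and pyNSID version
    write_book_keeping_attrs(h5_grp)
    print(dict(h5_grp.attrs))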
Example No. 2
def write_pynsid_book_keeping_attrs(h5_object):
    """
    Writes book-keeping information to the HDF5 object

    Parameters
    ----------
    h5_object

    Returns
    -------

    """
    write_book_keeping_attrs(h5_object)
    write_simple_attrs(h5_object, {'pyNSID_version': pynsid_version})
Example No. 3
def create_indexed_group(h5_parent_group, base_name):
    """
    Creates a group with an indexed name (eg - 'Measurement_012') under
    ``h5_parent_group`` using the provided ``base_name`` as a prefix for the
    group's name

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        File or group within which the new group will be created
    base_name : str or unicode
        Prefix for the group name. This need not end with a '_'. It will be
        added automatically
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or Group object')
    base_name = validate_single_string_arg(base_name, 'base_name')

    group_name = assign_group_index(h5_parent_group, base_name)
    h5_new_group = h5_parent_group.create_group(group_name)
    write_book_keeping_attrs(h5_new_group)
    return h5_new_group
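
A short sketch of the auto-indexing behavior, under the same assumed import path; successive calls with the same prefix should yield 'Measurement_000', 'Measurement_001', and so on:

import h5py
from pyNSID.io.hdf_utils import create_indexed_group  # assumed import path

with h5py.File('indexed_demo.h5', 'a') as h5_file:
    grp_0 = create_indexed_group(h5_file, 'Measurement')
    grp_1 = create_indexed_group(h5_file, 'Measurement')
    print(grp_0.name, grp_1.name)  # expected: /Measurement_000 /Measurement_001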
Example No. 4
def write_nsid_dataset(dataset,
                       h5_group,
                       main_data_name='',
                       verbose=False,
                       **kwargs):
    """
    Writes the provided sid dataset as a 'Main' dataset with all appropriate
    linking.

    Parameters
    ----------
    dataset : sidpy.Dataset
        Dataset to be written to HDF5 in NSID format
    h5_group : class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character
        Use this to provide better context about the dataset in the HDF5 file
    verbose : bool, Optional. Default = False
        Whether or not to write logs to standard out
    kwargs: dict
        additional keyword arguments passed on to h5py when writing data

    Return
    ------
    h5py dataset
    """
    if not isinstance(dataset, Dataset):
        raise TypeError('data to write should be sidpy Dataset')
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_group should be a h5py.File or h5py.Group '
                        'object')
    if not isinstance(main_data_name, str):
        raise TypeError('main_data_name should be a string, but instead it is '
                        '{}'.format(type(main_data_name)))

    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. '
             'Reformatted name from {} to {}'
             ''.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    h5_group = h5_group.create_group(main_data_name)

    write_book_keeping_attrs(h5_group)
    write_pynsid_book_keeping_attrs(h5_group)

    #####################
    # Write Main Dataset
    ####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. '
                 'Compression kwarg has been removed')

    if main_data_name in h5_group:
        raise ValueError('h5 dataset of that name already exists, choose '
                         'different name or delete first')

    _ = kwargs.pop('dtype', None)

    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name,
                                      shape=dataset.shape,
                                      dtype=dataset.dtype,
                                      **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'
              ''.format(h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
              ''.format(h5_main.name, h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dataset._axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError('Dimension {} is not a sidpy Dimension'
                             ''.format(i))

        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        attrs_to_write = {
            'name': this_dim.name,
            'units': this_dim.units,
            'quantity': this_dim.quantity,
            'dimension_type': this_dim.dimension_type.name
        }

        write_simple_attrs(this_dim_dset, attrs_to_write)
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': dataset.quantity,
        'units': dataset.units,
        'main_data_name': dataset.title,
        'data_type': dataset.data_type.name,
        'modality': dataset.modality,
        'source': dataset.source
    }

    write_simple_attrs(h5_main, attrs_to_write)
    write_pynsid_book_keeping_attrs(h5_main)

    for attr_name in dir(dataset):
        attr_val = getattr(dataset, attr_name)
        if isinstance(attr_val, dict):
            if verbose:
                print('Writing attributes from property: {} of the '
                      'sidpy.Dataset'.format(attr_name))
            write_dict_to_h5_group(h5_group, attr_val, attr_name)

    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)

    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    dataset.h5_dataset = nsid_data_main

    return nsid_data_main
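
A usage sketch, assuming sidpy's Dataset.from_array constructor and an assumed import path for this writer:

import h5py
import numpy as np
import sidpy
from pyNSID.io.hdf_io import write_nsid_dataset  # assumed import path

# Build a small sidpy Dataset; generic dimensions are created automatically
dataset = sidpy.Dataset.from_array(np.random.rand(4, 128), title='current_spectra')
dataset.quantity = 'Current'
dataset.units = 'nA'

with h5py.File('nsid_demo.h5', 'a') as h5_file:
    h5_grp = h5_file.create_group('Measurement_000')
    h5_main = write_nsid_dataset(dataset, h5_grp, verbose=True)
    print(h5_main)  # an h5py.Dataset linked as an NSID 'Main' dataset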
Example No. 5
def write_results(h5_group, dataset=None, attributes=None, process_name=None):
    """
    Writes results of a processing step back to HDF5 in NSID format

    Parameters
    ----------
    h5_group : h5py.Group
        HDF5 Group into which results will be written
    dataset : sidpy.Dataset or list of sidpy.Dataset, optional. Default = None
        Dataset(s) containing the results to be written
    attributes : dict, optional. Default = None
        Metadata regarding the processing step
    process_name : str, optional. Default = None
        Name appended to the 'Log_' prefix of the group that will contain the
        results

    Returns
    -------
    log_group : h5py.Group
        HDF5 group containing results
    """

    found_valid_dataset = False

    if dataset is not None:

        if isinstance(dataset, Dataset):
            dataset = [dataset]

        if isinstance(dataset, list):
            if not all([isinstance(itm, Dataset) for itm in dataset]):
                raise TypeError('List contains non-Sidpy dataset entries! '
                                'Should only contain sidpy datasets')

            found_valid_dataset = True

    found_valid_attributes = False

    if attributes is not None:
        if isinstance(attributes, dict):
            if len(attributes) > 0:
                found_valid_attributes = True
        else:
            raise TypeError("Provided attributes is type {} but should be type"
                            " dict".format(type(attributes)))

    if not (found_valid_dataset or found_valid_attributes):
        raise ValueError('results should contain at least one sidpy Dataset '
                         'or a dictionary of attributes')
    log_name = 'Log_'
    if process_name is not None:
        log_name = log_name + process_name

    log_group = create_indexed_group(h5_group, log_name)
    write_book_keeping_attrs(log_group)
    write_pynsid_book_keeping_attrs(log_group)

    if found_valid_dataset:
        for dset in dataset:
            write_nsid_dataset(dset, log_group)

    # Write attributes even when no dataset was provided, since the earlier
    # validation accepts attributes on their own
    if found_valid_attributes:
        write_simple_attrs(log_group, flatten_dict(attributes))

    return log_group
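
A sketch of logging a processing step. Both the import path and the metadata below are illustrative:

import h5py
import numpy as np
import sidpy
from pyNSID.io.hdf_io import write_results  # assumed import path

result = sidpy.Dataset.from_array(np.arange(16.0).reshape(4, 4), title='fit_result')
metadata = {'algorithm': 'least_squares', 'iterations': 3}

with h5py.File('nsid_demo.h5', 'a') as h5_file:
    log_grp = write_results(h5_file, dataset=result, attributes=metadata,
                            process_name='Fitting')
    print(log_grp.name)  # expected: /Log_Fitting_000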
Example No. 6
def create_results_group(h5_main, tool_name, h5_parent_group=None):
    """
    Creates an auto-indexed :class:`h5py.Group` named as
    'DatasetName-ToolName_00x'

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the dataset based on which the process / analysis is being
        performed
    tool_name : str
        Name of the Process / Analysis applied to h5_main
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be created. Use this
        option to write results into a new HDF5 file. By default, results will
        be written into the same group containing `h5_main`

    Returns
    -------
    h5_group : :class:`h5py.Group`
        Results group which can now house the results datasets
    """
    # TODO: Revise significantly. Avoid parent dataset name
    # Consider embedding refs to source datasets as attributes of group

    warn(
        'The behavior of create_results_group is very likely to change soon '
        'and significantly. Use this function with caution', FutureWarning)

    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if '-' in tool_name:
        warn('tool_name should not contain the "-" character. Reformatted '
             'name from {} to {}'.format(tool_name,
                                         tool_name.replace('-', '_')))
    tool_name = tool_name.replace('-', '_')

    group_name = h5_main.name.split('/')[-1] + '-' + tool_name + '_'
    group_name = assign_group_index(h5_parent_group, group_name)

    h5_group = h5_parent_group.create_group(group_name)

    write_book_keeping_attrs(h5_group)

    # Also add some basic attributes like source and tool name. This will
    # allow relaxation of nomenclature restrictions; these are NOT being used
    # right now but will be in subsequent versions of pyNSID
    write_simple_attrs(h5_group, {'tool': tool_name, 'num_source_dsets': 1})
    # in this case, there is only one source
    if h5_parent_group.file == h5_main.file:
        for dset_ind, dset in enumerate([h5_main]):
            h5_group.attrs['source_' + '{:03d}'.format(dset_ind)] = dset.ref

    return h5_group
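
A short sketch; note that the FutureWarning above applies, and the import path is again an assumption:

import h5py
from pyNSID.io.hdf_utils import create_results_group  # assumed import path

with h5py.File('results_demo.h5', 'a') as h5_file:
    h5_main = h5_file.create_dataset('Raw_Data', data=[1.0, 2.0, 3.0])
    h5_results = create_results_group(h5_main, 'Fitter')
    print(h5_results.name)  # expected: /Raw_Data-Fitter_000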