Example #1
    def test_fixed_ascii(self):
        dt = h5py.string_dtype(encoding='ascii', length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #2
    def test_fixed_utf8(self):
        dt = h5py.string_dtype(length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #3
    def test_vlen_ascii(self):
        dt = h5py.string_dtype(encoding='ascii')

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is bytes
Example #4
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is six.text_type
Example #5
File: __init__.py Project: vals/anndata
def _decode_structured_array(arr: np.ndarray,
                             dtype: Optional[np.dtype] = None,
                             copy: bool = False) -> np.ndarray:
    """
    h5py 3.0 now reads all strings as bytes. There is a helper method which can convert these to str,
    but there isn't anything equivalent for the fields of structured dtypes.

    Params
    ------
    arr
        An array with a structured dtype
    dtype
        dtype of the array. This is checked for h5py string data types.
        Passing this is allowed for cases where the array may have been processed by another function beforehand.
    copy
        Whether to decode a copy of ``arr`` instead of modifying it in place.
    """
    if copy:
        arr = arr.copy()
    if dtype is None:
        dtype = arr.dtype
    # codecs.decode is 2x slower than this lambda, go figure
    decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
    for k, (dt, _) in dtype.fields.items():
        check = h5py.check_string_dtype(dt)
        if check is not None and check.encoding == "utf-8":
            decode(arr[k], out=arr[k])
    return arr
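A minimal usage sketch for the helper above (the field names are illustrative, and it assumes numpy keeps the h5py string metadata on structured fields, as the compound-dtype tests elsewhere on this page rely on):

import numpy as np
import h5py

# Hypothetical structured dtype: one variable-length UTF-8 string field, one int field.
dt = np.dtype([("name", h5py.string_dtype()), ("count", np.int32)])
raw = np.array([(b"alpha", 1), (b"beta", 2)], dtype=dt)  # as h5py >= 3.0 would return it

decoded = _decode_structured_array(raw, copy=True)
print(decoded["name"][0])  # "alpha" instead of b"alpha"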
Example #6
    def __getitem__(self, key):
        value = self._node[key]
        if isinstance(value, h5py.Group):
            return HDF5Adapter(value)
        else:
            if value.dtype == numpy.dtype("O"):
                warnings.warn(
                    f"The dataset {key} is of object type, using a "
                    "Python-only feature of h5py that is not supported by "
                    "HDF5 in general. Read more about that feature at "
                    "https://docs.h5py.org/en/stable/special.html. "
                    "Consider using a fixed-length field instead. "
                    "Tiled will serve an empty placeholder, unless the "
                    "object is of size 1, where it will attempt to repackage "
                    "the data into a numpy array."
                )

                check_str_dtype = h5py.check_string_dtype(value.dtype)
                if check_str_dtype.length is None:
                    dataset_names = value.file[self._node.name + "/" + key][...][()]
                    if value.size == 1:
                        arr = MockHDF5Dataset(numpy.array(dataset_names), {})
                        return HDF5DatasetAdapter(arr)
                return HDF5DatasetAdapter(MockHDF5Dataset(numpy.array([]), {}))
            return HDF5DatasetAdapter(value)
Example #7
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is str
Example #8
def _read_attr_hdf5(attrs: h5py.AttributeManager,
                    name: str,
                    default: Optional[Any] = Empty):
    """
    Read an HDF5 attribute and perform all necessary conversions.

    At the moment, this only implements conversions for string attributes; other types
    are passed through. String conversion is needed for compatibility with other languages.
    For example, Julia's HDF5.jl writes string attributes as fixed-size strings, which
    are read back as bytes by h5py.
    """
    if name not in attrs and default is not Empty:
        return default
    attr = attrs[name]
    attr_id = attrs.get_id(name)
    dtype = h5py.check_string_dtype(attr_id.dtype)
    if dtype is None:
        return attr
    else:
        if dtype.length is None:  # variable-length string, no problem
            return attr
        elif len(attr_id.shape) == 0:  # Python bytestring
            return attr.decode("utf-8")
        else:  # NumPy array
            return [decode(s, "utf-8") for s in attr]
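A minimal sketch, with made-up attribute names, of the distinction this helper handles: under h5py >= 3 a variable-length string attribute already comes back as str, while a fixed-length one (as written by e.g. Julia's HDF5.jl) comes back as bytes and reports a non-None length:

import numpy as np
import h5py

with h5py.File("attr_demo.h5", "w", driver="core", backing_store=False) as f:
    f.attrs["vlen"] = "text"                          # variable-length UTF-8 string
    f.attrs["fixed"] = np.array(b"text", dtype="S4")  # fixed-length string
    for name in ("vlen", "fixed"):
        info = h5py.check_string_dtype(f.attrs.get_id(name).dtype)
        print(name, info.encoding, info.length)       # length is None only for "vlen"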
Example #9
 def test_vlen_bytes(self):
     """ Vlen bytes dataset maps to vlen ascii in the file """
     dt = h5py.string_dtype(encoding='ascii')
     ds = self.f.create_dataset('x', (100,), dtype=dt)
     tid = ds.id.get_type()
     self.assertEqual(type(tid), h5py.h5t.TypeStringID)
     self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)
     string_info = h5py.check_string_dtype(ds.dtype)
     self.assertEqual(string_info.encoding, 'ascii')
Example #10
 def test_vlen_unicode(self):
     """ Vlen unicode dataset maps to vlen utf-8 in the file """
     dt = h5py.string_dtype()
     ds = self.f.create_dataset('x', (100,), dtype=dt)
     tid = ds.id.get_type()
     self.assertEqual(type(tid), h5py.h5t.TypeStringID)
     self.assertEqual(tid.get_cset(), h5py.h5t.CSET_UTF8)
     string_info = h5py.check_string_dtype(ds.dtype)
     self.assertEqual(string_info.encoding, 'utf-8')
Example #11
def h5py_read_string(dataset):
    if version('h5py') >= '3':
        # In h5py >= 3.0.0, string data is no longer converted to str
        # automatically, so we have to do it manually...
        string_dtype = h5py.check_string_dtype(dataset.dtype)
        if string_dtype is not None and string_dtype.encoding == 'utf-8':
            dataset = dataset.asstr()

    return dataset[()]
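A hedged usage sketch (assuming `version` above resolves to something like importlib.metadata.version), round-tripping a UTF-8 string through an in-memory file:

import h5py

with h5py.File("string_demo.h5", "w", driver="core", backing_store=False) as f:
    f.create_dataset("label", data="hello", dtype=h5py.string_dtype())
    assert h5py_read_string(f["label"]) == "hello"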
Example #12
    def test_compound(self):

        fields = []
        fields.append(('field_1', h5py.string_dtype()))
        fields.append(('field_2', np.int32))
        dt = np.dtype(fields)
        self.f['mytype'] = np.dtype(dt)
        dt_out = self.f['mytype'].dtype.fields['field_1'][0]
        string_inf = h5py.check_string_dtype(dt_out)
        self.assertEqual(string_inf.encoding, 'utf-8')
Example #13
    def __getitem__(self, key):
        import h5py

        if key in _HIDDEN_ATTRS:
            raise KeyError(key)
        # see https://github.com/h5netcdf/h5netcdf/issues/94 for details
        if isinstance(self._h5attrs[key], h5py.Empty):
            string_info = h5py.check_string_dtype(self._h5attrs[key].dtype)
            if string_info and string_info.length == 1:
                return b""
        return self._h5attrs[key]
Example #14
File: storage.py Project: virelay/corelay
 def _unpack(key, value):
     if key.isdigit():
         key = int(key)
     if isinstance(value, h5py.Dataset):
         check = h5py.check_string_dtype(value.dtype)
         value = value[()]
         if check is not None:
             value = value.decode(check.encoding)
     elif isinstance(value, h5py.Group):
         # Change the key to an integer if it is a digit, so that the dict can be used like a tuple or list
         value = OrderedDict(
             (HDF5Storage._unpack(k, v) for k, v in value.items()))
     return key, value
Example #15
def test_create_dataset(hfile):
    with h5py.File(hfile, "w") as root:
        root.create_dataset("item_name", data="the value")

    assert os.path.exists(hfile)
    tree = h5tree.Hdf5TreeView(hfile)
    assert tree.filename.endswith(hfile)
    assert not tree.isNeXus

    with h5py.File(hfile, "r") as h5root:
        item = h5root["item_name"]
        assert item[()] == b"the value"
        assert h5py.check_string_dtype(item.dtype)
Example #16
 def test_fixed_bytes(self):
     """ Fixed-length bytes dataset maps to fixed-length ascii in the file
     """
     dt = np.dtype("|S10")
     ds = self.f.create_dataset('x', (100,), dtype=dt)
     tid = ds.id.get_type()
     self.assertEqual(type(tid), h5py.h5t.TypeStringID)
     self.assertFalse(tid.is_variable_str())
     self.assertEqual(tid.get_size(), 10)
     self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)
     string_info = h5py.check_string_dtype(ds.dtype)
     self.assertEqual(string_info.encoding, 'ascii')
     self.assertEqual(string_info.length, 10)
Example #17
File: readers.py Project: kipoi/kipoi
def _h5py_dataset_iterator(g, prefix=''):
    import h5py
    for key in g:
        item = g[key]
        path = '{}/{}'.format(prefix, key)
        if isinstance(item, h5py.Dataset):  # test for dataset
            if Version(h5py.__version__) > Version('2.10.0'):
                string_type = h5py.check_string_dtype(item.dtype)
                if (string_type is not None) and (string_type.encoding
                                                  == "utf-8"):
                    item = item.asstr()[:]
            yield (path, item)
        elif isinstance(item, h5py.Group):  # test for group (go down)
            for x in _h5py_dataset_iterator(item, path):
                yield x
Example #18
    def _column(self, k):
        cached_data = self._cache.get(k)
        if cached_data is not None:
            return cached_data

        dset = self._hf.get(k)
        if dset is not None:
            if h5py.check_string_dtype(dset.dtype):
                py_data = dset.asstr()
            else:
                enum_dict = h5py.check_enum_dtype(dset.dtype)
                if enum_dict:
                    inv_enum_dict = dict((i, k) for k, i in enum_dict.items())
                    py_data = [inv_enum_dict[x] for x in np.array(dset)]
                else:
                    py_data = np.array(dset)
        else:
            py_data = None

        self._cache[k] = py_data
        return py_data
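A minimal sketch of the enum round-trip this method relies on, with made-up names: h5py.check_enum_dtype recovers the name-to-value mapping, which is then inverted to turn the stored integers back into their labels:

import numpy as np
import h5py

with h5py.File("enum_demo.h5", "w", driver="core", backing_store=False) as f:
    dt = h5py.enum_dtype({"red": 0, "green": 1, "blue": 2}, basetype="i1")
    f.create_dataset("color", data=np.array([2, 0, 1], dtype="i1"), dtype=dt)

    enum_dict = h5py.check_enum_dtype(f["color"].dtype)
    inv_enum_dict = {v: k for k, v in enum_dict.items()}
    print([inv_enum_dict[x] for x in f["color"][...]])  # ['blue', 'red', 'green']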
Example #19
def hdfgetdata(gID,field):
        val = gID.get(field)
        if val is None:
            return val

        if h5py.check_string_dtype(val.dtype):
            # string
            if val.len()==1:
                val=val[0].tobytes().decode('ascii')
                return val
            else:
                val2=[];
                for x in val:
                    val2.append(x.tobytes().decode('ascii'))
                val2=np.array(val2)
                return val2
        val=np.array(val)

        if(val.ndim==1 and len(val)==1):
            val=val[0]

        return val
Example #20
def _update_pdb_dsets(file: h5py.File, name: str,
                      logger: Optional[Logger] = None) -> Optional[PDBContainer]:
    """Check for and update pre dataCAT 0.3 style databases."""
    if not isinstance(file.get(name), h5py.Dataset):
        return None
    elif logger is not None:
        logger.info(f'Updating h5py Dataset to data-CAT >= 0.3 style: {name!r}')

    mol_list = [from_pdb_array(pdb, rdmol=False, warn=False) for pdb in file[name]]
    m = len(mol_list)
    del file[name]

    dtype = IDX_DTYPE[name]
    scale = np.rec.array(None, dtype=dtype, shape=(m,))
    if dtype.fields is not None and scale.size:
        # Ensure that the sentinel value for vlen strings is an empty string, not `None`
        elem = list(scale.item(0))
        iterator = (v for v, *_ in dtype.fields.values())
        for i, sub_dt in enumerate(iterator):
            if h5py.check_string_dtype(sub_dt) is not None:
                elem[i] = ''
        scale[:] = tuple(elem)
    return PDBContainer.from_molecules(mol_list, scale=scale)
Example #21
def read_dataset(dataset: h5py.Dataset):
    if H5PY_V3:
        string_dtype = h5py.check_string_dtype(dataset.dtype)
        if (string_dtype is not None) and (string_dtype.encoding == "utf-8"):
            dataset = dataset.asstr()
    value = dataset[()]
    if not hasattr(value, "dtype"):
        return value
    elif isinstance(value.dtype, str):
        pass
    elif issubclass(value.dtype.type, np.string_):
        value = value.astype(str)
        # Backwards compat, old datasets have strings as one element 1d arrays
        if len(value) == 1:
            return value[0]
    elif len(value.dtype.descr) > 1:  # Compound dtype
        # For backwards compat, now strings are written as variable length
        dtype = value.dtype
        value = _from_fixed_length_strings(value)
        if H5PY_V3:
            value = _decode_structured_array(value, dtype=dtype)
    if value.shape == ():
        value = value[()]
    return value
Example #22
 def __getitem__(self, key):
     if getattr(self._root, "decode_vlen_strings", False):
         string_info = h5py.check_string_dtype(self._h5ds.dtype)
         if string_info and string_info.length is None:
             return self._h5ds.asstr()[key]
     return self._h5ds[key]
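A minimal sketch of the behavior this adapter toggles: under h5py >= 3 a variable-length string dataset yields bytes by default, and str only through .asstr():

import h5py

with h5py.File("vlen_demo.h5", "w", driver="core", backing_store=False) as f:
    ds = f.create_dataset("s", data=["a", "b"], dtype=h5py.string_dtype())
    print(ds[0])          # b'a' with h5py >= 3
    print(ds.asstr()[0])  # 'a'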
Example #23
def index_to_pandas(dset: h5py.Dataset,
                    fields: None | Sequence[str] = None) -> pd.MultiIndex:
    """Construct an MultiIndex from the passed ``index`` dataset.

    Examples
    --------
    .. testsetup:: python

        >>> from dataCAT.testing_utils import HDF5_READ as filename

    .. code:: python

        >>> from dataCAT import index_to_pandas
        >>> import h5py

        >>> filename = str(...)  # doctest: +SKIP

        # Convert the entire dataset
        >>> with h5py.File(filename, "r") as f:
        ...     dset: h5py.Dataset = f["ligand"]["index"]
        ...     index_to_pandas(dset)
        MultiIndex([('O=C=O', 'O1'),
                    ('O=C=O', 'O3'),
                    ( 'CCCO', 'O4')],
                   names=['ligand', 'ligand anchor'])

        # Convert a subset of fields
        >>> with h5py.File(filename, "r") as f:
        ...     dset = f["ligand"]["index"]
        ...     index_to_pandas(dset, fields=["ligand"])
        MultiIndex([('O=C=O',),
                    ('O=C=O',),
                    ( 'CCCO',)],
                   names=['ligand'])

    Parameters
    ----------
    dset : :class:`h5py.Dataset`
        The relevant ``index`` dataset.
    fields : :class:`Sequence[str]<collections.abc.Sequence>`
        The names of the ``index`` fields that are to-be included in the
        returned MultiIndex. If :data:`None`, include all fields.

    Returns
    -------
    :class:`pandas.MultiIndex`
        A multi-index constructed from the passed dataset.

    """
    # Fast-path for non-void-based datasets
    if dset.dtype.fields is None:
        if h5py.check_string_dtype(dset.dtype):
            ar = dset[:].astype(str)
        elif h5py.check_vlen_dtype(dset.dtype):
            ar = _vlen_to_tuples(dset[:])
        else:
            ar = dset[:]
        return pd.MultiIndex.from_arrays([ar])

    # Parse the `fields` parameter
    if fields is None:
        field_names = list(dset.dtype.fields.keys())
        iterator = ((name, f_dtype)
                    for name, (f_dtype, *_) in dset.dtype.fields.items())
    else:
        field_names = list(fields)
        iterator = ((name, dset.dtype.fields[name][0]) for name in fields)
    if len(field_names) == 0:
        raise ValueError("At least one field is required")

    fields_lst = []
    index_ar = dset[:]
    for name, field_dtype in iterator:
        # It's a bytes-string; decode it
        if h5py.check_string_dtype(field_dtype):
            ar = index_ar[name].astype(str)

        # It's a h5py `vlen` dtype; convert it into a list of tuples
        elif h5py.check_vlen_dtype(field_dtype):
            ar = _vlen_to_tuples(index_ar[name])

        else:
            ar = index_ar[name]
        fields_lst.append(ar)
    return pd.MultiIndex.from_arrays(fields_lst, names=field_names)
Example #24
def validate(filename,fileOut=None):
     fileID = h5py.File(filename, 'r')
     formatVersion=hdfgetdata(fileID,"/formatVersion")

     def getallnames(gID,lst):
         if isinstance(gID, h5py.Dataset):
             lst.append(gID.name)
         else:
            for x in gID:
                getallnames(gID[x],lst)

     def checkdim(field,fID,foundInvalid,lstInvalid):
         val = fID.get(field);

         if "Pos2D" in field or "Pos3D" in field:
             dim = 2
         elif "dataTimeSeries" in field:
             dim = 2
         else:
             dim = 1
         if dim != len(val.dims):
             return False

     lst=[]

     getallnames(fileID,lst)

     if fileOut == None:
         print('-' * 40)
         print('SNIRF Validator')
         print('Version 1.0')
         print('written by T. Huppert')
         print()
         print('File = {0}'.format(filename))
         print('Version = {0}'.format(formatVersion))
         print('-' * 40)

         foundInvalid=0;

         lstInvalid=[]

         for x in lst:
            print(Fore.WHITE + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):
                # string
                if val.len()==1:
                    val=val[0].tobytes().decode('ascii')
                    print('\tHDF5-STRING: {0}'.format(val))
                else:
                    val2=[];
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2=np.array(val2)
                    print('\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val=np.array(val)
                if(val.ndim==1 and len(val)==1):
                    val=val[0]
                    print('\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim==1:

                     print('\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                     print('\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(len(val),int(val.size/len(val))))

            dimcheck = checkdim(x, fileID, foundInvalid, lstInvalid)
            if dimcheck == False:
                val = len(fileID.get(x).dims)
                if val == 1:
                    print(Fore.RED +'\tINVALID dimensions(Expected Number of Dimensions: 2)')
                else:
                    print(Fore.RED +'\tINVALID dimensions(Expected Number of Dimensions: 1)')
                foundInvalid=foundInvalid+1;
                lstInvalid.append(x)

            if "/aux" in x or "/stim" in x:
                if isrequired(x) == True:
                    print(Fore.BLUE + '\t\tRequired field when optional parent object is included')
                elif isoptional(x):
                    print(Fore.GREEN + '\t\tOptional field when optional parent object is included')
                else:
                    print(Fore.RED + '\t\tINVALID field')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            else:
                if isrequired(x) == True:
                    print(Fore.BLUE + '\t\tRequired field')
                elif isoptional(x):
                    print(Fore.GREEN + '\t\tOptional field')
                else:
                    print(Fore.RED + '\t\tINVALID field')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)

         print('-' * 40)
         if(len(lstInvalid)!=0):
              print(Fore.RED+ "File is INVALID")
              print(Fore.RED +'\tINVALID ENTRIES FOUND')
              for x in lstInvalid:
                  print(Fore.RED + x)
         else:
              print(Fore.WHITE+ "File is VALID")
         print(Style.RESET_ALL)
     else: # write to file
         text_file = open(fileOut, "w")
         text_file.write('\n' + '\n' + '-' * 40)
         text_file.write('\n' + '\n' + 'SNIRF Validator')
         text_file.write('\n' + '\n' + 'Version 1.0')
         text_file.write('\n' + 'written by T. Huppert')
         text_file.write('\n')
         text_file.write('\n' + 'File = {0}'.format(filename))
         text_file.write('\n' + 'Version = {0}'.format(formatVersion))
         text_file.write('\n' + '-' * 40)

         foundInvalid=0;

         lstInvalid=[]

         for x in lst:
            text_file.write('\n' + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):
                # string
                if val.len()==1:
                    val=val[0].tobytes().decode('ascii')
                    text_file.write('\n' + '\tHDF5-STRING: {0}'.format(val))
                else:
                    val2=[];
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2=np.array(val2)
                    text_file.write('\n' + '\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val=np.array(val)
                if(val.ndim==1 and len(val)==1):
                    val=val[0]
                    text_file.write('\n' + '\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim==1:

                     text_file.write('\n' + '\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                     text_file.write('\n' + '\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(len(val),int(val.size/len(val))))

            dimcheck = checkdim(x, fileID, foundInvalid, lstInvalid)
            if dimcheck == False:
                val = len(fileID.get(x).dims)
                if val == 1:
                    text_file.write(Fore.RED +'\tINVALID dimensions(Expected Number of Dimensions: 2)')
                else:
                    text_file.write(Fore.RED +'\tINVALID dimensions(Expected Number of Dimensions: 1)')
                foundInvalid=foundInvalid+1
                lstInvalid.append(x)

            if isrequired(x)==True:
                text_file.write('\n' + '\t\tRequired field')
            elif isoptional(x):
                text_file.write('\n' + '\t\tOptional field')
            else:
                text_file.write('\n' + '\t\tINVALID field')
                foundInvalid=foundInvalid+1
                lstInvalid.append(x)

         text_file.write('\n' + '-' * 40)
         if(len(lstInvalid)!=0):
              text_file.write('\n' + "File is INVALID")
              text_file.write('\n' + '\tINVALID ENTRIES FOUND')
              for x in lstInvalid:
                  text_file.write('\n' +  x)
         else:
              text_file.write('\n' + "File is VALID")
         text_file.close()
     return (foundInvalid==0)
Example #25
def validate(filename, fileOut=None):
    fileID = h5py.File(filename, 'r')
    formatVersion = hdfgetdata(fileID, "/formatVersion")

    def getallnames(gID, lst):
        if isinstance(gID, h5py.Dataset):
            lst.append(gID.name)
        else:
            for x in gID:
                getallnames(gID[x], lst)

    lst = []

    getallnames(fileID, lst)

    if fileOut == None:
        print('-' * 40)
        print('SNIRF Validator')
        print('Version 1.0')
        print('written by T. Huppert')
        print()
        print('File = {0}'.format(filename))
        print('Version = {0}'.format(formatVersion))
        print('-' * 40)

        foundInvalid = 0

        lstInvalid = []

        for x in lst:
            print(Fore.WHITE + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):
                # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    print('\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = []
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2 = np.array(val2)
                    print('\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if (val.ndim == 1 and len(val) == 1):
                    val = val[0]
                    print('\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:

                    print('\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    print('\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(
                        len(val), int(val.size / len(val))))

            if "Pos2D" in x:
                if val.ndim != 2 or int(val.size / len(val)) != 2:
                    print('\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            if "Pos3D" in x:
                if val.ndim != 2 or int(val.size / len(val)) != 3:
                    print(Fore.RED + '\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            if "dataTimeSeries" in x:
                if int(val.size / len(val)) > len(val):
                    print(Fore.RED + '\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            if ("stim" in x) and ("data" in x):
                if val.ndim != 2 or int(val.size / len(val)) != 3:
                    print(
                        Fore.RED +
                        '\tPossible transpose.  Should be <#trials x [onset, duration, amplitude, ...] >'
                    )
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)

            if isrequired(x) == True:
                print(Fore.BLUE + '\t\tRequired field')
            elif isoptional(x):
                print(Fore.GREEN + '\t\tOptional field')
            else:
                print(Fore.RED + '\t\tINVALID field')
                foundInvalid = foundInvalid + 1
                lstInvalid.append(x)

        print('-' * 40)
        if (len(lstInvalid) != 0):
            print(Fore.RED + "File is INVALID")
            print(Fore.RED + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                print(Fore.RED + x)
        else:
            print(Fore.WHITE + "File is VALID")
    else:  # write to file
        text_file = open(fileOut, "w")
        text_file.write('\n' + '\n' + '-' * 40)
        text_file.write('\n' + '\n' + 'SNIRF Validator')
        text_file.write('\n' + '\n' + 'Version 1.0')
        text_file.write('\n' + 'written by T. Huppert')
        text_file.write('\n')
        text_file.write('\n' + 'File = {0}'.format(filename))
        text_file.write('\n' + 'Version = {0}'.format(formatVersion))
        text_file.write('\n' + '-' * 40)

        foundInvalid = 0

        lstInvalid = []

        for x in lst:
            text_file.write('\n' + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):
                # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    text_file.write('\n' + '\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = []
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2 = np.array(val2)
                    text_file.write(
                        '\n' +
                        '\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if (val.ndim == 1 and len(val) == 1):
                    val = val[0]
                    text_file.write('\n' + '\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:

                    text_file.write(
                        '\n' +
                        '\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    text_file.write('\n' +
                                    '\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(
                                        len(val), int(val.size / len(val))))

            if "Pos2D" in x:
                if val.ndim != 2 or int(val.size / len(val)) != 2:
                    text_file.write('\n' + '\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            if "Pos3D" in x:
                if val.ndim != 2 or int(val.size / len(val)) != 3:
                    text_file.write('\n' + '\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            if "dataTimeSeries" in x:
                if int(val.size / len(val)) > len(val):
                    text_file.write('\n' + '\tINVALID dimensions')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)

            if isrequired(x) == True:
                text_file.write('\n' + '\t\tRequired field')
            elif isoptional(x):
                text_file.write('\n' + '\t\tOptional field')
            else:
                text_file.write('\n' + '\t\tINVALID field')
                foundInvalid = foundInvalid + 1
                lstInvalid.append(x)

        text_file.write('\n' + '-' * 40)
        if (len(lstInvalid) != 0):
            text_file.write('\n' + "File is INVALID")
            text_file.write('\n' + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                text_file.write('\n' + x)
        else:
            text_file.write('\n' + "File is VALID")
        text_file.close()

    return (foundInvalid == 0)
Example #26
    def _group2dict(self, group, dictionary=None, lazy=False):
        if dictionary is None:
            dictionary = {}
        for key, value in group.attrs.items():
            if isinstance(value, bytes):
                value = value.decode()
            if isinstance(value, (np.string_, str)):
                if value == '_None_':
                    value = None
            elif isinstance(value, np.bool_):
                value = bool(value)
            elif isinstance(value, np.ndarray) and value.dtype.char == "S":
                # Convert strings to unicode
                value = value.astype("U")
                if value.dtype.str.endswith("U1"):
                    value = value.tolist()
            # skip signals - these are handled below.
            if key.startswith('_sig_'):
                pass
            elif key.startswith('_list_empty_'):
                dictionary[key[len('_list_empty_'):]] = []
            elif key.startswith('_tuple_empty_'):
                dictionary[key[len('_tuple_empty_'):]] = ()
            elif key.startswith('_bs_'):
                dictionary[key[len('_bs_'):]] = value.tobytes()
            # The following two elif statements enable reading date and time from
            # v < 2 of HyperSpy's metadata specifications
            elif key.startswith('_datetime_date'):
                date_iso = datetime.date(
                    *ast.literal_eval(value[value.index("("):])).isoformat()
                dictionary[key.replace("_datetime_", "")] = date_iso
            elif key.startswith('_datetime_time'):
                date_iso = datetime.time(
                    *ast.literal_eval(value[value.index("("):])).isoformat()
                dictionary[key.replace("_datetime_", "")] = date_iso
            else:
                dictionary[key] = value
        if not isinstance(group, self.Dataset):
            for key in group.keys():
                if key.startswith('_sig_'):
                    from hyperspy.io import dict2signal
                    dictionary[key[len('_sig_'):]] = (
                        dict2signal(self.group2signaldict(
                            group[key], lazy=lazy)))
                elif isinstance(group[key], self.Dataset):
                    dat = group[key]
                    kn = key
                    if key.startswith("_list_"):
                        if (h5py.check_string_dtype(dat.dtype) and
                                hasattr(dat, 'asstr')):
                            # h5py 3.0 and newer
                            # https://docs.h5py.org/en/3.0.0/strings.html
                            dat = dat.asstr()[:]
                        ans = np.array(dat)
                        ans = ans.tolist()
                        kn = key[6:]
                    elif key.startswith("_tuple_"):
                        ans = np.array(dat)
                        ans = tuple(ans.tolist())
                        kn = key[7:]
                    elif dat.dtype.char == "S":
                        ans = np.array(dat)
                        try:
                            ans = ans.astype("U")
                        except UnicodeDecodeError:
                            # There are some strings that must stay in binary,
                            # for example dill pickles. This will obviously also
                            # let "wrong" binary string fail somewhere else...
                            pass
                    elif lazy:
                        ans = da.from_array(dat, chunks=dat.chunks)
                    else:
                        ans = np.array(dat)
                    dictionary[kn] = ans
                elif key.startswith('_hspy_AxesManager_'):
                    dictionary[key[len('_hspy_AxesManager_'):]] = AxesManager(
                        [i for k, i in sorted(iter(
                            self._group2dict(
                                group[key], lazy=lazy).items()
                        ))])
                elif key.startswith('_list_'):
                    dictionary[key[7 + key[6:].find('_'):]] = \
                        [i for k, i in sorted(iter(
                            self._group2dict(
                                group[key], lazy=lazy).items()
                        ))]
                elif key.startswith('_tuple_'):
                    dictionary[key[8 + key[7:].find('_'):]] = tuple(
                        [i for k, i in sorted(iter(
                            self._group2dict(
                                group[key], lazy=lazy).items()
                        ))])
                else:
                    dictionary[key] = {}
                    self._group2dict(
                        group[key],
                        dictionary[key],
                        lazy=lazy)

        return dictionary
Example #27
    def create_zarr_hierarchy(self, h5py_group, zgroup):
        """  Scan hdf5 file and recursively create zarr attributes, groups and dataset structures for accessing data
        Args:
          h5py_group: h5py.Group or h5py.File object where information is gathered from
          zgroup:     Zarr Group
        """

        if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(self.file.get(
                h5py_group.name, getclass=True), h5py.Group) or not issubclass(
                    self.file.get(h5py_group.name, getclass=True,
                                  getlink=True), h5py.HardLink))):
            raise TypeError(
                f"{h5py_group} should be a h5py.File or h5py.Group as a h5py.HardLink"
            )

        self.copy_attrs_data_to_zarr_store(h5py_group, zgroup)

        # add hdf5 group address in file to self._address_dict
        self._address_dict[h5py.h5o.get_info(
            h5py_group.id).addr] = h5py_group.name

        # iterate through group members
        test_iter = [name for name in h5py_group.keys()]
        for name in test_iter:
            obj = h5py_group[name]

            # get group member's link class
            obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

            # Datasets
            # TO DO, Soft Links #
            if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(
                        f"Dataset {obj.name} is not processed: External Link")
                    continue
                dset = obj

                # number of filters
                dcpl = dset.id.get_create_plist()
                nfilters = dcpl.get_nfilters()
                if nfilters > 1:
                    # TO DO #
                    print(
                        f"Dataset {dset.name} with multiple filters is not processed"
                    )
                    continue
                elif nfilters == 1:
                    # get first filter information
                    filter_tuple = dset.id.get_create_plist().get_filter(0)
                    filter_code = filter_tuple[0]
                    if filter_code in self._hdf5_regfilters_subset and self._hdf5_regfilters_subset[
                            filter_code] is not None:
                        # TO DO
                        if filter_code == 32001:
                            # Blosc
                            blosc_names = {
                                0: 'blosclz',
                                1: 'lz4',
                                2: 'lz4hc',
                                3: 'snappy',
                                4: 'zlib',
                                5: 'zstd'
                            }
                            clevel, shuffle, cname_id = filter_tuple[2][-3:]
                            cname = blosc_names[cname_id]
                            compression = self._hdf5_regfilters_subset[
                                filter_code](cname=cname,
                                             clevel=clevel,
                                             shuffle=shuffle)
                        else:
                            compression = self._hdf5_regfilters_subset[
                                filter_code](level=filter_tuple[2])
                    else:
                        print(
                            f"Dataset {dset.name} with compression filter {filter_tuple[3]}, hdf5 filter number {filter_tuple[0]} is not processed:\
                                no compatible zarr codec")
                        continue
                else:
                    compression = None

                object_codec = None

                if dset.dtype.names is not None:
                    # Structured array with Reference dtype

                    dset_type = dset.id.get_type()
                    dt_nmembers = dset_type.get_nmembers()

                    dtype_ = []
                    dset_fillvalue = list(dset.fillvalue)
                    for dt_i in range(dt_nmembers):
                        dtname = dset.dtype.names[dt_i]
                        if dset_type.get_member_class(
                                dt_i) == h5py.h5t.REFERENCE:
                            fcid = dset.file.id.get_create_plist()
                            unit_address_size, _ = fcid.get_sizes()
                            dtype_ += [(dtname,
                                        np.dtype(f'uint{unit_address_size*8}'))
                                       ]
                            if dset.fillvalue[dt_i]:
                                dset_fillvalue[dt_i] = h5py.h5o.get_info([
                                    h5py.h5r.dereference(
                                        dset.fillvalue[dt_i], self.file.id)
                                ]).addr
                            else:
                                dset_fillvalue[dt_i] = 0
                        else:
                            dtype_ += [(dtname, dset.dtype.base[dt_i])]
                    zarray = zgroup.create_dataset(
                        dset.name,
                        shape=dset.shape,
                        dtype=dtype_,
                        chunks=dset.chunks or False,
                        fill_value=tuple(dset_fillvalue),
                        compression=compression,
                        overwrite=True)

                # variable-length Datasets
                elif h5py.check_vlen_dtype(dset.dtype):
                    if not h5py.check_string_dtype(dset.dtype):
                        print(
                            f"Dataset {dset.name} is not processed: Variable-length dataset, not string"
                        )
                        continue
                    else:
                        object_codec = VLenHDF5String()
                        zarray = zgroup.create_dataset(
                            dset.name,
                            shape=dset.shape,
                            dtype=object,
                            chunks=dset.chunks or False,
                            fill_value=dset.fillvalue,
                            compression=compression,
                            overwrite=True,
                            object_codec=object_codec)
                        dset_chunks = dset.chunks

                elif dset.dtype.hasobject:
                    # TO DO test #
                    dset_type = dset.id.get_type()

                    if dset_type.get_class() == h5py.h5t.REFERENCE:
                        fcid = dset.file.id.get_create_plist()
                        unit_address_size, _ = fcid.get_sizes()
                        dtype_ = np.dtype(f'uint{unit_address_size*8}')
                        if dset.fillvalue:
                            dset_fillvalue = h5py.h5o.get_info([
                                h5py.h5r.dereference(dset.fillvalue,
                                                     self.file.id)
                            ]).addr
                        else:
                            dset_fillvalue = 0

                        zarray = zgroup.create_dataset(
                            dset.name,
                            shape=dset.shape,
                            dtype=dtype_,
                            chunks=dset.chunks or False,
                            fill_value=dset_fillvalue,
                            compression=compression,
                            overwrite=True)

                    elif dset_type.get_class() == h5py.h5t.STD_REF_DSETREG:
                        print(
                            f"Dataset {dset.name} is not processed: Region Reference dtype"
                        )
                        continue
                    else:
                        print(
                            f"Dataset {dset.name} is not processed: Object dtype"
                        )
                        continue

                else:
                    if compression is None and (dset.chunks is None
                                                or dset.chunks == dset.shape):

                        dset_chunks = dset.chunks if dset.chunks else dset.shape
                        if dset.shape != ():
                            dset_chunks = list(dset_chunks)
                            dim_ = 0
                            ratio_ = self.max_chunksize / (
                                np.prod(dset_chunks) * dset.dtype.itemsize)
                            while ratio_ < 1:
                                chunk_dim_ = int(ratio_ * dset_chunks[dim_])
                                chunk_dim_ = chunk_dim_ if chunk_dim_ else 1
                                chunk_dim_ -= np.argmax(
                                    dset_chunks[dim_] %
                                    np.arange(chunk_dim_, chunk_dim_ // 2, -1))
                                dset_chunks[dim_] = int(chunk_dim_)
                                ratio_ = self.max_chunksize / (
                                    np.prod(dset_chunks) * dset.dtype.itemsize)
                                dim_ += 1

                            dset_chunks = tuple(dset_chunks)
                        dset_chunks = dset_chunks or None
                    else:
                        dset_chunks = dset.chunks

                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=dset.dtype,
                                                   chunks=dset_chunks or False,
                                                   fill_value=dset.fillvalue,
                                                   compression=compression,
                                                   overwrite=True)

                self.copy_attrs_data_to_zarr_store(dset, zarray)
                info = self.storage_info(dset, dset_chunks)

                if object_codec is not None:
                    info = self.vlen_storage_info(dset, info)

                # Store metadata
                if info:
                    info['source'] = {'uri': self.uri, 'array_name': dset.name}
                    FileChunkStore.chunks_info(zarray, info)

            # Groups
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and not issubclass(obj_linkclass, h5py.SoftLink)):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(f"Group {obj.name} is not processed: External Link")
                    continue
                group_ = obj
                zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
                self.create_zarr_hierarchy(group_, zgroup_)

            # Groups, Soft Link
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and issubclass(obj_linkclass, h5py.SoftLink)):
                group_ = obj
                zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
                self.copy_attrs_data_to_zarr_store(group_, zgroup_)

                zgroup_path = zgroup_.create_group(SYMLINK, overwrite=True)
                zgroup_path.attrs[group_.name] = h5py_group.get(
                    name, getlink=True).path
Example #28
    def _rewrite_vlen_to_fixed(h5py_group, changed_dsets={}):
        """  Scan hdf5 file or hdf5 group object and recursively convert variable-length string dataset to fixed-length
        Args:
          h5py_group: h5py.Group or h5py.File object
        """

        if (not isinstance(h5py_group, h5py.File) and (not issubclass(
                h5py_group.file.get(h5py_group.name, getclass=True),
                h5py.Group) or not issubclass(
                    h5py_group.file.get(h5py_group.name,
                                        getclass=True,
                                        getlink=True), h5py.HardLink))):
            raise TypeError(
                f"{h5py_group} should be a h5py.File or h5py.Group as a h5py.HardLink"
            )

        # iterate through group members
        group_iter = [name for name in h5py_group.keys()]
        for name in group_iter:
            obj = h5py_group[name]

            # get group member's link class
            obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

            # Datasets
            if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(
                        f"Skipped rewriting variable-length dataset {obj.name}: External Link"
                    )
                    continue
                dset = obj

                # variable-length Datasets
                if h5py.check_vlen_dtype(
                        dset.dtype) and h5py.check_string_dtype(dset.dtype):

                    vlen_stringarr = dset[()]
                    if dset.shape == ():
                        string_lengths_ = len(vlen_stringarr)
                        length_max = string_lengths_
                    else:
                        length_max = max(
                            len(el) for el in vlen_stringarr.flatten())
                    if dset.fillvalue is not None:
                        length_max = max(length_max, len(dset.fillvalue))
                    length_max = length_max + (-length_max) % 8
                    dt_fixedlen = f'|S{length_max}'

                    if isinstance(dset.fillvalue, str):
                        dset_fillvalue = dset.fillvalue.encode('utf-8')
                    else:
                        dset_fillvalue = dset.fillvalue

                    affix_ = '_fixedlen~'
                    dset_name = dset.name
                    h5py_group.file.move(dset_name, dset_name + affix_)
                    changed_dsets[dset_name + affix_] = dset_name
                    dsetf = h5py_group.file.create_dataset_like(
                        dset_name,
                        dset,
                        dtype=dt_fixedlen,
                        fillvalue=dset_fillvalue)

                    # TO DO, copy attrs after all string dataset are moved
                    for key, val in dset.attrs.items():
                        if isinstance(
                                val,
                            (bytes, np.bool_, str, int, float, np.number)):
                            dsetf.attrs[key] = val
                        else:
                            # TO DO #
                            print(
                                f"Moving variable-length string Datasets: attribute value of type\
                                    {type(val)} is not processed. Attribute {key} of object {dsetf.name}"
                            )

                    if dsetf.shape == ():
                        if isinstance(vlen_stringarr, bytes):
                            dsetf[...] = vlen_stringarr
                        else:
                            dsetf[...] = vlen_stringarr.encode('utf-8')
                    else:
                        dsetf[...] = vlen_stringarr.astype(dt_fixedlen)

            # Groups
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and not issubclass(obj_linkclass, h5py.SoftLink)):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(f"Group {obj.name} is not processed: External Link")
                    continue
                changed_dsets = HDF5Zarr._rewrite_vlen_to_fixed(
                    obj, changed_dsets)

        return changed_dsets