def read_c4_dataset_as_c8(ds: h5py.Dataset, key=np.s_[...]):
    """
    Read a complex float16 HDF5 dataset as a numpy.complex64 array.

    Avoids h5py/numpy dtype bugs and uses numpy float16 -> float32 conversions
    which are about 10x faster than HDF5 ones.

    Parameters
    ----------
    ds : h5py.Dataset
        Dataset to read.
    key : index expression, optional
        Selection applied to the dataset; defaults to ``np.s_[...]``
        (the whole dataset).

    Returns
    -------
    numpy.ndarray
        The selected data viewed as ``numpy.complex64``.
    """
    # Reading through ``astype`` avoids the h5py exception:
    # TypeError: data type '<c4' not understood
    # NOTE: ``complex32`` is a module-level dtype defined outside this
    # function.
    reader = ds.astype(complex32)
    if hasattr(reader, "__getitem__"):
        # h5py >= 3.0: the astype wrapper is sliced directly.  Using it as a
        # context manager was deprecated in 3.0 and removed in 3.9.
        z = reader[key]
    else:
        # Older h5py: astype() only works as a context manager.
        with reader:
            z = ds[key]

    # Define a similar datatype for complex64 to be sure we cast safely.
    complex64 = np.dtype([("r", np.float32), ("i", np.float32)])
    # Cast safely and then view as native complex64 numpy dtype.
    return z.astype(complex64).view(np.complex64)
def prop_to_dataframe(dset: h5py.Dataset, dtype: DTypeLike = None) -> pd.DataFrame:
    """Convert the passed property Dataset into a DataFrame.

    Examples
    --------
    .. testsetup:: python

        >>> from dataCAT.testing_utils import HDF5_READ as hdf5_file

    .. code:: python

        >>> import h5py
        >>> from dataCAT import prop_to_dataframe

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r') as f:
        ...     dset = f['ligand/properties/E_solv']
        ...     df = prop_to_dataframe(dset)
        ...     print(df)  # doctest: +NORMALIZE_WHITESPACE
        E_solv_names             water  methanol   ethanol
        ligand ligand anchor
        O=C=O  O1            -0.918837 -0.151129 -0.177396
               O3            -0.221182 -0.261591 -0.712906
        CCCO   O4            -0.314799 -0.784353 -0.190898

    Parameters
    ----------
    dset : :class:`h5py.Dataset`
        The property-containing Dataset of interest.
    dtype : dtype-like, optional
        The data type of the to-be returned DataFrame.
        Use :data:`None` to default to the data type of **dset**.

    Returns
    -------
    :class:`pandas.DataFrame`
        A DataFrame constructed from the passed **dset**.

    """  # noqa: E501
    # Construct the index
    dim0 = dset.dims[0]
    scale0 = dim0[0]
    index = index_to_pandas(scale0)

    # Construct the columns
    if dset.ndim == 1:
        # 1D datasets get a single column named after the dataset itself
        full_name = dset.name
        name = full_name.rsplit('/', 1)[-1]
        columns = pd.Index([name])
    else:
        dim1 = dset.dims[1]
        scale1 = dim1[0]
        columns = pd.Index(scale1[:].astype(str), name=dim1.label)

    # Create and return the dataframe
    if dtype is None:
        return pd.DataFrame(dset[:], index=index, columns=columns)

    # If possible, let h5py handle the datatype conversion
    # This will often fail when dset.dtype consists of variable-length bytes-strings
    try:
        converter = dset.astype(dtype)
        if hasattr(converter, "__getitem__"):
            # h5py >= 3.0: read through the astype wrapper.  Using it as a
            # context manager was deprecated in 3.0 and removed in 3.9.
            data = converter[:]
        else:
            # Older h5py: astype() only works as a context manager.
            with converter:
                data = dset[:]
    except (ValueError, TypeError):
        # Fall back to reading the raw data and converting it with numpy
        data = dset[:].astype(dtype)
    return pd.DataFrame(data, index=index, columns=columns)