def test_get_signal_chunks_small_dataset():
    # Whole dataset fits in one chunk
    shape = (10, 10, 2, 2)
    chunks = get_signal_chunks(shape=shape, dtype=np.int32,
                               signal_axes=(2, 3), target_size=1e6)
    # The whole dataset fits within the target size, so it is a single chunk
    assert chunks == shape
def test_get_signal_chunks_big_signal():
    # One signal exceeds the target size
    shape = (10, 1000, 5, 1000)
    chunks = get_signal_chunks(shape=shape, dtype=np.int32,
                               signal_axes=(1, 3), target_size=1e6)
    # The signal axes are kept whole even when a single signal exceeds the
    # target size; only the navigation axes are split
    assert chunks == (1, 1000, 1, 1000)
def test_get_signal_chunks(target_size):
    shape = (2, 150, 3, 200, 1, 600, 1)
    chunks = get_signal_chunks(shape=shape, dtype=np.int64,
                               signal_axes=(2, 3), target_size=target_size)
    # int64 items are 8 bytes, so the chunk byte size must stay below target
    assert np.prod(chunks) * 8 < target_size
    # The chunks must be smaller than or equal to the corresponding sizes
    assert (np.array(chunks) <= np.array(shape)).all()
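
# A minimal sketch of the chunking strategy the tests above exercise. This is
# an illustration consistent with the assertions, NOT the library's actual
# ``get_signal_chunks``: keep the signal axes whole and repeatedly halve the
# largest navigation axis until the chunk byte size fits within
# ``target_size`` (or only size-1 navigation axes remain).
def _example_signal_chunks(shape, dtype, signal_axes, target_size=1e6):
    import numpy as np
    chunks = list(shape)
    nav_axes = [i for i in range(len(shape)) if i not in signal_axes]
    itemsize = np.dtype(dtype).itemsize
    if not nav_axes:
        return tuple(chunks)
    while np.prod(chunks) * itemsize > target_size:
        largest = max(nav_axes, key=lambda i: chunks[i])
        if chunks[largest] == 1:
            # Only signal axes are left; a single signal may exceed target
            break
        chunks[largest] = int(np.ceil(chunks[largest] / 2))
    return tuple(chunks)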
def _extract_hdf_dataset(group, dataset, lazy=False):
    """Import data from hdf path.

    Parameters
    ----------
    group : hdf group
        group from which to load the dataset
    dataset : str
        path to the dataset within the group
    lazy : bool, default False
        If True use lazy opening; if False read the data into memory

    Returns
    -------
    dask or numpy array

    """
    data = group[dataset]
    if lazy:
        if "chunks" in data.attrs.keys():
            chunks = data.attrs["chunks"]
        else:
            chunks = get_signal_chunks(data.shape, data.dtype)
        data_lazy = da.from_array(data, chunks=chunks)
    else:
        data_lazy = np.array(data)

    nav_list = []
    for i in range(data.ndim):
        nav_list.append({
            'size': data.shape[i],
            'index_in_array': i,
            'scale': 1,
            'offset': 0.0,
            'units': '',
            'navigate': True,
        })

    dictionary = {
        'data': data_lazy,
        'metadata': {},
        'original_metadata': {},
        'axes': nav_list,
    }
    return dictionary
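
# Example usage of ``_extract_hdf_dataset`` (illustrative; the file layout
# and dataset name below are made up for the demonstration):
def _example_extract_hdf_dataset():
    import os
    import tempfile
    import h5py
    import numpy as np
    with tempfile.TemporaryDirectory() as tmp:
        fname = os.path.join(tmp, "example.h5")
        with h5py.File(fname, "w") as f:
            f.create_dataset("entry/counts", data=np.arange(12).reshape(3, 4))
        with h5py.File(fname, "r") as f:
            sig = _extract_hdf_dataset(f["entry"], "counts", lazy=False)
        assert sig["data"].shape == (3, 4)
        # One axis dictionary per data dimension, all marked as navigation
        assert len(sig["axes"]) == 2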
def _parse_from_file(value, lazy=False):
    """Convert values from the hdf file to compatible formats.

    When reading string arrays we convert or keep string arrays as
    byte strings (some io_plugins only support byte-string arrays,
    so this ensures inter-compatibility across io_plugins).
    Arrays of length 1 - return the single value stored.
    Large datasets are returned as dask arrays if lazy=True.

    Parameters
    ----------
    value : input read from hdf file (array, list, tuple, string, int, float)
    lazy : bool, default False
        The lazy flag is only applied to values of size >= 2

    Returns
    -------
    str, int, float, ndarray or dask Array
        parsed value.

    """
    toreturn = value
    if isinstance(value, h5py.Dataset):
        if value.size < 2:
            toreturn = value[...].item()
        else:
            if lazy:
                if value.chunks:
                    toreturn = da.from_array(value, value.chunks)
                else:
                    chunks = get_signal_chunks(value.shape, value.dtype)
                    toreturn = da.from_array(value, chunks)
            else:
                toreturn = np.array(value)

    if isinstance(toreturn, np.ndarray) and value.shape == (1,):
        toreturn = toreturn[0]
    if isinstance(toreturn, bytes):
        toreturn = _byte_to_string(toreturn)
    if isinstance(toreturn, np.ndarray) and toreturn.dtype.char == "U":
        toreturn = toreturn.astype("S")
    return toreturn
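
# Example behaviour of ``_parse_from_file`` (illustrative; the dataset names
# are made up, and the string conversion relies on the ``_byte_to_string``
# helper used by the function above):
def _example_parse_from_file():
    import os
    import tempfile
    import h5py
    import numpy as np
    with tempfile.TemporaryDirectory() as tmp:
        with h5py.File(os.path.join(tmp, "parse.h5"), "w") as f:
            f["scalar"] = np.array([42])
            f["name"] = b"detector"
            f["big"] = np.zeros((100, 100))
            # Length-1 datasets collapse to the stored scalar
            assert _parse_from_file(f["scalar"]) == 42
            # Byte strings are decoded to str
            assert _parse_from_file(f["name"]) == "detector"
            # Large datasets come back as dask arrays when lazy=True
            lazy = _parse_from_file(f["big"], lazy=True)
            assert lazy.shape == (100, 100)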
def _nexus_dataset_to_signal(group, nexus_dataset_path, lazy=False):
    """Load an NXdata set as a hyperspy signal.

    Parameters
    ----------
    group : hdf group
        group containing the NXdata
    nexus_dataset_path : str
        Path to the NXdata set in the group
    lazy : bool, default False
        lazy loading of data

    Returns
    -------
    dict
        A signal dictionary which can be used to instantiate a signal.

    """
    detector_index = 0
    interpretation = None
    dataentry = group[nexus_dataset_path]
    if "signal" in dataentry.attrs.keys():
        if _is_int(dataentry.attrs["signal"]):
            data_key = "data"
        else:
            data_key = dataentry.attrs["signal"]
    else:
        _logger.info("No signal attribute associated with the NXdata; "
                     "will assume the signal name is 'data'")
        if "data" not in dataentry.keys():
            raise ValueError("Signal attribute not found in NXdata and "
                             "attempt to find a default 'data' key failed")
        else:
            data_key = "data"
    if "interpretation" in dataentry.attrs.keys():
        interpretation = _parse_from_file(dataentry.attrs["interpretation"])

    data = dataentry[data_key]
    nav_list = []
    # list indices...
    axis_index_list = []
    if "axes" in dataentry.attrs.keys():
        axes_key = dataentry.attrs["axes"]
        axes_list = ["."] * data.ndim
        if isinstance(axes_key, np.ndarray):
            for i, num in enumerate(axes_key):
                axes_list[i] = _parse_from_file(num)
        else:
            axes_list[0] = _parse_from_file(axes_key)

        named_axes = list(range(len(axes_list)))
        for i, ax in enumerate(axes_list):
            if ax != ".":
                index_name = ax + "_indices"
                if index_name in dataentry.attrs:
                    ind_in_array = int(dataentry.attrs[index_name])
                else:
                    ind_in_array = i
                axis_index_list.append(ind_in_array)
                if "units" in dataentry[ax].attrs:
                    units = _parse_from_file(dataentry[ax].attrs["units"])
                else:
                    units = ""
                navigation = True
                named_axes.remove(ind_in_array)
                if _is_numeric_data(dataentry[ax]):
                    if dataentry[ax].size > 1:
                        if _is_linear_axis(dataentry[ax]):
                            nav_list.append({
                                'size': data.shape[ind_in_array],
                                'index_in_array': ind_in_array,
                                'name': ax,
                                'scale': abs(dataentry[ax][1] -
                                             dataentry[ax][0]),
                                'offset': min(dataentry[ax][0],
                                              dataentry[ax][-1]),
                                'units': units,
                                'navigate': navigation,
                            })
                        else:
                            nav_list.append({
                                'size': data.shape[ind_in_array],
                                'index_in_array': ind_in_array,
                                'name': ax,
                                'scale': 1,
                                'offset': 0,
                                'navigate': navigation,
                            })
                    else:
                        nav_list.append({
                            'size': 1,
                            'index_in_array': ind_in_array,
                            'name': ax,
                            'scale': 1,
                            'offset': dataentry[ax][0],
                            'units': units,
                            'navigate': True,
                        })
            else:
                if len(data.shape) == len(axes_list):
                    nav_list.append({
                        'size': data.shape[named_axes[detector_index]],
                        'index_in_array': named_axes[detector_index],
                        'scale': 1,
                        'offset': 0.0,
                        'units': '',
                        'navigate': False,
                    })
                    detector_index = detector_index + 1

    if lazy:
        if "chunks" in data.attrs.keys():
            chunks = data.attrs["chunks"]
        else:
            chunks = get_signal_chunks(data.shape, data.dtype)
        data_lazy = da.from_array(data, chunks=chunks)
    else:
        data_lazy = np.array(data)

    if not nav_list:
        for i in range(data.ndim):
            nav_list.append({
                'size': data_lazy.shape[i],
                'index_in_array': i,
                'scale': 1,
                'offset': 0.0,
                'units': '',
                'navigate': True,
            })

    title = _text_split(nexus_dataset_path, '/')[-1]
    metadata = {'General': {'title': title}}
    #
    # if interpretation - reset the nav axes
    # assume the last dimensions are the signal
    #
    if interpretation:
        for x in nav_list:
            x["navigate"] = True
        if interpretation == "spectrum":
            nav_list[-1]["navigate"] = False
        elif interpretation == "image":
            nav_list[-1]["navigate"] = False
            nav_list[-2]["navigate"] = False

    dictionary = {'data': data_lazy,
                  'axes': nav_list,
                  'metadata': metadata}
    return dictionary
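
# Sketch of the NXdata layout ``_nexus_dataset_to_signal`` consumes
# (illustrative; group and axis names are made up, and the result depends on
# the ``_is_int``, ``_is_numeric_data`` and ``_is_linear_axis`` helpers from
# this module). A named "x" axis maps to dimension 0 via "x_indices", while
# "." marks the unnamed detector dimension:
def _example_nexus_dataset_to_signal():
    import os
    import tempfile
    import h5py
    import numpy as np
    with tempfile.TemporaryDirectory() as tmp:
        with h5py.File(os.path.join(tmp, "nx.h5"), "w") as f:
            nxdata = f.create_group("entry/spectrum")
            nxdata.attrs["NX_class"] = "NXdata"
            nxdata.attrs["signal"] = "data"
            nxdata.attrs["axes"] = np.array(["x", "."], dtype="S")
            nxdata.attrs["x_indices"] = 0
            nxdata["data"] = np.random.random((5, 10))
            nxdata["x"] = np.linspace(0.0, 4.0, 5)
            nxdata["x"].attrs["units"] = "mm"
            sig = _nexus_dataset_to_signal(f, "entry/spectrum")
        # The linear "x" axis has step 1.0 and carries its units
        assert sig["axes"][0]["scale"] == 1.0
        assert sig["axes"][0]["units"] == "mm"
        assert sig["metadata"]["General"]["title"] == "spectrum"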
def test_get_signal_chunks(target_size):
    chunks = get_signal_chunks(shape=[15, 15, 256, 256], dtype=np.int64,
                               signal_axes=(2, 3), target_size=target_size)
    assert np.prod(chunks) * 8 < target_size
def h5ebsd2signaldict(scan_group, manufacturer, version, lazy=False):
    """Return a dictionary with signal, metadata and original metadata
    from an h5ebsd scan.

    Parameters
    ----------
    scan_group : h5py.Group
        HDF group of scan.
    manufacturer : {'KikuchiPy', 'EDAX', 'Bruker Nano'}
        Manufacturer of file.
    version : str
        Version of manufacturer software.
    lazy : bool, optional
        Read the patterns lazily (default is False).

    Returns
    -------
    scan : dict
        Dictionary with patterns, metadata and original metadata.

    """
    md, omd, scan_size = h5ebsdheader2dicts(scan_group, manufacturer,
                                            version, lazy)
    md.set_item('Signal.signal_type', 'EBSD')
    md.set_item('Signal.record_by', 'image')

    scan = {'metadata': md.as_dictionary(),
            'original_metadata': omd.as_dictionary(),
            'attributes': {}}

    # Get data group
    man_pats = manufacturer_pattern_names()
    for man, pats in man_pats.items():
        if manufacturer.lower() == man.lower():
            data = scan_group['EBSD/Data/' + pats]

    # Get data from group
    if lazy:
        chunks = data.chunks
        if chunks is None:
            chunks = get_signal_chunks(data.shape, data.dtype, [1, 2])
        data = da.from_array(data, chunks=chunks)
        scan['attributes']['_lazy'] = True
    else:
        data = np.asanyarray(data)

    sx, sy = scan_size.sx, scan_size.sy
    nx, ny = scan_size.nx, scan_size.ny
    try:
        data = data.reshape((ny, nx, sy, sx)).squeeze()
    except ValueError:
        warnings.warn("Pattern size ({} x {}) and scan size ({} x {}) "
                      "larger than file size. Will attempt to load by "
                      "zero padding incomplete "
                      "frames.".format(sx, sy, nx, ny))
        # Data is stored pattern by pattern
        pw = [(0, ny * nx * sy * sx - data.size)]
        if lazy:
            data = da.pad(data, pw, mode='constant')
        else:
            data = np.pad(data, pw, mode='constant')
        data = data.reshape((ny, nx, sy, sx))
    scan['data'] = data

    units = np.repeat(u'\u03BC' + 'm', 4)
    names = ['y', 'x', 'dy', 'dx']
    scales = np.ones(4)

    # Calibrate scan dimension and detector dimension
    step_x, step_y = scan_size.step_x, scan_size.step_y
    scales[0] = scales[0] * step_x
    scales[1] = scales[1] * step_y
    detector_pixel_size = scan_size.delta
    scales[2] = scales[2] * detector_pixel_size
    scales[3] = scales[3] * detector_pixel_size

    # Create axis objects for each axis
    axes = [{'size': data.shape[i],
             'index_in_array': i,
             'name': names[i],
             'scale': scales[i],
             'offset': 0.0,
             'units': units[i]}
            for i in range(data.ndim)]
    scan['axes'] = axes

    return scan
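
# Illustration of the zero-padding fallback in ``h5ebsd2signaldict`` above,
# on synthetic numbers: a scan of ny x nx = 2 x 2 patterns of sy x sx = 3 x 3
# pixels needs 36 values, but the flat pattern-by-pattern stream only holds
# 27 (one incomplete frame), so the tail is padded with zeros before
# reshaping:
def _example_zero_padding():
    import numpy as np
    ny, nx, sy, sx = 2, 2, 3, 3
    data = np.arange(27)  # flat stream, last frame missing
    pw = [(0, ny * nx * sy * sx - data.size)]  # pad only at the end
    data = np.pad(data, pw, mode='constant')
    assert data.reshape((ny, nx, sy, sx)).shape == (2, 2, 3, 3)
    # The padded last frame is all zeros
    assert not data[-9:].any()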