def _parse_npy(bio): mmapfile = bio.raw.fileobj if isinstance(mmapfile, mmap.mmap): version = read_magic(bio) _check_version(version) shape, fortran_order, dtype = _read_array_header(bio, version) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) order = "F" if fortran_order else "C" offset = bio.tell() # Add the offset from the Wrapper file offset += bio.raw.offset data = np.ndarray.__new__( np.memmap, shape, dtype=dtype, buffer=mmapfile, offset=offset, order=order, ) data._mmap = mmapfile data.offset = offset data.mode = "r+" else: b = BytesIO(bio.read()) data = np.load(b) return data
def _read_header(self): with open(self.path, "rb") as fp: version = format.read_magic(fp) try: format._check_version(version) except ValueError: raise ValueError("Invalid file format.") header_data = format._read_array_header(fp, version) self.shape, self.fortran_order, self.dtype = header_data
def __init__(self, path, shape, dtype, axis=0):
    """Open ``path`` for writing and emit the NPY header.

    The file handle is kept open on ``self.fp`` so data can be appended
    afterwards. Only concatenation along axis 0 is supported, and only
    C order is supported at the moment.
    """
    assert axis == 0  # only concatenation along the first axis is supported right now
    self.shape = shape
    self.dtype = np.dtype(dtype)
    # Build the header before touching the filesystem so a bad
    # shape/dtype fails without leaving a half-written file behind.
    header = _npy_header(self.shape, self.dtype)
    npy_version = None  # None lets the format layer pick the version
    _check_version(npy_version)
    self.fp = open(path, 'wb')
    _write_array_header(self.fp, header, npy_version)
def _get_info(self): from numpy.lib import format with self.f as fp: version = format.read_magic(fp) format._check_version(version) shape, fortran_order, dtype = format._read_array_header(fp, version) self.shape = shape self.dtype = dtype self.order = 'F' if fortran_order else 'C' self.offset = fp.tell()
def save_large_array(fp, array, axis=0, desc=None):
    """Save a large, potentially memmapped array, into a NPY file, chunk by
    chunk to avoid loading it entirely in memory.

    Parameters
    ----------
    fp : file object
        Destination file, opened in binary write mode.
    array : ndarray
        The array to serialize. Chunking is done along ``axis``.
    axis : int, optional
        Must be 0; only first-axis chunking is supported.
    desc : str, optional
        Label forwarded to the tqdm progress bar.
    """
    assert axis == 0  # TODO: support other axes
    version = None
    _check_version(version)
    header = header_data_from_array_1_0(array)
    # BUG FIX: the chunks below are written as C-order bytes, but for an
    # F-contiguous input the header would claim fortran_order=True, so the
    # file would be read back transposed/corrupted. Always declare C order
    # to match the byte stream actually written.
    header['fortran_order'] = False
    _write_array_header(fp, header, version)
    N = array.shape[axis]
    if N == 0:
        return
    k = int(ceil(float(N) / 100))  # 100 chunks
    assert k >= 1
    for i in tqdm(range(0, N, k), desc=desc):
        chunk = array[i:i + k, ...]
        # Explicit C order so the bytes agree with the header above.
        fp.write(chunk.tobytes(order='C'))
def load_npy_file(path, block_size):
    """ Loads a file in npy format (must be 2-dimensional).

    Parameters
    ----------
    path : str
        Path to the npy file.
    block_size : tuple (int, int)
        Block size of the resulting ds-array.

    Returns
    -------
    x : ds-array

    Raises
    ------
    ValueError
        If the array is Fortran-ordered, not 2-dimensional, or smaller
        than the requested block size.
    """
    # BUG FIX: the original did `try: fid = open(...)` with
    # `finally: fid.close()` — if open() itself raised, `fid` was unbound
    # and the finally clause crashed with NameError. `with` both fixes
    # that and guarantees the handle is closed.
    with open(path, "rb") as fid:
        version = format.read_magic(fid)
        format._check_version(version)
        shape, fortran_order, dtype = format._read_array_header(fid, version)
        if fortran_order:
            raise ValueError("Fortran order not supported for npy files")
        if len(shape) != 2:
            raise ValueError("Array is not 2-dimensional")
        if block_size[0] > shape[0] or block_size[1] > shape[1]:
            raise ValueError("Block size is larger than the array")
        blocks = []
        n_blocks = int(ceil(shape[1] / block_size[1]))
        # Read one row-band at a time and split it into column blocks.
        for i in range(0, shape[0], block_size[0]):
            read_count = min(block_size[0], shape[0] - i)
            read_size = int(read_count * shape[1] * dtype.itemsize)
            data = fid.read(read_size)
            # Placeholders filled in by the _read_from_buffer task.
            out_blocks = [object() for _ in range(n_blocks)]
            _read_from_buffer(data, dtype, shape[1], block_size[1],
                              out_blocks)
            blocks.append(out_blocks)
        return Array(blocks=blocks, top_left_shape=block_size,
                     reg_shape=block_size, shape=shape, sparse=False)
def load_hstack_npy_files(path, cols_per_block=None):
    """ Loads the .npy files in a directory into a ds-array, stacking them
    horizontally, like (A|B|C). The order of concatenation is alphanumeric.

    At least 1 valid .npy file must exist in the directory, and every .npy
    file must contain a valid array. Every array must have the same dtype,
    order, and number of rows.

    The blocks of the returned ds-array will have the same number of rows as
    the input arrays, and cols_per_block columns, which defaults to the
    number of columns of the first array.

    Parameters
    ----------
    path : string
        Folder path.
    cols_per_block : tuple (int, int)
        Number of columns of the blocks for the output ds-array. If None, the
        number of columns of the first array is used.

    Returns
    -------
    x : ds-array
        A distributed representation (ds-array) of the stacked arrays.
    """
    dirlist = os.listdir(path)
    folder_paths = [os.path.join(path, name) for name in sorted(dirlist)]
    # Full path of .npy files in the folder
    files = [
        pth for pth in folder_paths
        if os.path.isfile(pth) and pth.endswith('.npy')
    ]
    if not files:
        # FIX: fail fast with a clear message instead of an opaque
        # IndexError at files[0] below.
        raise ValueError("No .npy files found in %r" % path)
    # Read the header of the first file to get shape, order, and dtype
    with open(files[0], "rb") as fid:
        version = format.read_magic(fid)
        format._check_version(version)
        shape0, order0, dtype0 = format._read_array_header(fid, version)
    rows = shape0[0]
    if cols_per_block is None:
        cols_per_block = shape0[1]
    # Check that all files have the same number of rows, order and datatype,
    # and store the number of columns for each file.
    files_cols = [shape0[1]]
    for filename in files[1:]:
        with open(filename, "rb") as fid:
            version = format.read_magic(fid)
            format._check_version(version)
            shape, order, dtype = format._read_array_header(fid, version)
            if shape[0] != shape0[0] or order0 != order or dtype0 != dtype:
                # FIX: include a diagnostic message (was a bare
                # AssertionError with no context).
                raise AssertionError(
                    "Array in %r does not match the first array's "
                    "rows, order or dtype" % filename)
            files_cols.append(shape[1])
    # Compute the parameters block_files, start_col and end_col for each
    # block, and call the task _load_hstack_npy_block() to generate each
    # block. A block may span several files; start_col/end_col delimit the
    # slice of the first/last file that belongs to the block.
    blocks = []
    file_idx = 0
    start_col = 0
    while file_idx < len(files):
        block_files = [files[file_idx]]
        cols = files_cols[file_idx] - start_col
        while cols < cols_per_block:  # while block not completed
            if file_idx + 1 == len(files):  # last file
                break
            file_idx += 1
            block_files.append(files[file_idx])
            cols += files_cols[file_idx]
        # Compute end_col of last file in block (last block may be smaller)
        end_col = files_cols[file_idx] - max(0, (cols - cols_per_block))
        blocks.append(_load_hstack_npy_block(block_files,
                                             start_col, end_col))
        if end_col == files_cols[file_idx]:  # file completed
            file_idx += 1
            start_col = 0
        else:  # file uncompleted
            start_col = end_col
    return Array(blocks=[blocks], top_left_shape=(rows, cols_per_block),
                 reg_shape=(rows, cols_per_block),
                 shape=(rows, sum(files_cols)), sparse=False)