Example #1
0
    def _parse_npy(bio):
        """Deserialize a NPY payload from the buffered reader *bio*.

        If the reader's raw file object is backed by an ``mmap``, the array
        is exposed as a zero-copy ``np.memmap`` view over that mapping;
        otherwise the remaining bytes are read into memory and decoded with
        ``np.load``.

        NOTE(review): assumes ``bio.raw`` exposes ``.fileobj`` and ``.offset``
        (a wrapper over a region of a larger file) — confirm against the
        wrapper class used by callers.
        """
        mmapfile = bio.raw.fileobj
        if isinstance(mmapfile, mmap.mmap):
            # Parse the NPY header manually so the data section can be
            # mapped in place instead of copied.
            version = read_magic(bio)
            _check_version(version)

            shape, fortran_order, dtype = _read_array_header(bio, version)
            if dtype.hasobject:
                msg = "Array can't be memory-mapped: Python objects in dtype."
                raise ValueError(msg)
            order = "F" if fortran_order else "C"
            # Data starts right after the header within this stream...
            offset = bio.tell()
            # Add the offset from the Wrapper file
            # (position of the wrapped region inside the mmap'd file).
            offset += bio.raw.offset
            # Create a memmap view directly over the existing mapping;
            # __new__ is used to avoid np.memmap's own open()/mmap() path.
            data = np.ndarray.__new__(
                np.memmap,
                shape,
                dtype=dtype,
                buffer=mmapfile,
                offset=offset,
                order=order,
            )
            # Mimic the attributes a regular np.memmap instance carries
            # (private _mmap keeps the mapping alive; mode "r+" mirrors a
            # writable memmap — TODO confirm the mapping is writable).
            data._mmap = mmapfile
            data.offset = offset
            data.mode = "r+"
        else:
            # Not mmap-backed: fall back to an in-memory load of the payload.
            b = BytesIO(bio.read())
            data = np.load(b)

        return data
Example #2
0
 def _read_header(self):
     with open(self.path, "rb") as fp:
         version = format.read_magic(fp)
         try:
             format._check_version(version)
         except ValueError:
             raise ValueError("Invalid file format.")
         header_data = format._read_array_header(fp, version)
         self.shape, self.fortran_order, self.dtype = header_data
Example #3
0
 def __init__(self, path, shape, dtype, axis=0):
     """Open *path* for writing and emit a NPY header describing an array
     of the given *shape* and *dtype*.

     Only concatenation along the first axis (``axis == 0``) and C order
     are supported at the moment.
     """
     # Concatenation is only implemented along the first axis right now.
     assert axis == 0
     self.shape = shape
     self.dtype = np.dtype(dtype)
     version = None
     _check_version(version)
     header_dict = _npy_header(self.shape, self.dtype)
     self.fp = open(path, 'wb')
     _write_array_header(self.fp, header_dict, version)
Example #4
0
    def _get_info(self):
        from numpy.lib import format
        with self.f as fp:
            version = format.read_magic(fp)
            format._check_version(version)

            shape, fortran_order, dtype = format._read_array_header(fp, version)
            self.shape = shape
            self.dtype = dtype
            self.order = 'F' if fortran_order else 'C'
            self.offset = fp.tell()
Example #5
0
def save_large_array(fp, array, axis=0, desc=None):
    """Save a large, potentially memmapped array, into a NPY file, chunk by chunk to avoid loading
    it entirely in memory.

    Parameters
    ----------
    fp : file object
        Destination file, open for binary writing.
    array : np.ndarray
        Array to serialize; must be C-ordered.
    axis : int
        Axis to chunk along; only 0 is supported.
    desc : str, optional
        Description for the tqdm progress bar.
    """
    assert axis == 0  # TODO: support other axes
    version = None
    _check_version(version)
    header = header_data_from_array_1_0(array)
    # Chunks below are serialized with tobytes(), which emits C-order bytes;
    # a header claiming fortran_order would describe the data incorrectly
    # and silently corrupt the file.
    assert not header['fortran_order']
    _write_array_header(fp, header, version)
    N = array.shape[axis]
    if N == 0:
        # Header alone fully describes an empty array.
        return

    # Split the first axis into roughly 100 equal chunks.
    k = int(ceil(float(N) / 100))
    assert k >= 1
    for i in tqdm(range(0, N, k), desc=desc):
        chunk = array[i:i + k, ...]
        fp.write(chunk.tobytes())
Example #6
0
File: io.py  Project: vibhatha/dislib
def load_npy_file(path, block_size):
    """ Loads a file in npy format (must be 2-dimensional).

    Parameters
    ----------
    path : str
        Path to the npy file.
    block_size : tuple (int, int)
        Block size of the resulting ds-array.

    Returns
    -------
    x : ds-array

    Raises
    ------
    ValueError
        If the array is Fortran-ordered, not 2-dimensional, or smaller
        than the requested block size.
    """
    # A with-statement guarantees the file is closed and, unlike the former
    # try/finally wrapped around the open() call itself, cannot raise a
    # NameError on `fid` when open() fails.
    with open(path, "rb") as fid:
        version = format.read_magic(fid)
        format._check_version(version)
        shape, fortran_order, dtype = format._read_array_header(fid, version)

        if fortran_order:
            raise ValueError("Fortran order not supported for npy files")

        if len(shape) != 2:
            raise ValueError("Array is not 2-dimensional")

        if block_size[0] > shape[0] or block_size[1] > shape[1]:
            raise ValueError("Block size is larger than the array")

        blocks = []
        n_blocks = int(ceil(shape[1] / block_size[1]))

        # Read one row-band at a time and split it into column blocks.
        for i in range(0, shape[0], block_size[0]):
            read_count = min(block_size[0], shape[0] - i)
            read_size = int(read_count * shape[1] * dtype.itemsize)
            data = fid.read(read_size)
            # Placeholders to be filled in by the task below.
            out_blocks = [object() for _ in range(n_blocks)]
            _read_from_buffer(data, dtype, shape[1], block_size[1], out_blocks)
            blocks.append(out_blocks)

        return Array(blocks=blocks,
                     top_left_shape=block_size,
                     reg_shape=block_size,
                     shape=shape,
                     sparse=False)
Example #7
0
def load_hstack_npy_files(path, cols_per_block=None):
    """ Loads the .npy files in a directory into a ds-array, stacking them
    horizontally, like (A|B|C). The order of concatenation is alphanumeric.

    At least 1 valid .npy file must exist in the directory, and every .npy file
    must contain a valid array. Every array must have the same dtype, order,
    and number of rows.

    The blocks of the returned ds-array will have the same number of rows as
    the input arrays, and cols_per_block columns, which defaults to the number
    of columns of the first array.

    Parameters
    ----------
    path : string
        Folder path.
    cols_per_block : tuple (int, int)
        Number of columns of the blocks for the output ds-array. If None, the
        number of columns of the first array is used.

    Returns
    -------
    x : ds-array
        A distributed representation (ds-array) of the stacked arrays.

    Raises
    ------
    AssertionError
        If the arrays differ in number of rows, order, or dtype.
    """
    dirlist = os.listdir(path)
    folder_paths = [os.path.join(path, name) for name in sorted(dirlist)]
    # Full path of .npy files in the folder
    files = [
        pth for pth in folder_paths
        if os.path.isfile(pth) and pth.endswith('.npy')
    ]
    # Read the header of the first file to get shape, order, and dtype
    with open(files[0], "rb") as fid:
        version = format.read_magic(fid)
        format._check_version(version)
        shape0, order0, dtype0 = format._read_array_header(fid, version)
    rows = shape0[0]
    if cols_per_block is None:
        cols_per_block = shape0[1]
    # Check that all files have the same number of rows, order and datatype,
    # and store the number of columns for each file.
    files_cols = [shape0[1]]
    for filename in files[1:]:
        with open(filename, "rb") as fid:
            version = format.read_magic(fid)
            format._check_version(version)
            shape, order, dtype = format._read_array_header(fid, version)
            if shape[0] != shape0[0] or order0 != order or dtype0 != dtype:
                raise AssertionError(
                    "All .npy files must have the same number of rows, "
                    "order and dtype (mismatch in %s)" % filename)
            files_cols.append(shape[1])

    # Compute the parameters block_files, start_col and end_col for each block,
    # and call the task _load_hstack_npy_block() to generate each block.
    blocks = []
    file_idx = 0
    start_col = 0
    while file_idx < len(files):
        # A block may span several files: start from start_col of the current
        # file and keep appending files until cols_per_block is reached.
        block_files = [files[file_idx]]
        cols = files_cols[file_idx] - start_col
        while cols < cols_per_block:  # while block not completed
            if file_idx + 1 == len(files):  # last file
                break
            file_idx += 1
            block_files.append(files[file_idx])
            cols += files_cols[file_idx]
        # Compute end_col of last file in block (last block may be smaller)
        end_col = files_cols[file_idx] - max(0, (cols - cols_per_block))
        blocks.append(_load_hstack_npy_block(block_files, start_col, end_col))
        if end_col == files_cols[file_idx]:  # file completed
            file_idx += 1
            start_col = 0
        else:  # file uncompleted
            start_col = end_col

    return Array(blocks=[blocks],
                 top_left_shape=(rows, cols_per_block),
                 reg_shape=(rows, cols_per_block),
                 shape=(rows, sum(files_cols)),
                 sparse=False)