def read_matrix_or_vector(fd, endian='<', return_size=False):
    """Read one binary matrix or vector record from ``fd``.

    Call from load_kaldi_file.

    The record must start with the two-byte binary marker (a NUL byte
    followed by ``B``) and a type token obtained through ``read_token``.
    Supported tokens are ``FM``/``FV`` (float32 matrix/vector),
    ``DM``/``DV`` (float64 matrix/vector) and the compressed matrix
    formats ``CM``, ``CM2`` and ``CM3``.

    Args:
        fd (file): Binary-mode file object positioned at the record start.
        endian (str): Byte-order prefix used for both ``struct`` and the
            numpy dtypes (e.g. ``'<'`` or ``'>'``).
        return_size (bool): If True, also return the number of bytes
            accounted for while reading the record.

    Returns:
        np.ndarray, or ``(np.ndarray, int)`` when ``return_size`` is True.

    Raises:
        AssertionError: If the binary marker or a size marker byte is
            not the expected value.
        ValueError: If the type token is not one of the supported ones.
    """
    size = 0
    assert fd.read(2) == b'\0B'
    size += 2

    Type = str(read_token(fd))
    # NOTE(review): "+ 1" presumably accounts for the delimiter byte that
    # read_token consumes after the token -- confirm against read_token.
    size += len(Type) + 1

    if Type == 'CM':
        # Compressed matrix with a global header and per-column headers.
        global_header = GlobalHeader.read(fd, Type, endian)
        size += global_header.size
        per_col_header = PerColHeader.read(fd, global_header)
        size += per_col_header.size

        n_bytes = global_header.rows * global_header.cols
        buf = fd.read(n_bytes)
        size += n_bytes
        array = np.frombuffer(buf, dtype=np.dtype(endian + 'u1'))
        # The payload is laid out column by column, hence (cols, rows)
        # here and the transpose after decompression.
        array = array.reshape((global_header.cols, global_header.rows))

        array = per_col_header.char_to_float(array)
        array = array.T

    elif Type == 'CM2' or Type == 'CM3':
        # Compressed matrix with a global header only; CM2 stores two
        # bytes per element, CM3 stores one.
        global_header = GlobalHeader.read(fd, Type, endian)
        size += global_header.size

        bytes_per_value = 2 if Type == 'CM2' else 1
        n_bytes = bytes_per_value * global_header.rows * global_header.cols
        buf = fd.read(n_bytes)
        # The payload must be counted as well, otherwise the size
        # reported through return_size is too small for CM2/CM3.
        size += n_bytes
        array = np.frombuffer(
            buf, dtype=np.dtype(endian + 'u{}'.format(bytes_per_value)))
        array = array.reshape((global_header.rows, global_header.cols))

        array = global_header.uint_to_float(array)

    else:
        if Type == 'FM' or Type == 'FV':
            dtype = endian + 'f'
            bytes_per_sample = 4
        elif Type == 'DM' or Type == 'DV':
            dtype = endian + 'd'
            bytes_per_sample = 8
        else:
            raise ValueError('Unexpected format: "{}". Now FM, FV, DM, DV, '
                             'CM, CM2, CM3 are supported.'.format(Type))

        # Every integer is preceded by a byte holding its width (4).
        assert fd.read(1) == b'\4'
        size += 1
        rows = struct.unpack(endian + 'i', fd.read(4))[0]
        size += 4
        dim = rows
        is_matrix = 'M' in Type
        if is_matrix:
            assert fd.read(1) == b'\4'
            size += 1
            cols = struct.unpack(endian + 'i', fd.read(4))[0]
            size += 4
            dim = rows * cols

        buf = fd.read(dim * bytes_per_sample)
        size += dim * bytes_per_sample
        array = np.frombuffer(buf, dtype=np.dtype(dtype))
        if is_matrix:
            array = np.reshape(array, (rows, cols))

    if return_size:
        return array, size
    return array
def write_array(fd, array, endian='<', compression_method=None):
    """Write an array as one binary record and return its byte count.

    The record starts with the two-byte binary marker (a NUL byte
    followed by ``B``). What follows depends on the input:

    * ``compression_method`` given: a compressed matrix written through
      ``GlobalHeader`` (and ``PerColHeader`` for the ``CM`` type).
    * int32 vector: the length, then every element, each preceded by a
      byte holding the integer width (4).
    * float32/float64 vector or matrix: a ``FV``/``DV``/``FM``/``DM``
      token, the dimension(s), then the raw samples in ``endian`` order.

    Args:
        fd (file): binary mode
        array (np.ndarray): Data to write.
        endian (str): ``struct`` byte-order prefix, e.g. ``'<'`` or ``'>'``.
        compression_method: Passed to ``GlobalHeader.compute``; ``None``
            writes the array uncompressed.

    Returns:
        size (int): Number of bytes written.

    Raises:
        AssertionError: If ``array`` is not an ndarray or has an
            unsupported number of dimensions for its dtype.
        ValueError: If compression is requested for a non-matrix, or the
            dtype is not int32/float32/float64.
    """
    size = 0
    assert isinstance(array, np.ndarray), type(array)
    fd.write(b'\0B')
    size += 2
    int_fmt = endian + 'i'

    if compression_method is not None:
        if array.ndim != 2:
            raise ValueError(
                'array must be matrix if compression_method is not None: {}'.
                format(array.ndim))

        global_header = GlobalHeader.compute(array, compression_method,
                                             endian)
        size += global_header.write(fd)
        if global_header.type == 'CM':
            per_col_header = PerColHeader.compute(array, global_header)
            size += per_col_header.write(fd, global_header)

            # The CM payload is produced from the transposed matrix.
            byte_string = per_col_header.float_to_char(array.T).tobytes()
            fd.write(byte_string)
            size += len(byte_string)
        elif global_header.type == 'CM2' or global_header.type == 'CM3':
            byte_string = global_header.float_to_uint(array).tobytes()
            fd.write(byte_string)
            size += len(byte_string)

    elif array.dtype == np.int32:
        assert array.ndim == 1, array.ndim  # Must be vector
        # Build the whole record first so it goes out in a single write
        # instead of two writes per element.
        pieces = [b'\4' + struct.pack(int_fmt, len(array))]
        pieces.extend(b'\4' + struct.pack(int_fmt, x) for x in array)
        fd.write(b''.join(pieces))
        size += (len(array) + 1) * 5

    elif array.dtype == np.float32 or array.dtype == np.float64:
        assert 0 < array.ndim < 3  # Matrix or vector
        precision = b'F' if array.dtype == np.float32 else b'D'
        kind = b'V ' if array.ndim == 1 else b'M '
        # Token, then each dimension preceded by its width byte.
        header = precision + kind + b''.join(
            b'\4' + struct.pack(int_fmt, dim) for dim in array.shape)
        fd.write(header)
        size += len(header)

        # Translate the struct prefix to a numpy byte order and convert
        # only when the dtype really differs. Testing
        # "endian not in array.dtype.str" byte-swapped the samples for
        # native prefixes such as '=', because dtype.str only ever
        # contains '<' or '>' for these dtypes.
        struct_to_numpy = {'': '=', '@': '=', '!': '>'}
        wire_dtype = array.dtype.newbyteorder(
            struct_to_numpy.get(endian, endian))
        if array.dtype != wire_dtype:
            array = array.astype(wire_dtype)
        fd.write(array.tobytes())
        size += array.nbytes

    else:
        raise ValueError('Unsupported array type: {}'.format(array.dtype))
    return size
def read_matrix_or_vector(fd, endian="<", return_size=False):
    """Read one binary matrix or vector record from ``fd``.

    Call from load_kaldi_file.

    The record must start with the two-byte binary marker (a NUL byte
    followed by ``B``) and a type token obtained through ``read_token``.
    Supported tokens are ``FM``/``FV`` (float32 matrix/vector),
    ``DM``/``DV`` (float64 matrix/vector) and the compressed matrix
    formats ``CM``, ``CM2`` and ``CM3``.

    Args:
        fd (file): Binary-mode file object positioned at the record start.
        endian (str): Byte-order prefix used for both ``struct`` and the
            numpy dtypes (e.g. ``"<"`` or ``">"``).
        return_size (bool): If True, also return the number of bytes
            accounted for while reading the record.

    Returns:
        np.ndarray, or ``(np.ndarray, int)`` when ``return_size`` is True.

    Raises:
        AssertionError: If the binary marker or a size marker byte is
            not the expected value.
        ValueError: If the type token is not one of the supported ones.
    """
    size = 0
    assert fd.read(2) == b"\0B"
    size += 2

    Type = str(read_token(fd))
    # NOTE(review): "+ 1" presumably accounts for the delimiter byte that
    # read_token consumes after the token -- confirm against read_token.
    size += len(Type) + 1

    if Type == "CM":
        # Compressed matrix with a global header and per-column headers.
        global_header = GlobalHeader.read(fd, Type, endian)
        size += global_header.size
        per_col_header = PerColHeader.read(fd, global_header)
        size += per_col_header.size

        payload_len = global_header.rows * global_header.cols
        buf = fd.read(payload_len)
        size += payload_len
        array = np.frombuffer(buf, dtype=np.dtype(endian + "u1"))
        # The payload is laid out column by column, hence (cols, rows)
        # here and the transpose after decompression.
        array = array.reshape((global_header.cols, global_header.rows))

        array = per_col_header.char_to_float(array)
        array = array.T

    elif Type == "CM2" or Type == "CM3":
        # Compressed matrix with a global header only; CM2 stores two
        # bytes per element, CM3 stores one.
        global_header = GlobalHeader.read(fd, Type, endian)
        size += global_header.size

        item_bytes = 2 if Type == "CM2" else 1
        payload_len = item_bytes * global_header.rows * global_header.cols
        buf = fd.read(payload_len)
        # The payload must be counted as well, otherwise the size
        # reported through return_size is too small for CM2/CM3.
        size += payload_len
        array = np.frombuffer(
            buf, dtype=np.dtype(endian + "u{}".format(item_bytes)))
        array = array.reshape((global_header.rows, global_header.cols))

        array = global_header.uint_to_float(array)

    else:
        if Type == "FM" or Type == "FV":
            dtype = endian + "f"
            bytes_per_sample = 4
        elif Type == "DM" or Type == "DV":
            dtype = endian + "d"
            bytes_per_sample = 8
        else:
            raise ValueError('Unexpected format: "{}". Now FM, FV, DM, DV, '
                             "CM, CM2, CM3 are supported.".format(Type))

        # Every integer is preceded by a byte holding its width (4).
        assert fd.read(1) == b"\4"
        size += 1
        rows = struct.unpack(endian + "i", fd.read(4))[0]
        size += 4
        dim = rows
        is_matrix = "M" in Type
        if is_matrix:
            assert fd.read(1) == b"\4"
            size += 1
            cols = struct.unpack(endian + "i", fd.read(4))[0]
            size += 4
            dim = rows * cols

        buf = fd.read(dim * bytes_per_sample)
        size += dim * bytes_per_sample
        array = np.frombuffer(buf, dtype=np.dtype(dtype))
        if is_matrix:
            array = np.reshape(array, (rows, cols))

    if return_size:
        return array, size
    return array