def test_ESPnetDataset_sound_scp(sound_scp):
    """Check the ``data1`` shapes for keys "a" and "b" loaded as type "sound".

    The dataset and its ``names()`` are printed first, so their string
    conversion is exercised as well.
    """
    entries = [(sound_scp, "data1", "sound")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)
    print(dataset)
    print(dataset.names())

    expected_shapes = {"a": (160000,), "b": (80000,)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data1"].shape == shape
def test_ESPnetDataset_pipe_wav(pipe_wav):
    """Check the ``data1`` shapes for keys "a" and "b" loaded as "pipe_wav"."""
    entries = [(pipe_wav, "data1", "pipe_wav")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_shapes = {"a": (160000,), "b": (80000,)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data1"].shape == shape
def test_ESPnetDataset_feats_scp(feats_scp):
    """Check the ``data2`` shapes for keys "a" and "b" loaded as "kaldi_ark"."""
    entries = [(feats_scp, "data2", "kaldi_ark")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_shapes = {"a": (100, 80), "b": (150, 80)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data2"].shape == shape
def test_ESPnetDataset_csv_float(csv_float):
    """Check the ``data8`` values for keys "a" and "b" loaded as "csv_float".

    The comparison uses ``np.testing.assert_array_equal`` instead of
    ``all(x == y)`` so that a shape mismatch fails as an assertion with a
    readable diff, rather than as an unrelated error raised by ``all`` or by
    broadcasting.
    """
    dataset = ESPnetDataset(
        path_name_type_list=[(csv_float, "data8", "csv_float")],
        preprocess=preprocess,
    )

    expected_values = {"a": [1.4, 3.4], "b": [0.9, 9.3]}
    for uid, values in expected_values.items():
        _, data = dataset[uid]
        # Build the expectation in float32 so the comparison is exact at the
        # same precision the original assertion used.
        np.testing.assert_array_equal(
            data["data8"], np.array(values, dtype=np.float32)
        )
def test_ESPnetDataset_csv_int(csv_int):
    """Check the ``data8`` values for keys "a" and "b" loaded as "csv_int"."""
    entries = [(csv_int, "data8", "csv_int")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_values = {"a": (0, 1, 2), "b": (2, 3, 4)}
    for uid, values in expected_values.items():
        _, data = dataset[uid]
        assert tuple(data["data8"]) == values
def test_ESPnetDataset_text(text):
    """Check the ``data7`` values for keys "a" and "b" loaded as type "text"."""
    entries = [(text, "data7", "text")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_values = {"a": (0,), "b": (1,)}
    for uid, values in expected_values.items():
        _, data = dataset[uid]
        assert tuple(data["data7"]) == values
def test_ESPnetDataset_h5file_1(h5file_1):
    """Check the ``data4`` shapes for keys "a" and "b" loaded as type "hdf5"."""
    entries = [(h5file_1, "data4", "hdf5")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_shapes = {"a": (100, 80), "b": (150, 80)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data4"].shape == shape
def test_ESPnetDataset_npy_scp(npy_scp):
    """Check the ``data3`` shapes for keys "a" and "b" loaded as type "npy"."""
    entries = [(npy_scp, "data3", "npy")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_shapes = {"a": (100, 80), "b": (150, 80)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data3"].shape == shape
def test_ESPnetDataset_rand_int(shape_file):
    """Check the ``data6`` shapes for keys "a" and "b" with "rand_int_0_10"."""
    entries = [(shape_file, "data6", "rand_int_0_10")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    expected_shapes = {"a": (100, 80), "b": (150, 80)}
    for uid, shape in expected_shapes.items():
        _, data = dataset[uid]
        assert data["data6"].shape == shape
def test_ESPnetDataset_h5file_2(h5file_2):
    """Check the shapes of the two sub-entries produced by an "hdf5" source.

    The single "data1" entry is expected to surface as ``data1_input`` and
    ``data1_target`` in each returned sample.
    """
    entries = [(h5file_2, "data1", "hdf5")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    # (key, expected input shape, expected target shape)
    cases = (
        ("a", (100, 80), (10,)),
        ("b", (150, 80), (13,)),
    )
    for uid, input_shape, target_shape in cases:
        _, data = dataset[uid]
        assert data["data1_input"].shape == input_shape
        assert data["data1_target"].shape == target_shape
def __init__(
    self,
    path_name_type_list: Collection[Tuple[str, str, str]],
    preprocess: Callable[
        [str, Dict[str, np.ndarray]], Dict[str, np.ndarray]
    ] = None,
    float_dtype: str = "float32",
    int_dtype: str = "long",
    key_file: str = None,
):
    """Set up the dataset from ``(path, name, type)`` entries.

    Entries whose type is in ``DATA_TYPES`` are kept in
    ``self.path_name_type_list``. All other entries are handed to a nested
    ``ESPnetDataset`` stored as ``self.non_iterable_dataset``, which is
    ``None`` when there are no such entries.

    Args:
        path_name_type_list: Non-empty collection of ``(path, name, type)``
            triples; every ``name`` must be unique.
        preprocess: Stored on the instance and forwarded to the nested
            ``ESPnetDataset``.
        float_dtype: Stored on the instance and forwarded to the nested
            ``ESPnetDataset``.
        int_dtype: Stored on the instance and forwarded to the nested
            ``ESPnetDataset``.
        key_file: Stored on the instance; it is not forwarded to the nested
            ``ESPnetDataset``.

    Raises:
        ValueError: If ``path_name_type_list`` is empty.
        RuntimeError: If the same ``name`` appears more than once.
    """
    assert check_argument_types()
    if len(path_name_type_list) == 0:
        raise ValueError(
            '1 or more elements are required for "path_name_type_list"'
        )

    # Work on a private copy of the entries.
    entries = copy.deepcopy(path_name_type_list)

    self.preprocess = preprocess
    self.float_dtype = float_dtype
    self.int_dtype = int_dtype
    self.key_file = key_file

    self.debug_info = {}
    self.path_name_type_list = []
    delegated = []
    for path, name, _type in entries:
        if name in self.debug_info:
            raise RuntimeError(f'"{name}" is duplicated for data-key')
        self.debug_info[name] = path, _type

        bucket = self.path_name_type_list if _type in DATA_TYPES else delegated
        bucket.append((path, name, _type))

    # Some types do not support iterable mode; those go to a plain
    # ESPnetDataset instead.
    self.non_iterable_dataset = (
        ESPnetDataset(
            path_name_type_list=delegated,
            preprocess=preprocess,
            float_dtype=float_dtype,
            int_dtype=int_dtype,
        )
        if delegated
        else None
    )

    # Flag whether a "utt2category" file sits in the same directory as the
    # first entry's path.
    first_path = Path(entries[0][0])
    self.apply_utt2category = (first_path.parent / "utt2category").exists()
def test_ESPnetDataset_imagefolder(imagefolder):
    """Check the samples at indices 0 and 1 loaded as "imagefolder_32x32".

    The test is skipped when ``torchvision`` cannot be imported.
    """
    pytest.importorskip("torchvision")

    entries = [(imagefolder, "data1", "imagefolder_32x32")]
    dataset = ESPnetDataset(path_name_type_list=entries, preprocess=preprocess)

    # (dataset index, expected value carried in "data1_1")
    for index, label in ((0, 0), (1, 1)):
        _, data = dataset[index]
        assert data["data1_0"].shape == (3, 32, 32)
        assert data["data1_1"] == (label,)