def test_append_mode(tmpdir): path = tmpdir.mkdir("test") a = np.random.rand(1000, 120).astype(np.float32) b = np.random.rand(10, 120).astype(np.float32) origin = {"Ï,é,à": a, "あいうえお": b} kaldiio.save_ark(path.join("a.ark").strpath, origin, scp=path.join("b.scp").strpath) kaldiio.save_ark( path.join("a2.ark").strpath, {"Ï,é,à": a}, scp=path.join("b2.scp").strpath, append=True, ) kaldiio.save_ark( path.join("a2.ark").strpath, {"あいうえお": b}, scp=path.join("b2.scp").strpath, append=True, ) d1 = {k: v for k, v in kaldiio.load_ark(path.join("a.ark").strpath)} d2 = {k: v for k, v in kaldiio.load_scp(path.join("b.scp").strpath).items()} d3 = {k: v for k, v in kaldiio.load_ark(path.join("a2.ark").strpath)} d4 = {k: v for k, v in kaldiio.load_scp(path.join("b2.scp").strpath).items()} _compare(d1, origin) _compare(d2, origin) _compare(d3, origin) _compare(d4, origin)
def __init__(self, spkrs_sz=-1, shuffle=True, wnd_size=170, utter_start=0,
             apply_vad=True, cache=0):
    # data path
    if hp.training:
        self.path = hp.data.train_path
        self.utter_num = hp.train.M
    else:
        self.path = hp.data.test_path
        self.utter_num = hp.test.M
    self.shuffle = shuffle
    self.wnd_size = wnd_size  # (140, 180)
    self.utter_start = utter_start
    self.apply_vad = apply_vad
    depends = [os.path.join(self.path, x) for x in ['feats.scp', 'spk2utt']]
    self.feat_reader = kaldiio.load_scp(depends[0])
    self.spk2utt = kio.Reader(depends[1], num_tokens=-1)
    self.speakers = self.spk2utt.index_keys
    if spkrs_sz > 0:
        self.speakers = self.speakers[:spkrs_sz]
    if self.apply_vad:
        vadf = os.path.join(self.path, 'vad.scp')
        self.vadscp = kaldiio.load_scp(vadf)
        print('[INFO] applying VAD: %s' % vadf)
        # TODO
        self._remove_silent_utter()
    else:
        print('[INFO] not applying VAD; expecting non-silent features')
def test_append_mode(tmpdir):
    path = tmpdir.mkdir('test')
    a = np.random.rand(1000, 120).astype(np.float32)
    b = np.random.rand(10, 120).astype(np.float32)
    origin = {'a': a, 'b': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('b.scp').strpath)
    kaldiio.save_ark(path.join('a2.ark').strpath, {'a': a},
                     scp=path.join('b2.scp').strpath, append=True)
    kaldiio.save_ark(path.join('a2.ark').strpath, {'b': b},
                     scp=path.join('b2.scp').strpath, append=True)
    d1 = {k: v for k, v in kaldiio.load_ark(path.join('a.ark').strpath)}
    d2 = {k: v
          for k, v in kaldiio.load_scp(path.join('b.scp').strpath).items()}
    d3 = {k: v for k, v in kaldiio.load_ark(path.join('a2.ark').strpath)}
    d4 = {k: v
          for k, v in kaldiio.load_scp(path.join('b2.scp').strpath).items()}
    _compare(d1, origin)
    _compare(d2, origin)
    _compare(d3, origin)
    _compare(d4, origin)
def preprocess_data(self, file_dir, apply_sort_filter=True):
    """Generate a list of tuples (feat_key, speaker)."""
    logging.info("Loading kaldi-format feats.scp, labels.scp "
                 "and utt2spk (optional) from {}".format(file_dir))
    self.kaldi_io_feats = kaldiio.load_scp(os.path.join(file_dir, "feats.scp"))
    self.kaldi_io_labels = kaldiio.load_scp(os.path.join(file_dir, "labels.scp"))

    # data checking
    if self.kaldi_io_feats.keys() != self.kaldi_io_labels.keys():
        logging.info("Error: feats.scp and labels.scp do not contain "
                     "the same keys, please check your data.")
        sys.exit()

    # initialize all speakers with 'global' unless
    # 'utterance_key speaker' is specified in "utt2spk"
    self.speakers = dict.fromkeys(self.kaldi_io_feats.keys(), 'global')
    if os.path.exists(os.path.join(file_dir, "utt2spk")):
        with open(os.path.join(file_dir, "utt2spk"), "r") as f:
            lines = f.readlines()
            for line in lines:
                key, spk = line.strip().split(" ", 1)
                self.speakers[key] = spk

    self.entries = []
    for key in self.kaldi_io_feats.keys():
        self.entries.append(tuple([key, self.speakers[key]]))

    if apply_sort_filter:
        logging.info("Sorting and filtering data, this is very slow, please be patient ...")
        self.entries.sort(key=lambda item: self.kaldi_io_feats[item[0]].shape[0])
        self.filter_sample_by_unk()
        self.filter_sample_by_input_length()
        self.filter_sample_by_output_length()
    return self
def __init__(self, model_path, trials_path, scp_path, batch_size=1,
             use_gpu=False, mgr=None, processes=None, check=False):
    # match the check to the message: zero or negative process counts are invalid
    if processes is not None and processes <= 0:
        raise ValueError('`processes` should be a positive integer.')
    self.check = check
    self.model_path = model_path
    self.scp_path = scp_path
    self.batch_size = batch_size
    self.use_gpu = use_gpu
    self.processes = int(processes) if processes is not None else None
    self.trials = []
    self.data = load_scp(scp_path)
    self.shared_scores = mgr.dict()
    self.miss = 0
    with open(trials_path, 'r') as f:
        for line in f:
            tmp = line.strip().split()
            if len(tmp) != 3:
                continue
            utt1, utt2, is_target = tmp
            self.trials.append((utt1, utt2, is_target))
    self.tot = len(self.trials)
def scp2array_dic(scp_path, array_dic=None, ark_path=None,
                  compression_method=None, append=False):
    """Read an array dict from the ark indexed by an scp, or write an array
    dict to an ark while creating the scp that indexes it.

    :param scp_path: filepath of the scp
    :param array_dic: dict of numpy arrays; if None, operate in read mode
    :param ark_path: filepath of the ark; defaults to scp_path.replace('.scp', '.ark')
    :param compression_method: compression method; default=None,
        kAutomaticMethod=1, kSpeechFeature=2, kTwoByteAuto=3,
        kTwoByteSignedInteger=4, kOneByteAuto=5, kOneByteUnsignedInteger=6,
        kOneByteZeroOne=7
    :param append: if True, append; otherwise overwrite
    :return: dict of numpy arrays when reading, None when writing
    """
    if array_dic is None:
        array_dic = kaldiio.load_scp(scp_path)
        return array_dic
    else:
        if ark_path is None:
            ark_path = scp_path.replace(".scp", ".ark")
        kaldiio.save_ark(
            ark=ark_path,
            array_dict=array_dic,
            scp=scp_path,
            compression_method=compression_method,
            append=append,
        )
        return None
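# A minimal usage sketch for scp2array_dic, assuming hypothetical file names
# "feats.scp"/"feats.ark": write a dict of arrays, then read it back lazily.
import numpy as np

feats = {"utt1": np.random.rand(100, 40).astype(np.float32)}
scp2array_dic("feats.scp", array_dic=feats)  # write mode: creates feats.ark and feats.scp
restored = scp2array_dic("feats.scp")        # read mode: returns a lazy dict-like reader
assert restored["utt1"].shape == (100, 40)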
def test_write_read_multiark(tmpdir, endian, dtype):
    path = tmpdir.mkdir('test')
    a = np.random.rand(1000, 120).astype(dtype)
    b = np.random.rand(10, 120).astype(dtype)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('b.scp').strpath, endian=endian)

    c = np.random.rand(1000, 120).astype(dtype)
    d = np.random.rand(10, 120).astype(dtype)
    origin.update({u'c': c, u'd': d})
    with io.open(path.join('b.scp').strpath, 'a', encoding='utf-8') as f:
        kaldiio.save_ark(path.join('b.ark').strpath, origin, scp=f,
                         endian=endian)

    d5 = {k: v
          for k, v in kaldiio.load_scp(path.join('b.scp').strpath,
                                       endian=endian).items()}
    _compare(d5, origin)
def preprocess_data(self, file_path, apply_sort_filter=True):
    """Generate a list of tuples (feat_key, speaker)."""
    logging.info("Loading kaldi-format feats.scp "
                 "and utt2spk (optional) from {}".format(file_path))
    self.kaldi_io_feats = kaldiio.load_scp(os.path.join(file_path, "feats.scp"))

    # initialize all speakers with 'global'
    # unless 'utterance_key speaker' is specified in "utt2spk"
    self.speakers = dict.fromkeys(self.kaldi_io_feats.keys(), 'global')
    if os.path.exists(os.path.join(file_path, "utt2spk")):
        with open(os.path.join(file_path, "utt2spk"), "r") as f:
            lines = f.readlines()
            for line in lines:
                key, spk = line.strip().split(" ", 1)
                self.speakers[key] = spk

    self.entries = []
    for key in self.kaldi_io_feats.keys():
        self.entries.append(tuple([key, self.speakers[key]]))

    if apply_sort_filter:
        logging.info("Sorting and filtering data, this is very slow, please be patient ...")
        self.entries.sort(key=lambda item: self.kaldi_io_feats[item[0]].shape[0])
        # filter length of frames
        self.entries = list(filter(
            lambda x: self.kaldi_io_feats[x[0]].shape[0] in range(
                self.hparams.input_length_range[0],
                self.hparams.input_length_range[1]),
            self.entries))
    return self
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    logging.info("CUDA_VISIBLE_DEVICES=" + os.environ.get("CUDA_VISIBLE_DEVICES", ""))
    logging.info("HOST=" + os.environ.get("HOST", ""))
    logging.info("SLURM_JOB_ID=" + os.environ.get("SLURM_JOB_ID", ""))
    model_dir = Path(args.model_dir)
    forward_dir = model_dir / "forward"
    aux_scp = kaldiio.load_scp(args.aux_scp)
    model = torch.load(model_dir / "model.pickle", map_location="cpu")
    model.eval()
    with torch.no_grad(), open(args.forward_ark, "wb") as f:
        for key, feat in kaldi_io.read_mat_ark(args.input_rs):
            aux = torch.from_numpy(aux_scp[key])
            logging.info("input: key={} feat={} aux={}".format(key, feat.shape, aux.shape))
            # feat is (time, freq) shape
            x = torch.from_numpy(feat.T).unsqueeze(0)
            if x.shape[2] < args.min_time_width:
                remain = args.min_time_width - x.shape[2] + 1
                # integer division: torch.zeros requires integer sizes
                lpad = torch.zeros(1, x.shape[1], remain // 2)
                rpad = torch.zeros(1, x.shape[1], remain // 2)
                x = torch.cat((lpad, x, rpad), dim=2)
            n_aux = aux.shape[0]
            # take center ivector frame
            if args.use_last_ivector:
                aux = aux[-1].unsqueeze(0)
            else:
                aux = aux[n_aux // 2].unsqueeze(0)
            # forward
            y, _ = model(x, aux)
            y = torch.nn.functional.log_softmax(y, dim=1).squeeze(0)
            logging.info("output: {}".format(y.shape))
            kaldi_io.write_mat(f, y.numpy().T, key)
def __init__(self, feats_file, labels_file=None, root_dir=None,
             transform=None, target_transform=None, max_timestep=1024,
             max_label_len=128):
    """
    Args:
        feats_file (string): Kaldi-style feats.scp file name, relative to root_dir.
        labels_file (string): optional text labels file name, relative to root_dir.
        root_dir (string): directory containing the above files.
    """
    self.feats = kaldiio.load_scp(os.path.join(root_dir, feats_file))
    if labels_file is not None:
        self.labels = self._load_text(os.path.join(root_dir, labels_file))
    else:
        self.labels = None
    self.utts = list(self.feats.keys())
    self.transform = transform
    self.target_transform = target_transform
    self.max_timestep = max_timestep
    self.max_label_len = max_label_len
    # These will contain indices over the original dataset. The indices of
    # the safe samples will go into _safe_indices and similarly for unsafe
    # samples.
    self._safe_indices = []
    self._unsafe_indices = []
def __init__(self, utt_path, tri_path, scp_path, anchor_size, buffer_size,
             use_gpu=True):
    self.datas = load_scp(scp_path)
    self.anchor_size = anchor_size
    self.utts = []
    self.use_gpu = use_gpu
    self.tot_line_num = -1
    self.tot_utts = -1
    self.buffer_size = buffer_size
    # count the total number of utterances by scanning the file once
    with open(utt_path, 'r') as up:
        for self.tot_utts, line in enumerate(up):
            pass
    self.tot_utts += 1
    self.buffer = {}
    self.utt_handler = open(utt_path, 'r')
    # count the total number of trial lines the same way
    with open(tri_path, 'r') as tp:
        for self.tot_line_num, line in enumerate(tp):
            pass
    self.tot_line_num += 1
    self.tris_handler = open(tri_path, 'r')
    self.tris = []
def test_write_read_int32_vector(tmpdir, endian):
    path = tmpdir.mkdir('test')
    a = np.random.randint(1, 128, 10, dtype=np.int32)
    b = np.random.randint(1, 128, 10, dtype=np.int32)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('b.scp').strpath, endian=endian)

    d2 = {k: v
          for k, v in kaldiio.load_ark(path.join('a.ark').strpath,
                                       endian=endian)}
    d5 = {k: v
          for k, v in kaldiio.load_scp(path.join('b.scp').strpath,
                                       endian=endian).items()}
    with io.open(path.join('a.ark').strpath, 'rb') as fd:
        d6 = {k: v for k, v in kaldiio.load_ark(fd, endian=endian)}
    _compare(d2, origin)
    _compare(d5, origin)
    _compare(d6, origin)
def test_write_read(tmpdir, shape1, shape2, endian, dtype, max_cache_fd):
    path = tmpdir.mkdir("test")
    a = np.random.rand(*shape1).astype(dtype)
    b = np.random.rand(*shape2).astype(dtype)
    origin = {"Ï,é,à": a, "あいうえお": b}
    kaldiio.save_ark(
        path.join("a.ark").strpath,
        origin,
        scp=path.join("b.scp").strpath,
        endian=endian,
    )
    d2 = {k: v for k, v in kaldiio.load_ark(path.join("a.ark").strpath, endian=endian)}
    d5 = {
        k: v
        for k, v in kaldiio.load_scp(
            path.join("b.scp").strpath, endian=endian, max_cache_fd=max_cache_fd
        ).items()
    }
    with io.open(path.join("a.ark").strpath, "rb") as fd:
        d6 = {k: v for k, v in kaldiio.load_ark(fd, endian=endian)}
    _compare(d2, origin)
    _compare(d5, origin)
    _compare(d6, origin)
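# Note on max_cache_fd: kaldiio.load_scp keeps up to that many ark file
# descriptors open between lookups, avoiding a re-open on every random access;
# 0 (the default) disables the cache. A sketch with a hypothetical scp path:
lazy = kaldiio.load_scp("feats.scp", max_cache_fd=32)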
def _get_feats_scp_loader(feats_scp):
    # read the first line of feats.scp file
    with open(feats_scp) as f:
        key, value = f.readlines()[0].replace("\n", "").split()

    # check scp type
    if ":" in value:
        value_1, value_2 = value.split(":")
        if value_1.endswith(".ark"):
            # kaldi-ark case: utt_id_1 /path/to/utt_id_1.ark:index
            return kaldiio.load_scp(feats_scp)
        elif value_1.endswith(".h5"):
            # hdf5 case with path in hdf5: utt_id_1 /path/to/utt_id_1.h5:feats
            return HDF5ScpLoader(feats_scp)
        else:
            raise ValueError("Not supported feats.scp type.")
    else:
        if value.endswith(".h5"):
            # hdf5 case without path in hdf5: utt_id_1 /path/to/utt_id_1.h5
            return HDF5ScpLoader(feats_scp)
        elif value.endswith(".npy"):
            # npy case: utt_id_1 /path/to/utt_id_1.npy
            return NpyScpLoader(feats_scp)
        else:
            raise ValueError("Not supported feats.scp type.")
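# The three feats.scp layouts dispatched above, for reference (hypothetical
# utterance ids and paths); each line is "<utt_id> <location>":
#   utt_id_1 /path/to/feats.ark:123      -> kaldiio.load_scp (Kaldi ark with byte offset)
#   utt_id_1 /path/to/utt_id_1.h5:feats  -> HDF5ScpLoader (dataset path inside the HDF5 file)
#   utt_id_1 /path/to/utt_id_1.npy       -> NpyScpLoader (one numpy file per utterance)
loader = _get_feats_scp_loader("dump/feats.scp")  # hypothetical path
feat = loader["utt_id_1"]  # every loader type supports dict-style access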
def test_write_read_multiark(tmpdir, endian, dtype): path = tmpdir.mkdir("test") a = np.random.rand(1000, 120).astype(dtype) b = np.random.rand(10, 120).astype(dtype) origin = {"Ï,é,à": a, "あいうえお": b} kaldiio.save_ark( path.join("a.ark").strpath, origin, scp=path.join("b.scp").strpath, endian=endian, ) c = np.random.rand(1000, 120).astype(dtype) d = np.random.rand(10, 120).astype(dtype) origin.update({"c": c, "d": d}) with io.open(path.join("b.scp").strpath, "a", encoding="utf-8") as f: kaldiio.save_ark(path.join("b.ark").strpath, origin, scp=f, endian=endian) d5 = { k: v for k, v in kaldiio.load_scp(path.join("b.scp").strpath, endian=endian).items() } _compare(d5, origin)
def test_write_read_compress(tmpdir, compression_method, endian):
    path = tmpdir.mkdir('test')
    a = np.random.rand(1000, 120).astype(np.float32)
    b = np.random.rand(10, 120).astype(np.float32)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('b.scp').strpath,
                     compression_method=compression_method, endian=endian)

    d2 = {k: v
          for k, v in kaldiio.load_ark(path.join('a.ark').strpath,
                                       endian=endian)}
    d5 = {k: v
          for k, v in kaldiio.load_scp(path.join('b.scp').strpath,
                                       endian=endian).items()}
    with io.open(path.join('a.ark').strpath, 'rb') as fd:
        d6 = {k: v for k, v in kaldiio.load_ark(fd, endian=endian)}
    _compare_allclose(d2, origin, atol=1e-1)
    _compare_allclose(d5, origin, atol=1e-1)
    _compare_allclose(d6, origin, atol=1e-1)
def test_write_read(tmpdir, shape1, shape2, endian, dtype):
    path = tmpdir.mkdir('test')
    a = np.random.rand(*shape1).astype(dtype)
    b = np.random.rand(*shape2).astype(dtype)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('b.scp').strpath, endian=endian)

    d2 = {k: v
          for k, v in kaldiio.load_ark(path.join('a.ark').strpath,
                                       endian=endian)}
    d5 = {k: v
          for k, v in kaldiio.load_scp(path.join('b.scp').strpath,
                                       endian=endian).items()}
    with io.open(path.join('a.ark').strpath, 'rb') as fd:
        d6 = {k: v for k, v in kaldiio.load_ark(fd, endian=endian)}
    _compare(d2, origin)
    _compare(d5, origin)
    _compare(d6, origin)
def __init__(self, wav_scp_path, text_path, tgt_dict, shuffle=False):
    super().__init__()
    self._sample_rate = None
    self.wav_dict = kaldiio.load_scp(wav_scp_path)
    self.text_dict, self.sizes = load_text(text_path, tgt_dict)
    self.id2key = list(self.text_dict.keys())
    self.tgt_dict = tgt_dict
    self.shuffle = shuffle
def __init__(self, config=None):
    super().__init__(config=config)
    self.seg_wav = kaldiio.load_scp(self.hparams.wav_scp,
                                    segments=self.hparams.seg_file)
    if self.hparams.spectral_augmentation is not None:
        self.spectral_aug = SpecAugment(self.hparams.spectral_augmentation)
    else:
        self.spectral_aug = None
def __init__(self, storage_path: Pathlike, *args, **kwargs):
    if not is_module_available('kaldiio'):
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
    import kaldiio
    super().__init__()
    self.storage_path = storage_path
    self.storage = kaldiio.load_scp(str(self.storage_path))
def _read_scp(self, input_file):
    loader = kaldiio.load_scp(input_file)
    feats = {}
    for k in loader:
        feats[k] = loader[k]
    return feats
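# kaldiio.load_scp returns a lazy, dict-like reader; the loop above eagerly
# materializes every matrix into memory, which is convenient for small sets
# but costly for large ones. A lazy alternative (hypothetical path):
lazy_feats = kaldiio.load_scp("data/train/feats.scp")
mat = lazy_feats["utt1"]  # read from disk only when the key is accessed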
def pipe_wav_loader(path, float_dtype):
    # The file is as follows:
    #   utterance_id_A cat a.wav |
    #   utterance_id_B cat b.wav |
    # NOTE(kamo): I don't think this case is practical
    # because subprocess takes much time due to fork().
    # NOTE(kamo): kaldiio doesn't normalize the signal.
    loader = kaldiio.load_scp(path)
    return AdapterForSoundScpReader(loader, float_dtype)
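# A hedged usage sketch, assuming a wav.scp with pipe-style entries like the
# comment above; the returned adapter is dict-like and yields the waveform
# per utterance id (paths and key are hypothetical):
wav_reader = pipe_wav_loader("wav.scp", "float32")
audio = wav_reader["utterance_id_A"]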
def test_write_read_ascii(tmpdir):
    path = tmpdir.mkdir("test")
    a = np.random.rand(10, 10).astype(np.float32)
    b = np.random.rand(5, 35).astype(np.float32)
    origin = {"Ï,é,à": a, "あいうえお": b}
    kaldiio.save_ark(
        path.join("a.ark").strpath, origin, scp=path.join("a.scp").strpath, text=True
    )
    d2 = {k: v for k, v in kaldiio.load_ark(path.join("a.ark").strpath)}
    d5 = {k: v for k, v in kaldiio.load_scp(path.join("a.scp").strpath).items()}
    _compare_allclose(d2, origin)
    _compare_allclose(d5, origin)
def __init__(self, data_dir, wnd_size=170, rnd_chunks=True, apply_vad=True,
             cache=0):
    # data path
    self.path = data_dir
    self.wnd_size = wnd_size
    self.hop_size = wnd_size // 2
    self.rnd_chunks = rnd_chunks
    self.apply_vad = apply_vad
    self.num_chunks = 128  # int(self._average_vad() // self.hop_size)
    print('[INFO] number of windows sampled from every file: %d'
          % self.num_chunks)
    depends = [os.path.join(self.path, x) for x in ['feats.scp', 'vad.scp']]
    self.feat_reader = kaldiio.load_scp(depends[0])
    self.utter_list = list(self.feat_reader.keys())
    if self.apply_vad:
        vadf = os.path.join(self.path, 'vad.scp')
        self.vadscp = kaldiio.load_scp(vadf)
        print('[INFO] applying VAD: %s' % vadf)
        self._remove_silent_utter()
    else:
        print('[INFO] not applying VAD; expecting non-silent features')
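# For reference, vad.scp (as produced by Kaldi's compute-vad) maps each
# utterance to a frame-level 0/1 vector aligned with feats.scp. A hedged
# sketch of masking out silent frames (hypothetical paths and key, not the
# source's _remove_silent_utter):
import kaldiio

feat_reader = kaldiio.load_scp("feats.scp")
vad_reader = kaldiio.load_scp("vad.scp")
feat = feat_reader["utt1"]       # (T, D) feature matrix
vad = vad_reader["utt1"]         # (T,) vector of 0.0/1.0
voiced = feat[vad.astype(bool)]  # keep only voiced frames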
def load_from_scp(self, options, filename=None):
    scpfile = filename or self.expdir / "feats.scp"
    features = dict(kaldiio.load_scp(str(scpfile)))
    labels = {
        uttid: self.coder.encode(read_task(task))
        for spkr in self.dataconf.sections()
        for uttid, task in zip(
            *self.load_tasks(Path(self.dataconf[spkr].get("tasks"))))
    }
    features, labels = self.check_errors(features, labels, options.errors)
    return self.make_splits(features, labels, options)
def __init__(
    self,
    wav_scp,
    segments=None,
    audio_length_threshold=None,
    return_utt_id=False,
    return_sampling_rate=False,
    allow_cache=False,
):
    """Initialize dataset.

    Args:
        wav_scp (str): Kaldi-style wav.scp file.
        segments (str): Kaldi-style segments file.
        audio_length_threshold (int): Threshold to remove short audio files.
        return_utt_id (bool): Whether to return utterance id.
        return_sampling_rate (bool): Whether to return sampling rate.
        allow_cache (bool): Whether to allow cache of the loaded files.

    """
    # load scp as lazy dict
    audio_loader = kaldiio.load_scp(wav_scp, segments=segments)
    audio_keys = list(audio_loader.keys())

    # filter by threshold
    if audio_length_threshold is not None:
        audio_lengths = [audio.shape[0] for _, audio in audio_loader.values()]
        idxs = [
            idx for idx in range(len(audio_keys))
            if audio_lengths[idx] > audio_length_threshold
        ]
        if len(audio_keys) != len(idxs):
            logging.warning(
                f"Some files are filtered by audio length threshold "
                f"({len(audio_keys)} -> {len(idxs)}).")
        audio_keys = [audio_keys[idx] for idx in idxs]

    self.audio_loader = audio_loader
    self.utt_ids = audio_keys
    self.return_utt_id = return_utt_id
    self.return_sampling_rate = return_sampling_rate
    self.allow_cache = allow_cache
    if allow_cache:
        # NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0
        self.manager = Manager()
        self.caches = self.manager.list()
        self.caches += [() for _ in range(len(self.utt_ids))]
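# Note on the loader: for a wav.scp, kaldiio.load_scp yields
# (sampling_rate, samples) tuples, which is why the length filter above
# unpacks "for _, audio in audio_loader.values()"; with a segments file each
# entry is sliced to its segment. Hypothetical file contents:
#   wav.scp:   rec1 /path/to/rec1.wav
#   segments:  utt1 rec1 0.00 1.50
loader = kaldiio.load_scp("wav.scp", segments="segments")
fs, audio = loader["utt1"]  # sampling rate and the 0.00-1.50 s slice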
def __init__(self, scp_path, trials, use_gpu=False, pre_load=False):
    super(KaldiTester, self).__init__()
    self.datas = load_scp(scp_path)
    self.tot = len(self.datas)
    self.use_gpu = use_gpu
    self.trials = trials
    self.miss = 0
    self.pre_load = pre_load
    if self.pre_load:
        self.data = {}
        for key in self.datas:
            self.data[key] = torch.from_numpy(self.datas.get(key))
            if self.use_gpu:
                self.data[key] = self.data[key].cuda()
def process(self):
    label_dict = self.get_label_dict()
    data = kaldiio.load_scp(self.feat_scp)
    with open(self.manifest, 'w') as fid:
        for key in data:
            x = data[key]
            x = x - self.mean
            x = x * self.istd
            label = Label2Indx[label_dict[key]]
            datum = {'feats': x, 'label': label}
            save_path = self.store_folder + '/' + str(
                uuid.uuid1()) + '__' + key + '.npy'
            np.save(save_path, datum)
            fid.write(save_path + '\n')
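# Note: np.save on a Python dict stores a 0-d object array, so reading a
# manifest entry back requires allow_pickle=True (hypothetical path):
datum = np.load("store/1234__utt1.npy", allow_pickle=True).item()
feats, label = datum['feats'], datum['label']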
def __init__(self, scp_path, trial_path, use_gpu=True):
    self.datas = load_scp(scp_path)
    self.trials = []
    self.utt_buffer = {}
    tmp_set = set()
    with open(trial_path, 'r') as f:
        for line in f:
            utt1, utt2, label = line.strip().split()
            tmp_set.add(utt1)
            tmp_set.add(utt2)
            self.trials.append((utt1, utt2, int(label)))
    for utt in tmp_set:
        xvec = torch.from_numpy(self.datas.get(utt)).float()
        self.utt_buffer[utt] = xvec.cuda() if use_gpu else xvec
def test_write_load_ascii(tmpdir):
    path = tmpdir.mkdir('test')
    a = np.random.rand(10, 10).astype(np.float32)
    b = np.random.rand(5, 35).astype(np.float32)
    origin = {'a': a, 'b': b}
    kaldiio.save_ark(path.join('a.ark').strpath, origin,
                     scp=path.join('a.scp').strpath, text=True)
    d2 = {k: v for k, v in kaldiio.load_ark(path.join('a.ark').strpath)}
    d5 = {k: v
          for k, v in kaldiio.load_scp(path.join('a.scp').strpath).items()}
    _compare_allclose(d2, origin)
    _compare_allclose(d5, origin)