Пример #1
0
def test_append_mode(tmpdir):
    """Check that save_ark with append=True accumulates entries.

    One ark/scp pair is written in a single call, a second pair is written
    one entry at a time with ``append=True``; everything read back from
    either pair is compared against the original arrays.
    """
    workdir = tmpdir.mkdir("test")
    single_ark = workdir.join("a.ark").strpath
    single_scp = workdir.join("b.scp").strpath
    appended_ark = workdir.join("a2.ark").strpath
    appended_scp = workdir.join("b2.scp").strpath

    a = np.random.rand(1000, 120).astype(np.float32)
    b = np.random.rand(10, 120).astype(np.float32)
    origin = {"Ï,é,à": a, "あいうえお": b}

    # Reference pair: both arrays written at once.
    kaldiio.save_ark(single_ark, origin, scp=single_scp)

    # Appended pair: one array per call, same target files.
    for key, matrix in (("Ï,é,à", a), ("あいうえお", b)):
        kaldiio.save_ark(appended_ark, {key: matrix}, scp=appended_scp, append=True)

    loaded = [
        dict(kaldiio.load_ark(single_ark)),
        dict(kaldiio.load_scp(single_scp).items()),
        dict(kaldiio.load_ark(appended_ark)),
        dict(kaldiio.load_scp(appended_scp).items()),
    ]
    for result in loaded:
        _compare(result, origin)
    def __init__(self, spkrs_sz=-1, shuffle=True, wnd_size=170, utter_start=0, apply_vad=True, cache=0):
        """Open the feature reader and speaker index for the current mode.

        Args:
            spkrs_sz: when positive, keep only the first ``spkrs_sz`` speakers.
            shuffle: stored on the instance as ``self.shuffle``.
            wnd_size: stored as ``self.wnd_size``.
            utter_start: stored as ``self.utter_start``.
            apply_vad: when true, ``vad.scp`` from the data directory is loaded too.
            cache: not used anywhere in this constructor.
        """
        # data path and utterances-per-speaker depend on train/test mode
        if hp.training:
            data_dir, utter_num = hp.data.train_path, hp.train.M
        else:
            data_dir, utter_num = hp.data.test_path, hp.test.M
        self.path = data_dir
        self.utter_num = utter_num

        self.shuffle = shuffle
        self.wnd_size = wnd_size  # (140, 180)
        self.utter_start = utter_start
        self.apply_vad = apply_vad

        feats_scp = os.path.join(self.path, 'feats.scp')
        spk2utt_file = os.path.join(self.path, 'spk2utt')

        self.feat_reader = kaldiio.load_scp(feats_scp)
        self.spk2utt = kio.Reader(spk2utt_file, num_tokens=-1)

        all_speakers = self.spk2utt.index_keys
        self.speakers = all_speakers[:spkrs_sz] if spkrs_sz > 0 else all_speakers

        if not self.apply_vad:
            print('[INFO] do not apply VAD, expect non silent files are fed')
            return

        vadf = os.path.join(self.path, 'vad.scp')
        self.vadscp = kaldiio.load_scp(vadf)
        print('[INFO] applying VAD: %s' % vadf)
        # TODO self._remove_silent_utter()
Пример #3
0
def test_append_mode(tmpdir):
    """Write one ark/scp pair at once and another via append=True, then compare.

    All four read-backs (ark and scp of each pair) are checked against the
    original arrays with ``_compare``.
    """
    root = tmpdir.mkdir('test')

    def _file(name):
        # Absolute path (as str) of a file inside the temporary test directory.
        return root.join(name).strpath

    a = np.random.rand(1000, 120).astype(np.float32)
    b = np.random.rand(10, 120).astype(np.float32)
    origin = {'a': a, 'b': b}

    kaldiio.save_ark(_file('a.ark'), origin, scp=_file('b.scp'))

    # Two separate calls targeting the same files, each adding one entry.
    kaldiio.save_ark(_file('a2.ark'), {'a': a}, scp=_file('b2.scp'), append=True)
    kaldiio.save_ark(_file('a2.ark'), {'b': b}, scp=_file('b2.scp'), append=True)

    d1 = dict(kaldiio.load_ark(_file('a.ark')))
    d2 = dict(kaldiio.load_scp(_file('b.scp')).items())
    d3 = dict(kaldiio.load_ark(_file('a2.ark')))
    d4 = dict(kaldiio.load_scp(_file('b2.scp')).items())

    for loaded in (d1, d2, d3, d4):
        _compare(loaded, origin)
Пример #4
0
    def preprocess_data(self, file_dir, apply_sort_filter=True):
        """Build ``self.entries`` as a list of ``(feat_key, speaker)`` tuples.

        Loads ``feats.scp`` and ``labels.scp`` from ``file_dir`` and exits the
        process if their key sets differ. Every key starts with the speaker
        ``'global'``; an optional ``utt2spk`` file ("key speaker" per line)
        overrides that. When ``apply_sort_filter`` is true the entries are
        sorted by the first dimension of their features and passed through the
        three ``filter_sample_by_*`` methods. Returns ``self``.
        """
        logging.info("Loading kaldi-format feats.scp, labels.scp and utt2spk (optional) from {}".format(file_dir))
        feats_scp = os.path.join(file_dir, "feats.scp")
        labels_scp = os.path.join(file_dir, "labels.scp")
        self.kaldi_io_feats = kaldiio.load_scp(feats_scp)
        self.kaldi_io_labels = kaldiio.load_scp(labels_scp)

        # data checking
        keys_match = self.kaldi_io_feats.keys() == self.kaldi_io_labels.keys()
        if not keys_match:
            logging.info("Error: feats.scp and labels.scp does not contain same keys, please check your data.")
            sys.exit()

        # default speaker is 'global'; utt2spk entries take precedence when the file exists
        self.speakers = dict.fromkeys(self.kaldi_io_feats.keys(), 'global')
        utt2spk_path = os.path.join(file_dir, "utt2spk")
        if os.path.exists(utt2spk_path):
            with open(utt2spk_path, "r") as utt2spk:
                for record in utt2spk.readlines():
                    key, spk = record.strip().split(" ", 1)
                    self.speakers[key] = spk

        self.entries = [(key, self.speakers[key]) for key in self.kaldi_io_feats.keys()]

        if not apply_sort_filter:
            return self

        logging.info("Sorting and filtering data, this is very slow, please be patient ...")

        def _num_frames(entry):
            # First dimension of the feature array stored under the entry's key.
            return self.kaldi_io_feats[entry[0]].shape[0]

        self.entries.sort(key=_num_frames)
        self.filter_sample_by_unk()
        self.filter_sample_by_input_length()
        self.filter_sample_by_output_length()
        return self
Пример #5
0
    def __init__(self,
                 model_path,
                 trials_path,
                 scp_path,
                 batch_size=1,
                 use_gpu=False,
                 mgr=None,
                 processes=None,
                 check=False):
        """Store the configuration, open the scp and read the trial list.

        ``mgr`` must provide ``dict()`` (it is called unconditionally to create
        ``self.shared_scores``). Lines of ``trials_path`` that do not split
        into exactly three whitespace-separated fields are skipped; the rest
        are kept as ``(utt1, utt2, is_target)`` string tuples.

        Raises:
            Exception: if ``processes`` is given and negative.
        """
        if processes is not None and processes < 0:
            raise Exception('`processes` should be a positive integer.')

        self.check = check
        self.model_path = model_path
        self.scp_path = scp_path
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        if processes is None:
            self.processes = None
        else:
            self.processes = int(processes)
        self.trials = []
        self.data = load_scp(scp_path)
        self.shared_scores = mgr.dict()
        self.miss = 0

        with open(trials_path, 'r') as trials_file:
            for raw_line in trials_file:
                fields = raw_line.strip().split()
                if len(fields) == 3:
                    self.trials.append(tuple(fields))
        self.tot = len(self.trials)
Пример #6
0
def scp2array_dic(scp_path,
                  array_dic=None,
                  ark_path=None,
                  compression_method=None,
                  append=False):
    """
    Read arrays through an scp index, or write arrays to an ark plus scp.

    With ``array_dic`` left as None the scp is opened for reading; otherwise
    the given arrays are written.
    :param scp_path: filepath of scp
    :param array_dic: dic of array to write; None selects read mode
    :param ark_path: filepath of ark, default is scp_path.replace('.scp', '.ark')
    :param compression_method: compression method, default=None,
                kAutomaticMethod=1, kSpeechFeature=2,
                kTwoByteAuto=3,kTwoByteSignedInteger=4, kOneByteAuto=5,
                kOneByteUnsignedInteger=6, kOneByteZeroOne=7
    :param append: passed through to kaldiio.save_ark
    :return: result of kaldiio.load_scp in read mode, None in write mode
    """
    # Read mode: nothing to write, just hand back the loader.
    if array_dic is None:
        return kaldiio.load_scp(scp_path)

    # Write mode: derive the ark name from the scp name unless one was given.
    target_ark = scp_path.replace(".scp", ".ark") if ark_path is None else ark_path
    kaldiio.save_ark(
        ark=target_ark,
        array_dict=array_dic,
        scp=scp_path,
        compression_method=compression_method,
        append=append,
    )
    return None
Пример #7
0
def test_write_read_multiark(tmpdir, endian, dtype):
    """Check that one scp can index entries stored in two different ark files.

    The first save creates ``b.scp`` pointing into ``a.ark``; the second save
    writes ``b.ark`` and appends its index lines to the already existing
    ``b.scp`` through an open file object.
    """
    workdir = tmpdir.mkdir('test')
    scp_file = workdir.join('b.scp').strpath

    a = np.random.rand(1000, 120).astype(dtype)
    b = np.random.rand(10, 120).astype(dtype)
    origin = {u'Ï,é,à': a, u'あいうえお': b}

    kaldiio.save_ark(workdir.join('a.ark').strpath,
                     origin,
                     scp=scp_file,
                     endian=endian)

    c = np.random.rand(1000, 120).astype(dtype)
    d = np.random.rand(10, 120).astype(dtype)
    origin[u'c'] = c
    origin[u'd'] = d

    # Append to the scp written above instead of replacing it.
    with io.open(scp_file, 'a', encoding='utf-8') as scp_handle:
        kaldiio.save_ark(workdir.join('b.ark').strpath,
                         origin,
                         scp=scp_handle,
                         endian=endian)

    d5 = dict(kaldiio.load_scp(scp_file, endian=endian).items())
    _compare(d5, origin)
Пример #8
0
    def preprocess_data(self, file_path, apply_sort_filter=True):
        """Build ``self.entries`` as a list of ``(feat_key, speaker)`` tuples.

        Loads ``feats.scp`` from ``file_path``. Every key starts with the
        speaker ``'global'``; an optional ``utt2spk`` file ("key speaker" per
        line) overrides that. When ``apply_sort_filter`` is true the entries
        are sorted by the first dimension of their features and only those
        whose length lies in ``range(*hparams.input_length_range[:2])`` are
        kept. Returns ``self``.
        """
        logging.info(
            "Loading kaldi-format feats.scp and utt2spk (optional) from {}".format(file_path))
        self.kaldi_io_feats = kaldiio.load_scp(os.path.join(file_path, "feats.scp"))

        # default speaker is 'global'; utt2spk entries take precedence when the file exists
        self.speakers = dict.fromkeys(self.kaldi_io_feats.keys(), 'global')
        utt2spk_path = os.path.join(file_path, "utt2spk")
        if os.path.exists(utt2spk_path):
            with open(utt2spk_path, "r") as utt2spk:
                for record in utt2spk.readlines():
                    key, spk = record.strip().split(" ", 1)
                    self.speakers[key] = spk

        self.entries = [(key, self.speakers[key]) for key in self.kaldi_io_feats.keys()]

        if not apply_sort_filter:
            return self

        logging.info("Sorting and filtering data, this is very slow, please be patient ...")

        def _num_frames(entry):
            # First dimension of the feature array stored under the entry's key.
            return self.kaldi_io_feats[entry[0]].shape[0]

        def _length_allowed(entry):
            # Same membership test as before: lower bound inclusive, upper bound exclusive.
            return _num_frames(entry) in range(self.hparams.input_length_range[0],
                                               self.hparams.input_length_range[1])

        self.entries.sort(key=_num_frames)
        # filter length of frames
        self.entries = [entry for entry in self.entries if _length_allowed(entry)]
        return self
Пример #9
0
def main():
    """Run the saved model over an input ark and write log-softmax outputs.

    Reads features from ``args.input_rs``, looks up the auxiliary vector for
    each key in ``args.aux_scp``, forwards both through the model stored at
    ``<args.model_dir>/model.pickle`` and writes the transposed
    log-softmax result to ``args.forward_ark`` under the same key.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    logging.info("CUDA_VISIBLE_DEVICES=" + os.environ.get("CUDA_VISIBLE_DEVICES", ""))
    logging.info("HOST=" + os.environ.get("HOST", ""))
    logging.info("SLURM_JOB_ID=" + os.environ.get("SLURM_JOB_ID", ""))

    model_dir = Path(args.model_dir)
    aux_scp = kaldiio.load_scp(args.aux_scp)
    # NOTE: torch.load unpickles the file; only point model_dir at trusted models.
    model = torch.load(model_dir / "model.pickle", map_location="cpu")
    model.eval()
    with torch.no_grad(), open(args.forward_ark, "wb") as f:
        for key, feat in kaldi_io.read_mat_ark(args.input_rs):
            aux = torch.from_numpy(aux_scp[key])
            logging.info("input: key={} feat={} aux={}".format(key, feat.shape, aux.shape))
            # feat is (time, freq) shape
            x = torch.from_numpy(feat.T).unsqueeze(0)
            if x.shape[2] < args.min_time_width:
                remain = args.min_time_width - x.shape[2] + 1
                # Sizes passed to torch.zeros must be integers; "/" yields a
                # float in Python 3, so use floor division for each side.
                half = remain // 2
                lpad = torch.zeros(1, x.shape[1], half)
                rpad = torch.zeros(1, x.shape[1], half)
                x = torch.cat((lpad, x, rpad), dim=2)

            n_aux = aux.shape[0]
            # take the last or the center ivector frame
            if args.use_last_ivector:
                aux = aux[-1].unsqueeze(0)
            else:
                aux = aux[n_aux // 2].unsqueeze(0)
            # forward
            y, _ = model(x, aux)
            y = torch.nn.functional.log_softmax(y, dim=1).squeeze(0)
            logging.info("output: {}".format(y.shape))
            kaldi_io.write_mat(f, y.numpy().T, key)
Пример #10
0
 def __init__(self,
              feats_file,
              labels_file=None,
              root_dir=None,
              transform=None,
              target_transform=None,
              max_timestep=1024,
              max_label_len=128):
     """
     Open the feature scp and, optionally, the label file.

     Args:
         feats_file (string): scp file handed to kaldiio.load_scp.
         labels_file (string): file handed to self._load_text; when None,
             self.labels is set to None.
         root_dir (string): directory prepended to feats_file and
             labels_file; when None, both are used as given.
         transform: stored as self.transform.
         target_transform: stored as self.target_transform.
         max_timestep: stored as self.max_timestep.
         max_label_len: stored as self.max_label_len.
     """
     def _resolve(name):
         # root_dir defaults to None and os.path.join(None, ...) raises
         # TypeError, so only join when a root directory was supplied.
         return name if root_dir is None else os.path.join(root_dir, name)

     self.feats = kaldiio.load_scp(_resolve(feats_file))
     if labels_file is not None:
         self.labels = self._load_text(_resolve(labels_file))
     else:
         self.labels = None
     self.utts = list(self.feats.keys())
     self.transform = transform
     self.target_transform = target_transform
     self.max_timestep = max_timestep
     self.max_label_len = max_label_len
     # These will contain indices over the original dataset. The indices of
     # the safe samples will go into _safe_indices and similarly for unsafe
     # samples.
     self._safe_indices = []
     self._unsafe_indices = []
Пример #11
0
    def __init__(self,
                 utt_path,
                 tri_path,
                 scp_path,
                 anchor_size,
                 buffer_size,
                 use_gpu=True):
        """Open the scp, count lines of both list files and keep read handles.

        ``self.tot_utts`` and ``self.tot_line_num`` hold the number of lines
        in ``utt_path`` and ``tri_path``. ``self.utt_handler`` and
        ``self.tris_handler`` are left open on purpose; nothing in this
        constructor closes them.
        """
        self.datas = load_scp(scp_path)
        self.anchor_size = anchor_size
        self.utts = []
        self.use_gpu = use_gpu
        self.buffer_size = buffer_size

        # First pass over the utterance list only counts its lines.
        with open(utt_path, 'r') as utt_file:
            self.tot_utts = sum(1 for _ in utt_file)
        self.buffer = {}
        self.utt_handler = open(utt_path, 'r')

        # Same for the triplet list.
        with open(tri_path, 'r') as tri_file:
            self.tot_line_num = sum(1 for _ in tri_file)

        self.tris_handler = open(tri_path, 'r')
        self.tris = []
Пример #12
0
def test_write_read_int32_vector(tmpdir, endian):
    """Round-trip two int32 vectors through save_ark and three read paths.

    The data is read back from the ark by path, through the scp, and from an
    already opened binary file object; each result is compared with the input.
    """
    workdir = tmpdir.mkdir('test')
    ark_file = workdir.join('a.ark').strpath
    scp_file = workdir.join('b.scp').strpath

    a = np.random.randint(1, 128, 10, dtype=np.int32)
    b = np.random.randint(1, 128, 10, dtype=np.int32)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(ark_file, origin, scp=scp_file, endian=endian)

    from_ark_path = dict(kaldiio.load_ark(ark_file, endian=endian))
    from_scp = dict(kaldiio.load_scp(scp_file, endian=endian).items())
    with io.open(ark_file, 'rb') as fd:
        from_ark_fd = dict(kaldiio.load_ark(fd, endian=endian))

    for loaded in (from_ark_path, from_scp, from_ark_fd):
        _compare(loaded, origin)
Пример #13
0
def test_write_read(tmpdir, shape1, shape2, endian, dtype, max_cache_fd):
    """Round-trip two random arrays through save_ark and three read paths.

    The data is read back from the ark by path, through the scp (with the
    given ``max_cache_fd``), and from an opened binary file object.
    """
    workdir = tmpdir.mkdir("test")
    ark_file = workdir.join("a.ark").strpath
    scp_file = workdir.join("b.scp").strpath

    a = np.random.rand(*shape1).astype(dtype)
    b = np.random.rand(*shape2).astype(dtype)
    origin = {"Ï,é,à": a, "あいうえお": b}
    kaldiio.save_ark(ark_file, origin, scp=scp_file, endian=endian)

    from_ark_path = dict(kaldiio.load_ark(ark_file, endian=endian))
    scp_loader = kaldiio.load_scp(scp_file, endian=endian, max_cache_fd=max_cache_fd)
    from_scp = dict(scp_loader.items())
    with io.open(ark_file, "rb") as fd:
        from_ark_fd = dict(kaldiio.load_ark(fd, endian=endian))

    for loaded in (from_ark_path, from_scp, from_ark_fd):
        _compare(loaded, origin)
Пример #14
0
def _get_feats_scp_loader(feats_scp):
    """Pick a loader for ``feats_scp`` by inspecting its first line.

    The first line must split into exactly two whitespace-separated tokens;
    the second one decides the loader:

    * ``/path/x.ark:index``  -> ``kaldiio.load_scp``
    * ``/path/x.h5:feats``   -> ``HDF5ScpLoader``
    * ``/path/x.h5``         -> ``HDF5ScpLoader``
    * ``/path/x.npy``        -> ``NpyScpLoader``

    Raises:
        ValueError: for any other extension.
    """
    # read the first line of feats.scp file
    with open(feats_scp) as scp_file:
        first_line = scp_file.readlines()[0]
    _utt_id, location = first_line.replace("\n", "").split()

    if ":" not in location:
        # plain path, nothing after the file name
        if location.endswith(".h5"):
            # hdf5 case without path in hdf5: utt_id_1 /path/to/utt_id_1.h5
            return HDF5ScpLoader(feats_scp)
        if location.endswith(".npy"):
            # npy case: utt_id_1 /path/to/utt_id_1.npy
            return NpyScpLoader(feats_scp)
        raise ValueError("Not supported feats.scp type.")

    container, _entry = location.split(":")
    if container.endswith(".ark"):
        # kaldi-ark case: utt_id_1 /path/to/utt_id_1.ark:index
        return kaldiio.load_scp(feats_scp)
    if container.endswith(".h5"):
        # hdf5 case with path in hdf5: utt_id_1 /path/to/utt_id_1.h5:feats
        return HDF5ScpLoader(feats_scp)
    raise ValueError("Not supported feats.scp type.")
Пример #15
0
def test_write_read_multiark(tmpdir, endian, dtype):
    """Check that one scp can index entries stored in two different ark files.

    ``b.scp`` is first created alongside ``a.ark``; a second save then writes
    ``b.ark`` and appends its index lines to the same scp via a file object.
    """
    workdir = tmpdir.mkdir("test")
    scp_file = workdir.join("b.scp").strpath
    first_ark = workdir.join("a.ark").strpath
    second_ark = workdir.join("b.ark").strpath

    a = np.random.rand(1000, 120).astype(dtype)
    b = np.random.rand(10, 120).astype(dtype)
    origin = {"Ï,é,à": a, "あいうえお": b}

    kaldiio.save_ark(first_ark, origin, scp=scp_file, endian=endian)

    c = np.random.rand(1000, 120).astype(dtype)
    d = np.random.rand(10, 120).astype(dtype)
    origin["c"] = c
    origin["d"] = d

    # Open in append mode so the index lines from the first save are kept.
    with io.open(scp_file, "a", encoding="utf-8") as scp_handle:
        kaldiio.save_ark(second_ark, origin, scp=scp_handle, endian=endian)

    d5 = dict(kaldiio.load_scp(scp_file, endian=endian).items())
    _compare(d5, origin)
Пример #16
0
def test_write_read_compress(tmpdir, compression_method, endian):
    """Round-trip compressed float32 matrices and compare with tolerance 1e-1.

    The data is read back from the ark by path, through the scp, and from an
    opened binary file object.
    """
    workdir = tmpdir.mkdir('test')
    ark_file = workdir.join('a.ark').strpath
    scp_file = workdir.join('b.scp').strpath

    a = np.random.rand(1000, 120).astype(np.float32)
    b = np.random.rand(10, 120).astype(np.float32)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(ark_file,
                     origin,
                     scp=scp_file,
                     compression_method=compression_method,
                     endian=endian)

    from_ark_path = dict(kaldiio.load_ark(ark_file, endian=endian))
    from_scp = dict(kaldiio.load_scp(scp_file, endian=endian).items())
    with io.open(ark_file, 'rb') as fd:
        from_ark_fd = dict(kaldiio.load_ark(fd, endian=endian))

    for loaded in (from_ark_path, from_scp, from_ark_fd):
        _compare_allclose(loaded, origin, atol=1e-1)
Пример #17
0
def test_write_read(tmpdir, shape1, shape2, endian, dtype):
    """Round-trip two random arrays through save_ark and three read paths.

    The data is read back from the ark by path, through the scp, and from an
    opened binary file object; each result is compared with the input.
    """
    workdir = tmpdir.mkdir('test')
    ark_file = workdir.join('a.ark').strpath
    scp_file = workdir.join('b.scp').strpath

    a = np.random.rand(*shape1).astype(dtype)
    b = np.random.rand(*shape2).astype(dtype)
    origin = {u'Ï,é,à': a, u'あいうえお': b}
    kaldiio.save_ark(ark_file, origin, scp=scp_file, endian=endian)

    from_ark_path = dict(kaldiio.load_ark(ark_file, endian=endian))
    from_scp = dict(kaldiio.load_scp(scp_file, endian=endian).items())
    with io.open(ark_file, 'rb') as fd:
        from_ark_fd = dict(kaldiio.load_ark(fd, endian=endian))

    for loaded in (from_ark_path, from_scp, from_ark_fd):
        _compare(loaded, origin)
Пример #18
0
 def __init__(self, wav_scp_path, text_path, tgt_dict, shuffle=False):
     """Open the wav scp and load the text file.

     ``load_text(text_path, tgt_dict)`` supplies ``self.text_dict`` and
     ``self.sizes``; ``self.id2key`` lists the keys of ``self.text_dict``.
     """
     super().__init__()
     self.tgt_dict = tgt_dict
     self.shuffle = shuffle
     self._sample_rate = None

     self.wav_dict = kaldiio.load_scp(wav_scp_path)
     text_dict, sizes = load_text(text_path, tgt_dict)
     self.text_dict = text_dict
     self.sizes = sizes
     self.id2key = list(text_dict.keys())
Пример #19
0
 def __init__(self, config=None):
     """Open the segmented wav scp and set up optional spectral augmentation.

     ``self.spectral_aug`` is a ``SpecAugment`` built from
     ``hparams.spectral_augmentation`` when that setting is not None,
     otherwise None.
     """
     super().__init__(config=config)
     self.seg_wav = kaldiio.load_scp(self.hparams.wav_scp,
                                     segments=self.hparams.seg_file)
     augmentation_enabled = self.hparams.spectral_augmentation is not None
     self.spectral_aug = (SpecAugment(self.hparams.spectral_augmentation)
                          if augmentation_enabled else None)
Пример #20
0
 def __init__(self, storage_path: Pathlike, *args, **kwargs):
     """Open ``storage_path`` with ``kaldiio.load_scp``.

     kaldiio is imported here rather than at module level, after checking
     that it is installed. Extra positional and keyword arguments are
     accepted and ignored.

     Raises:
         ValueError: if the ``kaldiio`` module is not available.
     """
     kaldiio_installed = is_module_available('kaldiio')
     if not kaldiio_installed:
         raise ValueError(
             "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
     import kaldiio
     super().__init__()
     self.storage_path = storage_path
     scp_location = str(self.storage_path)
     self.storage = kaldiio.load_scp(scp_location)
Пример #21
0
    def _read_scp(self, input_file):
        """Load every entry of ``input_file`` into a plain dict and return it."""
        scp = kaldiio.load_scp(input_file)
        # Indexing the loader for each key copies all entries into the dict.
        return {key: scp[key] for key in scp}
Пример #22
0
def pipe_wav_loader(path, float_dtype):
    """Wrap a pipe-style wav scp in ``AdapterForSoundScpReader``.

    Expected file layout, one command per utterance::

        utterance_id_A cat a.wav |
        utterance_id_B cat b.wav |
    """
    # NOTE(kamo): I don't think this case is practical
    # because subprocess takes much times due to fork().

    # NOTE(kamo): kaldiio doesn't normalize the signal.
    return AdapterForSoundScpReader(kaldiio.load_scp(path), float_dtype)
Пример #23
0
def test_write_read_ascii(tmpdir):
    """Round-trip two float32 matrices through a text-mode ark and its scp."""
    workdir = tmpdir.mkdir("test")
    ark_file = workdir.join("a.ark").strpath
    scp_file = workdir.join("a.scp").strpath

    a = np.random.rand(10, 10).astype(np.float32)
    b = np.random.rand(5, 35).astype(np.float32)
    origin = {"Ï,é,à": a, "あいうえお": b}
    kaldiio.save_ark(ark_file, origin, scp=scp_file, text=True)

    from_ark = dict(kaldiio.load_ark(ark_file))
    from_scp = dict(kaldiio.load_scp(scp_file).items())
    for loaded in (from_ark, from_scp):
        _compare_allclose(loaded, origin)
    def __init__(self, data_dir, wnd_size=170, rnd_chunks=True, apply_vad=True, cache=0):
        """Open ``feats.scp`` (and optionally ``vad.scp``) from ``data_dir``.

        Args:
            data_dir: directory containing the scp files.
            wnd_size: stored as ``self.wnd_size``; ``self.hop_size`` is half of it.
            rnd_chunks: stored as ``self.rnd_chunks``.
            apply_vad: when true, ``vad.scp`` is loaded and
                ``self._remove_silent_utter()`` is called.
            cache: not used anywhere in this constructor.
        """
        # data path
        self.path = data_dir
        self.wnd_size = wnd_size
        self.hop_size = wnd_size // 2
        self.rnd_chunks = rnd_chunks
        self.apply_vad = apply_vad
        self.num_chunks = 128  # int(self._average_vad() // self.hop_size)
        print('[INFO] number of windows sampled from every file: %d' % self.num_chunks)

        feats_scp = os.path.join(self.path, 'feats.scp')
        self.feat_reader = kaldiio.load_scp(feats_scp)
        self.utter_list = list(self.feat_reader.keys())

        if not self.apply_vad:
            print('[INFO] do not apply VAD, expect non silent files are fed')
            return

        vadf = os.path.join(self.path, 'vad.scp')
        self.vadscp = kaldiio.load_scp(vadf)
        print('[INFO] applying VAD: %s' % vadf)
        self._remove_silent_utter()
Пример #25
0
    def load_from_scp(self, options, filename=None):
        """Load features and encoded task labels, then split them.

        Features come from ``filename`` or, when that is falsy, from
        ``self.expdir / "feats.scp"``. Labels are collected per section of
        ``self.dataconf`` from the file named by its ``tasks`` option. Both
        are passed through ``check_errors`` before ``make_splits`` produces
        the return value.
        """
        scp_source = filename or self.expdir / "feats.scp"
        features = dict(kaldiio.load_scp(str(scp_source)))

        labels = {}
        for speaker in self.dataconf.sections():
            tasks_file = Path(self.dataconf[speaker].get("tasks"))
            # load_tasks output is transposed into (uttid, task) pairs
            for uttid, task in zip(*self.load_tasks(tasks_file)):
                labels[uttid] = self.coder.encode(read_task(task))

        features, labels = self.check_errors(features, labels, options.errors)
        return self.make_splits(features, labels, options)
Пример #26
0
    def __init__(
        self,
        wav_scp,
        segments=None,
        audio_length_threshold=None,
        return_utt_id=False,
        return_sampling_rate=False,
        allow_cache=False,
    ):
        """Initialize dataset.

        Args:
            wav_scp (str): Kaldi-style wav.scp file.
            segments (str): Kaldi-style segments file.
            audio_length_threshold (int): Entries whose audio is not longer
                than this are dropped; None disables the filter.
            return_utt_id (bool): Whether to return utterance id.
            return_sampling_rate (bool): Whether to return sampling rate.
            allow_cache (bool): Whether to allow cache of the loaded files.

        """
        # load scp as lazy dict
        audio_loader = kaldiio.load_scp(wav_scp, segments=segments)
        audio_keys = list(audio_loader.keys())

        # filter by threshold
        if audio_length_threshold is not None:
            num_before = len(audio_keys)
            # each loader value unpacks into a pair; the second item is the audio array
            audio_keys = [
                key
                for key, (_, audio) in zip(audio_keys, audio_loader.values())
                if audio.shape[0] > audio_length_threshold
            ]
            num_after = len(audio_keys)
            if num_before != num_after:
                logging.warning(
                    f"Some files are filtered by audio length threshold "
                    f"({num_before} -> {num_after}).")

        self.audio_loader = audio_loader
        self.utt_ids = audio_keys
        self.return_utt_id = return_utt_id
        self.return_sampling_rate = return_sampling_rate
        self.allow_cache = allow_cache

        if allow_cache:
            # NOTE(kan-bayashi): Manager is need to share memory in dataloader with num_workers > 0
            self.manager = Manager()
            self.caches = self.manager.list()
            # one empty placeholder per utterance
            self.caches += [()] * len(self.utt_ids)
Пример #27
0
 def __init__(self, scp_path, trials, use_gpu=False, pre_load=False):
     """Open the scp and optionally copy every entry into tensors.

     With ``pre_load`` set, ``self.data`` maps each key of the scp to a
     tensor built from its array, moved to the GPU when ``use_gpu`` is set.
     """
     super(KaldiTester, self).__init__()
     self.datas = load_scp(scp_path)
     self.tot = len(self.datas)
     self.use_gpu = use_gpu
     self.trials = trials
     self.miss = 0
     self.pre_load = pre_load
     if not self.pre_load:
         return

     self.data = {}
     for key in self.datas:
         tensor = torch.from_numpy(self.datas.get(key))
         self.data[key] = tensor
         if self.use_gpu:
             self.data[key] = tensor.cuda()
Пример #28
0
 def process(self, ):
     """Normalize every feature, save it with its label and list it in the manifest.

     Each entry of ``self.feat_scp`` is shifted by ``self.mean`` and scaled by
     ``self.istd``, stored together with its label index as one ``.npy`` file
     in ``self.store_folder``, and the file's path is written as one line of
     ``self.manifest``.
     """
     label_dict = self.get_label_dict()
     data = kaldiio.load_scp(self.feat_scp)
     with open(self.manifest, 'w') as manifest:
         for key in data:
             normalized = (data[key] - self.mean) * self.istd
             label = Label2Indx[label_dict[key]]
             # uuid prefix keeps file names unique
             file_name = str(uuid.uuid1()) + '__' + key + '.npy'
             save_path = self.store_folder + '/' + file_name
             np.save(save_path, {'feats': normalized, 'label': label})
             manifest.write(save_path + '\n')
Пример #29
0
 def __init__(self, scp_path, trial_path, use_gpu=True):
     """Read the trial list and preload every utterance it mentions.

     Each line of ``trial_path`` must split into ``utt1 utt2 label``; the
     label is stored as int. ``self.utt_buffer`` maps every utterance seen in
     the trials to a float tensor, moved to the GPU when ``use_gpu`` is set.
     """
     self.datas = load_scp(scp_path)
     self.trials = []
     self.utt_buffer = {}

     needed_utts = set()
     with open(trial_path, 'r') as trial_file:
         for raw_line in trial_file:
             utt1, utt2, label = raw_line.strip().split()
             needed_utts.update((utt1, utt2))
             self.trials.append((utt1, utt2, int(label)))

     for utt in needed_utts:
         xvec = torch.from_numpy(self.datas.get(utt)).float()
         if use_gpu:
             xvec = xvec.cuda()
         self.utt_buffer[utt] = xvec
Пример #30
0
def test_write_load_ascii(tmpdir):
    """Round-trip two float32 matrices through a text-mode ark and its scp."""
    workdir = tmpdir.mkdir('test')
    ark_file = workdir.join('a.ark').strpath
    scp_file = workdir.join('a.scp').strpath

    a = np.random.rand(10, 10).astype(np.float32)
    b = np.random.rand(5, 35).astype(np.float32)
    origin = {'a': a, 'b': b}
    kaldiio.save_ark(ark_file, origin, scp=scp_file, text=True)

    from_ark = dict(kaldiio.load_ark(ark_file))
    from_scp = dict(kaldiio.load_scp(scp_file).items())
    for loaded in (from_ark, from_scp):
        _compare_allclose(loaded, origin)