Example #1
def load_inputs_and_targets(batch,
                            use_speaker_embedding=False,
                            use_second_target=False):
    """Load inputs and targets from list of dicts (json)

    :param list batch: list of dict which is subset of loaded data.json
    :param bool use_speaker_embedding: whether to load speaker embedding vector
    :param bool use_second_target: whether to load second target vector
    :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
    :rtype: list of int ndarray
    :return: list of target feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of speaker embedding vectors
    :rtype: list of float ndarray
    :return: list of second target feature sequences [(T_1, V), (T_2, V), ..., (T_B, V)]
    :rtype: list of float ndarray
    """
    # load input token id sequences and target acoustic feature sequences
    xs = [b[1]['output'][0]['tokenid'].split() for b in batch]
    ys = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]

    # get index of non-zero length samples
    nonzero_idx = list(filter(lambda i: len(xs[i]) > 0, range(len(xs))))
    if len(nonzero_idx) != len(xs):
        logging.warning(
            'Input sequences include empty tokenid (batch %d -> %d).' %
            (len(xs), len(nonzero_idx)))

    # sort in input length
    nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))

    # remove zero-length samples
    xs = [
        np.fromiter(map(int, xs[i]), dtype=np.int64)
        for i in nonzero_sorted_idx
    ]
    ys = [ys[i] for i in nonzero_sorted_idx]

    # load second target for CBHG
    if use_second_target:
        spcs = [kaldi_io_py.read_mat(b[1]['input'][1]['feat']) for b in batch]
        spcs = [spcs[i] for i in nonzero_sorted_idx]
    else:
        spcs = None

    # load speaker embedding
    if use_speaker_embedding:
        spembs = [
            kaldi_io_py.read_vec_flt(b[1]['input'][1]['feat']) for b in batch
        ]
        spembs = [spembs[i] for i in nonzero_sorted_idx]
    else:
        spembs = None

    return xs, ys, spembs, spcs
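
A minimal usage sketch, assuming batch is a list of (utt_id, info) pairs taken from a loaded data.json and that the referenced Kaldi feature files exist (the data.json path below is hypothetical):

import json

# hypothetical path; any ESPnet-style data.json with 'input'/'output' entries works
with open('dump/train/data.json', 'rb') as f:
    utts = json.load(f)['utts']

# one minibatch worth of (utt_id, info) pairs
batch = list(utts.items())[:32]

xs, ys, spembs, spcs = load_inputs_and_targets(batch, use_speaker_embedding=True)
# xs: input token id arrays, ys: target acoustic feature matrices,
# spembs: speaker embedding vectors, spcs: None (use_second_target is False)
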
Example #2
def converter_kaldi(batch, device=None, use_speaker_embedding=None):
    # batch only has one minibatch utterance, which is specified by batch[0]
    batch = batch[0]
    for data in batch:
        feat_asr = kaldi_io_py.read_mat(data[1]['input'][0]['feat'])
        feat_tts = kaldi_io_py.read_mat(data[1]['input'][1]['feat'])
        data[1]['feat_asr'] = feat_asr
        data[1]['feat_tts'] = feat_tts
        if use_speaker_embedding is not None:
            feat_spembs = kaldi_io_py.read_vec_flt(data[1]['input'][2]['feat'])
            data[1]['feat_spembs'] = feat_spembs

    return batch
Example #3
def load_inputs_and_targets(batch):
    """Function to load inputs and targets from list of dicts

    :param list batch: list of dict which is subset of loaded data.json
    :return: list of input feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
    :rtype: list of int ndarray
    """
    # load acoustic features and target sequence of token ids
    xs = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]
    ys = [b[1]['output'][0]['tokenid'].split() for b in batch]

    # get index of non-zero length samples
    nonzero_idx = filter(lambda i: len(ys[i]) > 0, range(len(xs)))
    # sort in input lengths
    nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
    if len(nonzero_sorted_idx) != len(xs):
        logging.warning('Target sequences include empty tokenid (batch %d -> %d).' % (
            len(xs), len(nonzero_sorted_idx)))

    # remove zero-length samples
    xs = [xs[i] for i in nonzero_sorted_idx]
    ys = [np.fromiter(map(int, ys[i]), dtype=np.int64) for i in nonzero_sorted_idx]

    return xs, ys
Example #4
def converter_kaldi(batch, device=None):
    # batch only has one minibatch utterance, which is specified by batch[0]
    batch = batch[0]
    for data in batch:
        feat = kaldi_io_py.read_mat(data[1]['input'][0]['feat'])
        data[1]['feat'] = feat

    return batch
Example #5
def load_inputs_and_targets(batch,
                            sort_in_outputs=False,
                            use_speaker_embedding=False):
    """Function to load inputs and targets from list of dicts

    :param list batch: list of dict which is subset of loaded data.json
    :param bool sort_in_outputs: whether to sort in output lengths
    :param bool use_speaker_embedding: whether to load speaker embedding vector
    :return: list of input feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
    :rtype: list of int ndarray
    :return: list of speaker embedding vectors (only if use_speaker_embedding = True)
    :rtype: list of float ndarray
    """

    # load acoustic features and target sequence of token ids
    xs = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]
    ys = [b[1]['output'][0]['tokenid'].split() for b in batch]

    # get index of non-zero length samples
    nonzero_idx = filter(lambda i: len(ys[i]) > 0, range(len(xs)))
    if sort_in_outputs:
        # sort in output lengths
        nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(ys[i]))
    else:
        # sort in input lengths
        nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
    if len(nonzero_sorted_idx) != len(xs):
        logging.warning(
            'Target sequences include empty tokenid (batch %d -> %d).' %
            (len(xs), len(nonzero_sorted_idx)))

    # remove zero-length samples
    xs = [xs[i] for i in nonzero_sorted_idx]
    ys = [
        np.fromiter(map(int, ys[i]), dtype=np.int64)
        for i in nonzero_sorted_idx
    ]

    # load speaker embedding
    if use_speaker_embedding:
        spembs = [
            kaldi_io_py.read_vec_flt(b[1]['input'][1]['feat']) for b in batch
        ]
        spembs = [spembs[i] for i in nonzero_sorted_idx]
        return xs, ys, spembs
    else:
        return xs, ys
Example #6
    def __call__(self, batch, is_training=True):
        # batch is a list containing a single minibatch
        assert len(batch) == 1
        batch = batch[0]

        # get eos
        eos = str(int(batch[0][1]['output'][0]['shape'][1]) - 1)

        # get target features and input character sequence
        xs = [b[1]['output'][0]['tokenid'].split() + [eos] for b in batch]
        ys = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]

        # remove empty sequences and sort by input length (descending)
        filtered_idx = filter(lambda i: len(xs[i]) > 0, range(len(ys)))
        sorted_idx = sorted(filtered_idx, key=lambda i: -len(xs[i]))
        xs = [np.fromiter(map(int, xs[i]), dtype=np.int64) for i in sorted_idx]
        ys = [ys[i] for i in sorted_idx]

        # get list of lengths (must be tensor for DataParallel)
        ilens = torch.from_numpy(
            np.fromiter((x.shape[0] for x in xs), dtype=np.int64))
        olens = torch.from_numpy(
            np.fromiter((y.shape[0] for y in ys), dtype=np.int64))

        # perform padding and convert to tensor
        xs = torch.from_numpy(pad_ndarray_list(xs, 0)).long()
        ys = torch.from_numpy(pad_ndarray_list(ys, 0)).float()

        # make labels for stop prediction
        labels = ys.new(ys.size(0), ys.size(1)).zero_()
        for i, l in enumerate(olens):
            labels[i, l - 1:] = 1

        # TODO(kan-bayashi): need to be fixed in pytorch v4
        if torch_is_old:
            xs = Variable(xs, volatile=not is_training)
            ys = Variable(ys, volatile=not is_training)
            labels = Variable(labels, volatile=not is_training)

        if sum(self.device) >= 0:
            xs = xs.cuda()
            ys = ys.cuda()
            labels = labels.cuda()

        if self.return_targets:
            return xs, ilens, ys, labels, olens
        else:
            return xs, ilens, ys
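
The converter above relies on a pad_ndarray_list helper that is not shown here; a minimal sketch of such a helper, assuming it pads each array along the first (time) axis with a constant value up to the longest length in the batch and stacks the result:

import numpy as np

def pad_ndarray_list(batch, pad_value):
    """Pad a list of ndarrays along axis 0 and stack them into a single array."""
    maxlen = max(x.shape[0] for x in batch)
    padded = np.full((len(batch), maxlen) + batch[0].shape[1:], pad_value,
                     dtype=batch[0].dtype)
    for i, x in enumerate(batch):
        padded[i, :x.shape[0]] = x
    return padded
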
Example #7
File: lazy_io.py Project: weiwchu/espnet
def read_mat_scp(file_or_fd):
    """ generator(key,mat) = read_mat_scp(file_or_fd)
   Returns generator of (key,matrix) tuples, read according to kaldi scp.
   file_or_fd : scp, gzipped scp, pipe or opened file descriptor.

   Iterate the scp:
   for key,mat in kaldi_io.read_mat_scp(file):
     ...

   Read scp to a 'dictionary':
   d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) }
    """
    fd = open_or_fd(file_or_fd)
    try:
        for line in fd:
            (key, rxfile) = line.split(' '.encode(), 1)
            if rxfile[:2] == b'\0B':
                mat = _read_mat_binary(rxfile)
            else:
                mat = read_mat(rxfile)
            yield key.decode(), mat
    finally:
        if fd is not file_or_fd: fd.close()
Example #8
                fc += 1
                spk2gender[sp[0]] = 'f'
            else:
                continue
        else:
            if mc < MAX_MALE:
                mc += 1
                spk2gender[sp[0]] = 'm'
            else:
                continue

        spk2utt[sp[0]] = utts

        spk_feats = []
        for u in utts:
            utt_feat = kaldi_io_py.read_mat(feats[u])
            if utt_feat.shape[0] > 1:
                mean_feat = np.concatenate(
                    (np.mean(utt_feat, axis=0), np.var(utt_feat, axis=0)))
            else:
                mean_feat = utt_feat
            spk_feats.append(mean_feat)
        spk_feats = np.array(spk_feats)

        spk2featlen[sp[0]] = spk_feats.shape[0]
        #print(spk_feats.shape)
        X.append(spk_feats)

nspk = len(spk2gender.keys())
print("Number of speakers", nspk)
Example #9
def recog(args):
    '''Run recognition'''
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    os.environ["CHAINER_SEED"] = str(args.seed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # read training config
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    chainer.serializers.load_npz(args.model, model)

    # read rnnlm
    if args.rnnlm:
        rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(train_args.char_list), 650))
        chainer.serializers.load_npz(args.rnnlm, rnnlm)
    else:
        rnnlm = None

    if args.word_rnnlm:
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)

        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(word_dict), 650))
        chainer.serializers.load_npz(args.word_rnnlm, word_rnnlm)

        if rnnlm is not None:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    for name in recog_json.keys():
        feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
        logging.info('decoding ' + name)
        nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm)
        # get 1best and remove sos
        y_hat = nbest_hyps[0]['yseq'][1:]
        y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())

        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
        seq_true_text = "".join(seq_true).replace('<space>', ' ')
        logging.info("groundtruth[%s]: " + seq_true_text, name)
        logging.info("prediction [%s]: " + seq_hat_text, name)

        # copy old json info
        new_json[name] = dict()
        new_json[name]['utt2spk'] = recog_json[name]['utt2spk']

        # add 1-best recognition results to json
        logging.debug("dump token id")
        out_dic = dict()
        for _key in recog_json[name]['output'][0]:
            out_dic[_key] = recog_json[name]['output'][0][_key]

        # TODO(karita) make consistent to chainer as idx[0] not idx
        out_dic['rec_tokenid'] = " ".join([str(idx[0]) for idx in y_hat])
        logging.debug("dump token")
        out_dic['rec_token'] = " ".join(seq_hat)
        logging.debug("dump text")
        out_dic['rec_text'] = seq_hat_text

        new_json[name]['output'] = [out_dic]
        # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
        # add n-best recognition results with scores
        if args.beam_size > 1 and len(nbest_hyps) > 1:
            for i, hyp in enumerate(nbest_hyps):
                y_hat = hyp['yseq'][1:]
                seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) + ']'] \
                    = " ".join([str(idx[0]) for idx in y_hat])
                new_json[name]['rec_token' + '[' + '{:05d}'.format(i) +
                               ']'] = " ".join(seq_hat)
                new_json[name]['rec_text' + '[' + '{:05d}'.format(i) +
                               ']'] = seq_hat_text
                new_json[name]['score' + '[' + '{:05d}'.format(i) +
                               ']'] = hyp['score']

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_json
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #10
old_json_dir = 'dump/train_100/deltafalse/split_utt_spk'
new_feats_scp = 'data/erep_train_100/feats.scp'

new_json_dir = 'data/erep_train_100/json'

if not exists(new_json_dir):
    os.makedirs(new_json_dir)

print('Create feats dictionary...')
feats_dict = {}
with open(new_feats_scp) as f:
    for line in f.read().splitlines():
        sp = line.split()
        feats_dict[sp[0]] = {
            u'path': sp[1],
            u'shape': kaldi_io_py.read_mat(sp[1]).shape
        }
print('Done reading features!')

print('Reading data jsons...')
djsons = [x for x in os.listdir(old_json_dir) if x.endswith('.json')]
for jsfile in djsons:
    print('Reading ' + jsfile)
    with open(join(old_json_dir,
                   jsfile)) as f1, open(join(new_json_dir, jsfile), 'w') as f2:
        js = json.load(f1)
        utt_ids = js[u'utts'].keys()
        for k in utt_ids:
            js[u'utts'][k][u'input'][0][u'feat'] = feats_dict[k][u'path']
            js[u'utts'][k][u'input'][0][u'shape'] = feats_dict[k][u'shape']
        # write the updated json back out (assumed final step; not shown in the original excerpt)
        json.dump(js, f2)
Example #11
def recog(args):
    '''Run recognition'''
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    os.environ["CHAINER_SEED"] = str(args.seed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    chainer_load(args.model, model)

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(train_args.char_list), rnnlm_args.unit))
        chainer_load(args.rnnlm, rnnlm)
    else:
        rnnlm = None

    if args.word_rnnlm:
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)

        rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(word_dict), rnnlm_args.unit))
        chainer_load(args.word_rnnlm, word_rnnlm)

        if rnnlm is not None:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with chainer.no_backprop_mode():
        for idx, name in enumerate(js.keys(), 1):
            logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
            feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
            nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm)
            new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                               train_args.char_list)

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #12
def recog(args):
    '''Run recognition'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    torch_load(args.model, model)
    e2e.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
                feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                nbest_hyps = e2e.recognize(feat, args, train_args.char_list,
                                           rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)
    else:
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
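            # split the iterable into chunks of size n, padding the last chunk with fillvalue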
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data
        keys = list(js.keys())
        feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                names = [name for name in names if name]
                feats = [
                    kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                    for name in names
                ]
                nbest_hyps = e2e.recognize_batch(feats,
                                                 args,
                                                 train_args.char_list,
                                                 rnnlm=rnnlm)
                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp,
                                                       train_args.char_list)

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #13
def recog(args):
    '''Run recognition'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    torch_load(args.model, model)

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        if not args.word_dict:
            logging.error('word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)

        rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(lm_pytorch.RNNLM(len(word_dict), rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict, char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))

    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    with torch.no_grad():
        for name in recog_json.keys():
            feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
            nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm=rnnlm)
            # get 1best and remove sos
            y_hat = nbest_hyps[0]['yseq'][1:]
            y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())

            # print out decoding result
            seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
            seq_true = [train_args.char_list[int(idx)] for idx in y_true]
            seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
            seq_true_text = "".join(seq_true).replace('<space>', ' ')
            logging.info("groundtruth[%s]: " + seq_true_text, name)
            logging.info("prediction [%s]: " + seq_hat_text, name)

            # copy old json info
            new_json[name] = dict()
            new_json[name]['utt2spk'] = recog_json[name]['utt2spk']

            # added recognition results to json
            logging.debug("dump token id")
            out_dic = dict()
            for _key in recog_json[name]['output'][0]:
                out_dic[_key] = recog_json[name]['output'][0][_key]

            # TODO(karita) make consistent to chainer as idx[0] not idx
            out_dic['rec_tokenid'] = " ".join([str(idx) for idx in y_hat])
            logging.debug("dump token")
            out_dic['rec_token'] = " ".join(seq_hat)
            logging.debug("dump text")
            out_dic['rec_text'] = seq_hat_text

            new_json[name]['output'] = [out_dic]
            # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
            # add n-best recognition results with scores
            if args.beam_size > 1 and len(nbest_hyps) > 1:
                for i, hyp in enumerate(nbest_hyps):
                    y_hat = hyp['yseq'][1:]
                    seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                    seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                    new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) + ']'] = \
                        " ".join([str(idx) for idx in y_hat])
                    new_json[name]['rec_token' + '[' + '{:05d}'.format(i) + ']'] = " ".join(seq_hat)
                    new_json[name]['rec_text' + '[' + '{:05d}'.format(i) + ']'] = seq_hat_text
                    new_json[name]['score' + '[' + '{:05d}'.format(i) + ']'] = hyp['score']

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(json.dumps({'utts': new_json}, indent=4, sort_keys=True).encode('utf_8'))
Example #14
def encode(args):
    '''Get ASR encoded representations...probably for xvectors'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, odim_adv, train_args = get_model_conf(args.model,
                                                      args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args, odim_adv=odim_adv)
    model = Loss(e2e, train_args.mtlalpha)
    if train_args.rnnlm is not None:
        # set rnnlm. external rnnlm is used for recognition.
        model.predictor.rnnlm = rnnlm
    torch_load(args.model, model)
    e2e.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()

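    # pipe written matrices through Kaldi copy-feats to produce <feats_out>.ark and <feats_out>.scp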
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.feats_out, args.feats_out)

    if args.batchsize == 0:
        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp,
                                        'wb') as f, open(args.feats_in,
                                                         'rb') as f2:
                lines = f2.read().splitlines()
                for idx, line in enumerate(lines, 1):
                    line = line.strip().split()
                    name = line[0]
                    logging.info('(%d/%d) decoding ' + name, idx, len(lines))
                    feat = kaldi_io_py.read_mat(line[1])
                    rep = e2e.erep(feat)
                    logging.info('Rep shape: %s', rep.shape)
                    kaldi_io_py.write_mat(f, rep, name)
    else:
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # Create json object for batch processing
        logging.info("Creating json for batch processing...")
        js = {}
        with open(args.feats_in, 'rb') as f:
            lines = f.read().splitlines()
            for line in lines:
                line = line.strip().split()
                name = line[0]
                featpath = line[1]
                feat_shape = kaldi_io_py.read_mat(featpath).shape
                js[name] = {'feat': featpath, 'shape': feat_shape}

        # sort data
        logging.info("Sorting data for batch processing...")
        keys = list(js.keys())
        feat_lens = [js[key]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
                for names in grouper(args.batchsize, keys, None):
                    names = [name for name in names if name]
                    feats = [
                        kaldi_io_py.read_mat(js[name]['feat'])
                        for name in names
                    ]
                    reps, replens = e2e.erep_batch(feats)
                    print(reps.shape, replens)
                    for i, rep in enumerate(reps):
                        name = names[i]
                        kaldi_io_py.write_mat(f, rep, name)
Example #15
def recog(args):
    '''Run recognition'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, odim_adv, train_args = get_model_conf(args.model,
                                                      args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim_adv, train_args)
    model = Loss(e2e)
    torch_load(args.model, model)
    e2e.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
                feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                nbest_hyps = e2e.recognize(feat)
                new_js[name] = add_results_to_json(js[name], nbest_hyps)
    else:
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data
        keys = list(js.keys())
        feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                names = [name for name in names if name]
                feats = [
                    kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                    for name in names
                ]
                nbest_hyps = e2e.recognize_batch(feats)
                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp)

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #16
def recog(args):
    '''Run recognition'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)

    def cpu_loader(storage, location):
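        # map_location hook for torch.load: keep tensors on CPU regardless of where they were saved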
        return storage

    def remove_dataparallel(state_dict):
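        # strip the "module." prefix that torch.nn.DataParallel adds to parameter names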
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("module."):
                k = k[7:]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))

    # read rnnlm
    if args.rnnlm:
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), 650))
        rnnlm.load_state_dict(torch.load(args.rnnlm, map_location=cpu_loader))
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)

        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), 650))
        word_rnnlm.load_state_dict(
            torch.load(args.word_rnnlm, map_location=cpu_loader))
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    if not torch_is_old:
        torch.set_grad_enabled(False)

    new_json = {}
    for name in recog_json.keys():
        if args.input_tensor:
            feat = load_lua(recog_json[name]['input'][0]['feat']).numpy()
        else:
            feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
        nbest_hyps = e2e.recognize(feat,
                                   args,
                                   train_args.char_list,
                                   rnnlm=rnnlm)
        # get 1best and remove sos
        y_hat = nbest_hyps[0]['yseq'][1:]
        y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())

        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
        seq_true_text = "".join(seq_true).replace('<space>', ' ')
        logging.info("groundtruth[%s]: " + seq_true_text, name)
        logging.info("prediction [%s]: " + seq_hat_text, name)

        # copy old json info
        new_json[name] = dict()
        new_json[name]['utt2spk'] = recog_json[name]['utt2spk']

        # added recognition results to json
        logging.debug("dump token id")
        out_dic = dict()
        for _key in recog_json[name]['output'][0]:
            out_dic[_key] = recog_json[name]['output'][0][_key]

        # TODO(karita) make consistent to chainer as idx[0] not idx
        out_dic['rec_tokenid'] = " ".join([str(idx) for idx in y_hat])
        logging.debug("dump token")
        out_dic['rec_token'] = " ".join(seq_hat)
        logging.debug("dump text")
        out_dic['rec_text'] = seq_hat_text

        new_json[name]['output'] = [out_dic]
        # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
        # add n-best recognition results with scores
        if args.beam_size > 1 and len(nbest_hyps) > 1:
            for i, hyp in enumerate(nbest_hyps):
                y_hat = hyp['yseq'][1:]
                seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) +
                               ']'] = " ".join([str(idx) for idx in y_hat])
                new_json[name]['rec_token' + '[' + '{:05d}'.format(i) +
                               ']'] = " ".join(seq_hat)
                new_json[name]['rec_text' + '[' + '{:05d}'.format(i) +
                               ']'] = seq_hat_text
                new_json[name]['score' + '[' + '{:05d}'.format(i) +
                               ']'] = hyp['score']

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_json
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #17
def recog(args):
    '''Run recognition'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    torch_load(args.model, model)

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)

        rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
            feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
            nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm)
            new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                               train_args.char_list)

    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #18
File: lazy_io.py Project: weiwchu/espnet
    def __getitem__(self, item):
        return read_mat(self.loader_dict[item.decode('utf-8')])
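
For context, a minimal sketch of the kind of lazy reader this method could belong to; the class name and constructor below are assumptions for illustration, not taken from lazy_io.py:

class LazyMatReader(object):
    """Hypothetical wrapper mapping byte-string keys to lazily loaded Kaldi matrices."""

    def __init__(self, loader_dict):
        # loader_dict: {utterance id (str): Kaldi rxfilename (str)}
        self.loader_dict = loader_dict

    def __getitem__(self, item):
        return read_mat(self.loader_dict[item.decode('utf-8')])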