def load_inputs_and_targets(batch, use_speaker_embedding=False, use_second_target=False):
    """Load TTS inputs and targets from a minibatch of data.json entries.

    :param list batch: list of dict which is subset of loaded data.json
    :param bool use_speaker_embedding: whether to load speaker embedding vector
    :param bool use_second_target: whether to load second target vector
    :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
    :rtype: list of int ndarray
    :return: list of target feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of speaker embedding vectors (None unless requested)
    :rtype: list of float ndarray
    :return: list of second target feature sequences [(T_1, V), ...] (None unless requested)
    :rtype: list of float ndarray
    """
    # token id sequences (network inputs) and acoustic feature matrices (targets)
    token_lists = [b[1]['output'][0]['tokenid'].split() for b in batch]
    feat_mats = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]

    # drop samples whose token sequence is empty
    keep_idx = [i for i, toks in enumerate(token_lists) if len(toks) > 0]
    if len(keep_idx) != len(token_lists):
        logging.warning(
            'Input sequences include empty tokenid (batch %d -> %d).' %
            (len(token_lists), len(keep_idx)))

    # order remaining samples by decreasing input (token) length
    order = sorted(keep_idx, key=lambda i: -len(token_lists[i]))

    xs = [np.fromiter(map(int, token_lists[i]), dtype=np.int64) for i in order]
    ys = [feat_mats[i] for i in order]

    # optional second target stream (second 'input' entry in the json)
    spcs = None
    if use_second_target:
        spcs = [kaldi_io_py.read_mat(b[1]['input'][1]['feat']) for b in batch]
        spcs = [spcs[i] for i in order]

    # optional speaker embedding vector (also the second 'input' entry)
    spembs = None
    if use_speaker_embedding:
        spembs = [kaldi_io_py.read_vec_flt(b[1]['input'][1]['feat']) for b in batch]
        spembs = [spembs[i] for i in order]

    return xs, ys, spembs, spcs
def converter_kaldi(batch, device=None, use_speaker_embedding=None):
    """Attach ASR/TTS feature matrices (and optionally speaker embeddings) to each utterance.

    :param list batch: one-element list wrapping the minibatch of (name, info) pairs
    :param device: unused; kept for converter interface compatibility
    :param use_speaker_embedding: when not None, also load the third input stream
    :return: the minibatch with 'feat_asr'/'feat_tts' (and 'feat_spembs') filled in
    """
    # batch only has one minibatch utterance, which is specified by batch[0]
    utterances = batch[0]
    for utt in utterances:
        info = utt[1]
        # read the two parallel feature streams referenced by the json entry
        info['feat_asr'] = kaldi_io_py.read_mat(info['input'][0]['feat'])
        info['feat_tts'] = kaldi_io_py.read_mat(info['input'][1]['feat'])
        if use_speaker_embedding is not None:
            # third input stream holds the speaker embedding vector
            info['feat_spembs'] = kaldi_io_py.read_vec_flt(info['input'][2]['feat'])
    return utterances
def load_inputs_and_targets(batch):
    """Load ASR inputs and targets from a minibatch of data.json entries.

    :param list batch: list of dict which is subset of loaded data.json
    :return: list of input feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
    :rtype: list of int ndarray
    """
    # acoustic feature matrices (inputs) and token id sequences (targets)
    feats = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]
    tokens = [b[1]['output'][0]['tokenid'].split() for b in batch]

    # keep only samples that actually have target tokens, longest input first
    valid = (i for i in range(len(feats)) if len(tokens[i]) > 0)
    order = sorted(valid, key=lambda i: -len(feats[i]))
    if len(order) != len(feats):
        logging.warning('Target sequences include empty tokenid (batch %d -> %d).' % (
            len(feats), len(order)))

    xs = [feats[i] for i in order]
    ys = [np.fromiter(map(int, tokens[i]), dtype=np.int64) for i in order]
    return xs, ys
def converter_kaldi(batch, device=None):
    """Attach the acoustic feature matrix to each utterance of the minibatch.

    :param list batch: one-element list wrapping the minibatch of (name, info) pairs
    :param device: unused; kept for converter interface compatibility
    :return: the minibatch with each info dict's 'feat' key filled in
    """
    # batch only has one minibatch utterance, which is specified by batch[0]
    utterances = batch[0]
    for utt in utterances:
        # read the feature matrix referenced by the json entry
        utt[1]['feat'] = kaldi_io_py.read_mat(utt[1]['input'][0]['feat'])
    return utterances
def load_inputs_and_targets(batch, sort_in_outputs=False, use_speaker_embedding=False):
    """Load ASR inputs/targets (and optionally speaker embeddings) from data.json entries.

    :param list batch: list of dict which is subset of loaded data.json
    :param bool sort_in_outputs: whether to sort in output lengths
    :param bool use_speaker_embedding: whether to load speaker embedding vector
    :return: list of input feature sequences [(T_1, D), (T_2, D), ..., (T_B, D)]
    :rtype: list of float ndarray
    :return: list of target token id sequences [(T_1), (T_2), ..., (T_B)]
    :rtype: list of int ndarray
    :return: list of speaker embedding vectors (only if use_speaker_embedding = True)
    :rtype: list of float ndarray
    """
    # acoustic feature matrices (inputs) and token id sequences (targets)
    feats = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]
    tokens = [b[1]['output'][0]['tokenid'].split() for b in batch]

    # keep only samples that actually have target tokens
    valid = [i for i in range(len(feats)) if len(tokens[i]) > 0]
    if sort_in_outputs:
        # longest target first
        order = sorted(valid, key=lambda i: -len(tokens[i]))
    else:
        # longest input first
        order = sorted(valid, key=lambda i: -len(feats[i]))
    if len(order) != len(feats):
        logging.warning(
            'Target sequences include empty tokenid (batch %d -> %d).' %
            (len(feats), len(order)))

    xs = [feats[i] for i in order]
    ys = [np.fromiter(map(int, tokens[i]), dtype=np.int64) for i in order]

    if not use_speaker_embedding:
        return xs, ys
    # speaker embedding vectors live in the second 'input' entry
    spembs = [kaldi_io_py.read_vec_flt(b[1]['input'][1]['feat']) for b in batch]
    return xs, ys, [spembs[i] for i in order]
def __call__(self, batch, is_training=True):
    """Convert a one-element minibatch of json entries into padded TTS tensors.

    :param list batch: one-element list wrapping the minibatch of (name, info) pairs
    :param bool is_training: if False, legacy-torch Variables are built volatile
    :return: (xs, ilens, ys, labels, olens) when self.return_targets else (xs, ilens, ys)
    """
    # batch should be located in list
    assert len(batch) == 1
    batch = batch[0]
    # get eos
    # eos id is derived from the output shape: vocabulary size - 1
    eos = str(int(batch[0][1]['output'][0]['shape'][1]) - 1)
    # get target features and input character sequence (eos appended to each input)
    xs = [b[1]['output'][0]['tokenid'].split() + [eos] for b in batch]
    ys = [kaldi_io_py.read_mat(b[1]['input'][0]['feat']) for b in batch]
    # remove empty sequence and get sort along with length (longest first)
    filtered_idx = filter(lambda i: len(xs[i]) > 0, range(len(ys)))
    sorted_idx = sorted(filtered_idx, key=lambda i: -len(xs[i]))
    xs = [np.fromiter(map(int, xs[i]), dtype=np.int64) for i in sorted_idx]
    ys = [ys[i] for i in sorted_idx]
    # get list of lengths (must be tensor for DataParallel)
    ilens = torch.from_numpy(
        np.fromiter((x.shape[0] for x in xs), dtype=np.int64))
    olens = torch.from_numpy(
        np.fromiter((y.shape[0] for y in ys), dtype=np.int64))
    # perform padding and convert to tensor
    xs = torch.from_numpy(pad_ndarray_list(xs, 0)).long()
    ys = torch.from_numpy(pad_ndarray_list(ys, 0)).float()
    # make labels for stop prediction: 1 from the last real frame onward
    labels = ys.new(ys.size(0), ys.size(1)).zero_()
    for i, l in enumerate(olens):
        labels[i, l - 1:] = 1
    # TODO(kan-bayashi): need to be fixed in pytorch v4
    if torch_is_old:
        # legacy torch (<0.4): wrap in Variables; volatile disables autograd at eval
        xs = Variable(xs, volatile=not is_training)
        ys = Variable(ys, volatile=not is_training)
        labels = Variable(labels, volatile=not is_training)
    # NOTE(review): self.device appears to be a collection of device ids where a
    # non-negative sum means "use GPU" -- confirm against the class constructor
    if sum(self.device) >= 0:
        xs = xs.cuda()
        ys = ys.cuda()
        labels = labels.cuda()
    if self.return_targets:
        return xs, ilens, ys, labels, olens
    else:
        return xs, ilens, ys
def read_mat_scp(file_or_fd):
    """ generator(key,mat) = read_mat_scp(file_or_fd)
    Returns generator of (key,matrix) tuples, read according to kaldi scp.
    file_or_fd : scp, gzipped scp, pipe or opened file descriptor.

    Iterate the scp:
    for key,mat in kaldi_io.read_mat_scp(file):
        ...

    Read scp to a 'dictionary':
    d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) }
    """
    fd = open_or_fd(file_or_fd)
    try:
        for line in fd:
            # scp line format: "<key> <rxfilename>"; fd yields bytes, so split on b' '
            (key, rxfile) = line.split(' '.encode(), 1)
            # BUGFIX: compare against a *bytes* literal. rxfile is bytes, and on
            # Python 3 the old comparison (rxfile[:2] == '\0B') was always False,
            # so the binary-header branch could never be taken.
            if rxfile[:2] == '\0B'.encode():
                # payload already starts with the kaldi binary header
                mat = _read_mat_binary(rxfile)
            else:
                mat = read_mat(rxfile)
            yield key.decode(), mat
    finally:
        # only close descriptors we opened ourselves
        if fd is not file_or_fd:
            fd.close()
fc += 1 spk2gender[sp[0]] = 'f' else: continue else: if mc < MAX_MALE: mc += 1 spk2gender[sp[0]] = 'm' else: continue spk2utt[sp[0]] = utts spk_feats = [] for u in utts: utt_feat = kaldi_io_py.read_mat(feats[u]) if utt_feat.shape[0] > 1: mean_feat = np.concatenate( (np.mean(utt_feat, axis=0), np.var(utt_feat, axis=0))) else: mean_feat = utt_feat spk_feats.append(mean_feat) spk_feats = np.array(spk_feats) spk2featlen[sp[0]] = spk_feats.shape[0] #print(spk_feats.shape) X.append(spk_feats) nspk = len(spk2gender.keys()) print("Number of speakers", nspk)
def recog(args):
    '''Run recognition with a chainer E2E model and dump n-best results to json.

    :param Namespace args: command-line arguments (model paths, beam settings, json paths)
    '''
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)
    # seed setting (chainer seed may not need it)
    os.environ["CHAINER_SEED"] = str(args.seed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])
    # read training config (pickled (idim, odim, train_args) triple)
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from' + args.model_conf)
        idim, odim, train_args = pickle.load(f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))
    # specify model architecture
    logging.info('reading model parameters from' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    chainer.serializers.load_npz(args.model, model)
    # read rnnlm (character-level LM used for shallow fusion during decoding)
    if args.rnnlm:
        rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(train_args.char_list), 650))
        chainer.serializers.load_npz(args.rnnlm, rnnlm)
    else:
        rnnlm = None
    if args.word_rnnlm:
        # word-level LM requires an explicit word dictionary
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(word_dict), 650))
        chainer.serializers.load_npz(args.word_rnnlm, word_rnnlm)
        if rnnlm is not None:
            # combine char-level and word-level LMs
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            # word-level LM alone, consulted via character look-ahead
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))
    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    for name in recog_json.keys():
        feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
        logging.info('decoding ' + name)
        nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm)
        # get 1best and remove sos
        y_hat = nbest_hyps[0]['yseq'][1:]
        y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())
        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
        seq_true_text = "".join(seq_true).replace('<space>', ' ')
        logging.info("groundtruth[%s]: " + seq_true_text, name)
        logging.info("prediction [%s]: " + seq_hat_text, name)
        # copy old json info
        new_json[name] = dict()
        new_json[name]['utt2spk'] = recog_json[name]['utt2spk']
        # add 1-best recognition results to json
        logging.debug("dump token id")
        out_dic = dict()
        for _key in recog_json[name]['output'][0]:
            out_dic[_key] = recog_json[name]['output'][0][_key]
        # TODO(karita) make consistent to chainer as idx[0] not idx
        # NOTE(review): idx[0] implies each y_hat element is indexable (e.g. a
        # 1-element array) in this chainer branch -- confirm against e2e.recognize
        out_dic['rec_tokenid'] = " ".join([str(idx[0]) for idx in y_hat])
        logging.debug("dump token")
        out_dic['rec_token'] = " ".join(seq_hat)
        logging.debug("dump text")
        out_dic['rec_text'] = seq_hat_text
        new_json[name]['output'] = [out_dic]
        # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
        # add n-best recognition results with scores
        if args.beam_size > 1 and len(nbest_hyps) > 1:
            for i, hyp in enumerate(nbest_hyps):
                y_hat = hyp['yseq'][1:]
                seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) + ']'] \
                    = " ".join([str(idx[0]) for idx in y_hat])
                new_json[name]['rec_token' + '[' + '{:05d}'.format(i) + ']'] = " ".join(seq_hat)
                new_json[name]['rec_text' + '[' + '{:05d}'.format(i) + ']'] = seq_hat_text
                new_json[name]['score' + '[' + '{:05d}'.format(i) + ']'] = hyp['score']
    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_json
            }, indent=4, sort_keys=True).encode('utf_8'))
# Script span: rewrite the feat path/shape entries of split data.json files so
# they point at a newly extracted feature set (erep_train_100).
old_json_dir = 'dump/train_100/deltafalse/split_utt_spk'
new_feats_scp = 'data/erep_train_100/feats.scp'
new_json_dir = 'data/erep_train_100/json'
if not exists(new_json_dir):
    os.makedirs(new_json_dir)

print('Create feats dictionary...')
# map utterance id -> {path, shape} from the new feats.scp
feats_dict = {}
with open(new_feats_scp) as f:
    for line in f.read().splitlines():
        sp = line.split()
        feats_dict[sp[0]] = {
            u'path': sp[1],
            # reading the whole matrix just to obtain its shape
            u'shape': kaldi_io_py.read_mat(sp[1]).shape
        }
print('Done reading features!')

print('Reading data jsons...')
djsons = [x for x in os.listdir(old_json_dir) if x.endswith('.json')]
for jsfile in djsons:
    print('Reading ' + jsfile)
    with open(join(old_json_dir, jsfile)) as f1, open(join(new_json_dir, jsfile), 'w') as f2:
        js = json.load(f1)
        utt_ids = js[u'utts'].keys()
        for k in utt_ids:
            # point each utterance at the new feature file and its shape
            js[u'utts'][k][u'input'][0][u'feat'] = feats_dict[k][u'path']
            js[u'utts'][k][u'input'][0][u'shape'] = feats_dict[k][u'shape']
            # NOTE(review): f2 is opened for writing but no json.dump is visible
            # in this chunk -- the write presumably happens past this view; confirm
def recog(args): '''Run recognition''' # display chainer version logging.info('chainer version = ' + chainer.__version__) # seed setting (chainer seed may not need it) os.environ["CHAINER_SEED"] = str(args.seed) logging.info('chainer seed = ' + os.environ['CHAINER_SEED']) # read training config idim, odim, train_args = get_model_conf(args.model, args.model_conf) for key in sorted(vars(args).keys()): logging.info('ARGS: ' + key + ': ' + str(vars(args)[key])) # specify model architecture logging.info('reading model parameters from ' + args.model) e2e = E2E(idim, odim, train_args) model = Loss(e2e, train_args.mtlalpha) chainer_load(args.model, model) # read rnnlm if args.rnnlm: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_chainer.ClassifierWithState( lm_chainer.RNNLM(len(train_args.char_list), rnnlm_args.unit)) chainer_load(args.rnnlm, rnnlm) else: rnnlm = None if args.word_rnnlm: if not args.word_dict: logging.error( 'word dictionary file is not specified for the word RNNLM.') sys.exit(1) rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf) word_dict = load_labeldict(args.word_dict) char_dict = {x: i for i, x in enumerate(train_args.char_list)} word_rnnlm = lm_chainer.ClassifierWithState( lm_chainer.RNNLM(len(word_dict), rnnlm_args.unit)) chainer_load(args.word_rnnlm, word_rnnlm) if rnnlm is not None: rnnlm = lm_chainer.ClassifierWithState( extlm_chainer.MultiLevelLM(word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict)) else: rnnlm = lm_chainer.ClassifierWithState( extlm_chainer.LookAheadWordLM(word_rnnlm.predictor, word_dict, char_dict)) # read json data with open(args.recog_json, 'rb') as f: js = json.load(f)['utts'] # decode each utterance new_js = {} with chainer.no_backprop_mode(): for idx, name in enumerate(js.keys(), 1): logging.info('(%d/%d) decoding ' + name, idx, len(js.keys())) feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat']) nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm) new_js[name] = 
add_results_to_json(js[name], nbest_hyps, train_args.char_list) # TODO(watanabe) fix character coding problems when saving it with open(args.result_label, 'wb') as f: f.write( json.dumps({ 'utts': new_js }, indent=4, sort_keys=True).encode('utf_8'))
def recog(args):
    '''Run recognition with a pytorch E2E model, single-utterance or batched.

    :param Namespace args: command-line arguments (model paths, LM options,
        batchsize, json paths)
    '''
    # seed setting
    torch.manual_seed(args.seed)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    torch_load(args.model, model)
    e2e.recog_args = args
    # read rnnlm (character-level LM; layer/unit sizes from its saved config)
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None
    if args.word_rnnlm:
        # word dictionary is stored inside the word RNNLM's own config here
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()
        if rnnlm is not None:
            # combine char-level and word-level LMs
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            # word-level LM alone, consulted via character look-ahead
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))
    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()
    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}
    if args.batchsize == 0:
        # decode one utterance at a time
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
                feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                nbest_hyps = e2e.recognize(feat, args, train_args.char_list,
                                           rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)
    else:
        # py2/py3 compatible import of zip_longest
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            # chunk the iterable into groups of n, padding the last group
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data by decreasing input length (shape[0] = number of frames)
        keys = list(js.keys())
        feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]
        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the None padding from the final group
                names = [name for name in names if name]
                feats = [
                    kaldi_io_py.read_mat(js[name]['input'][0]['feat'])
                    for name in names
                ]
                nbest_hyps = e2e.recognize_batch(feats, args,
                                                 train_args.char_list,
                                                 rnnlm=rnnlm)
                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp,
                                                       train_args.char_list)
    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
def recog(args):
    '''Run recognition with a pytorch E2E model, assembling result json by hand.

    :param Namespace args: command-line arguments (model paths, LM options, json paths)
    '''
    # seed setting
    torch.manual_seed(args.seed)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)
    torch_load(args.model, model)
    # read rnnlm (character-level LM; hidden size from its saved config)
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None
    if args.word_rnnlm:
        # word-level LM requires an explicit word dictionary
        if not args.word_dict:
            logging.error('word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)
        rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(lm_pytorch.RNNLM(len(word_dict), rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()
        if rnnlm is not None:
            # combine char-level and word-level LMs
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            # word-level LM alone, consulted via character look-ahead
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))
    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    with torch.no_grad():
        for name in recog_json.keys():
            feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
            nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm=rnnlm)
            # get 1best and remove sos
            y_hat = nbest_hyps[0]['yseq'][1:]
            y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())
            # print out decoding result
            seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
            seq_true = [train_args.char_list[int(idx)] for idx in y_true]
            seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
            seq_true_text = "".join(seq_true).replace('<space>', ' ')
            logging.info("groundtruth[%s]: " + seq_true_text, name)
            logging.info("prediction [%s]: " + seq_hat_text, name)
            # copy old json info
            new_json[name] = dict()
            new_json[name]['utt2spk'] = recog_json[name]['utt2spk']
            # added recognition results to json
            logging.debug("dump token id")
            out_dic = dict()
            for _key in recog_json[name]['output'][0]:
                out_dic[_key] = recog_json[name]['output'][0][_key]
            # TODO(karita) make consistent to chainer as idx[0] not idx
            out_dic['rec_tokenid'] = " ".join([str(idx) for idx in y_hat])
            logging.debug("dump token")
            out_dic['rec_token'] = " ".join(seq_hat)
            logging.debug("dump text")
            out_dic['rec_text'] = seq_hat_text
            new_json[name]['output'] = [out_dic]
            # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
            # add n-best recognition results with scores
            if args.beam_size > 1 and len(nbest_hyps) > 1:
                for i, hyp in enumerate(nbest_hyps):
                    y_hat = hyp['yseq'][1:]
                    seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                    seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                    new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) + ']'] = \
                        " ".join([str(idx) for idx in y_hat])
                    new_json[name]['rec_token' + '[' + '{:05d}'.format(i) + ']'] = " ".join(seq_hat)
                    new_json[name]['rec_text' + '[' + '{:05d}'.format(i) + ']'] = seq_hat_text
                    new_json[name]['score' + '[' + '{:05d}'.format(i) + ']'] = hyp['score']
    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(json.dumps({'utts': new_json}, indent=4, sort_keys=True).encode('utf_8'))
def encode(args):
    '''Get ASR encoded representations...probably for xvectors

    Writes encoder representations for every utterance in args.feats_in to a
    kaldi ark/scp pair named after args.feats_out.

    :param Namespace args: command-line arguments (model paths, feats_in/out, batchsize)
    '''
    # seed setting
    torch.manual_seed(args.seed)
    # read training config
    idim, odim, odim_adv, train_args = get_model_conf(args.model, args.model_conf)
    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args, odim_adv=odim_adv)
    model = Loss(e2e, train_args.mtlalpha)
    if train_args.rnnlm is not None:
        # set rnnlm. external rnnlm is used for recognition.
        # NOTE(review): `rnnlm` is not defined anywhere in this function, so this
        # branch raises NameError when train_args.rnnlm is set -- the load of the
        # rnnlm appears to be missing; confirm and fix upstream
        model.predictor.rnnlm = rnnlm
    torch_load(args.model, model)
    e2e.recog_args = args
    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
    # pipe through copy-feats to produce a {feats_out}.ark/.scp pair
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.feats_out, args.feats_out)
    if args.batchsize == 0:
        # encode one utterance at a time
        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f, open(args.feats_in, 'rb') as f2:
                lines = f2.read().splitlines()
                for idx, line in enumerate(lines, 1):
                    # each scp line: "<name> <featpath>"
                    line = line.strip().split()
                    name = line[0]
                    logging.info('(%d/%d) decoding ' + name, idx, len(lines))
                    feat = kaldi_io_py.read_mat(line[1])
                    rep = e2e.erep(feat)
                    logging.info('Rep shape: %s', rep.shape)
                    kaldi_io_py.write_mat(f, rep, name)
    else:
        # py2/py3 compatible import of zip_longest
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            # chunk the iterable into groups of n, padding the last group
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # Create json object for batch processing
        logging.info("Creating json for batch processing...")
        js = {}
        with open(args.feats_in, 'rb') as f:
            lines = f.read().splitlines()
            for line in lines:
                line = line.strip().split()
                name = line[0]
                featpath = line[1]
                # reading the whole matrix just to record its shape
                feat_shape = kaldi_io_py.read_mat(featpath).shape
                js[name] = {'feat': featpath, 'shape': feat_shape}
        # sort data by decreasing number of frames
        logging.info("Sorting data for batch processing...")
        keys = list(js.keys())
        feat_lens = [js[key]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]
        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
                for names in grouper(args.batchsize, keys, None):
                    # drop the None padding from the final group
                    names = [name for name in names if name]
                    feats = [
                        kaldi_io_py.read_mat(js[name]['feat'])
                        for name in names
                    ]
                    reps, replens = e2e.erep_batch(feats)
                    # NOTE(review): debug print left in -- consider logging.debug
                    print(reps.shape, replens)
                    for i, rep in enumerate(reps):
                        name = names[i]
                        kaldi_io_py.write_mat(f, rep, name)
def recog(args): '''Run recognition''' # seed setting torch.manual_seed(args.seed) # read training config idim, odim, odim_adv, train_args = get_model_conf(args.model, args.model_conf) # load trained model parameters logging.info('reading model parameters from ' + args.model) e2e = E2E(idim, odim_adv, train_args) model = Loss(e2e) torch_load(args.model, model) e2e.recog_args = args # gpu if args.ngpu == 1: gpu_id = range(args.ngpu) logging.info('gpu id: ' + str(gpu_id)) model.cuda() # read json data with open(args.recog_json, 'rb') as f: js = json.load(f)['utts'] new_js = {} if args.batchsize == 0: with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info('(%d/%d) decoding ' + name, idx, len(js.keys())) feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat']) nbest_hyps = e2e.recognize(feat) new_js[name] = add_results_to_json(js[name], nbest_hyps) else: try: from itertools import zip_longest as zip_longest except Exception: from itertools import izip_longest as zip_longest def grouper(n, iterable, fillvalue=None): kargs = [iter(iterable)] * n return zip_longest(*kargs, fillvalue=fillvalue) # sort data keys = list(js.keys()) feat_lens = [js[key]['input'][0]['shape'][0] for key in keys] sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i]) keys = [keys[i] for i in sorted_index] with torch.no_grad(): for names in grouper(args.batchsize, keys, None): names = [name for name in names if name] feats = [ kaldi_io_py.read_mat(js[name]['input'][0]['feat']) for name in names ] nbest_hyps = e2e.recognize_batch(feats) for i, nbest_hyp in enumerate(nbest_hyps): name = names[i] new_js[name] = add_results_to_json(js[name], nbest_hyp) # TODO(watanabe) fix character coding problems when saving it with open(args.result_label, 'wb') as f: f.write( json.dumps({ 'utts': new_js }, indent=4, sort_keys=True).encode('utf_8'))
def recog(args):
    '''Run recognition with a pytorch E2E model (legacy pickle-config variant).

    :param Namespace args: command-line arguments (model paths, LM options,
        input_tensor flag, json paths)
    '''
    # seed setting
    torch.manual_seed(args.seed)
    # read training config (pickled (idim, odim, train_args) triple)
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from' + args.model_conf)
        idim, odim, train_args = pickle.load(f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))
    # specify model architecture
    logging.info('reading model parameters from' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = Loss(e2e, train_args.mtlalpha)

    def cpu_loader(storage, location):
        # map_location hook: keep all tensors on CPU when loading
        return storage

    def remove_dataparallel(state_dict):
        # strip the "module." prefix added by nn.DataParallel checkpoints
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("module."):
                k = k[7:]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))
    # read rnnlm (character-level LM, fixed 650 hidden units)
    if args.rnnlm:
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), 650))
        rnnlm.load_state_dict(torch.load(args.rnnlm, map_location=cpu_loader))
        rnnlm.eval()
    else:
        rnnlm = None
    if args.word_rnnlm:
        # word-level LM requires an explicit word dictionary
        if not args.word_dict:
            logging.error(
                'word dictionary file is not specified for the word RNNLM.')
            sys.exit(1)
        word_dict = load_labeldict(args.word_dict)
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), 650))
        word_rnnlm.load_state_dict(
            torch.load(args.word_rnnlm, map_location=cpu_loader))
        word_rnnlm.eval()
        if rnnlm is not None:
            # combine char-level and word-level LMs
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            # word-level LM alone, consulted via character look-ahead
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))
    # read json data
    with open(args.recog_json, 'rb') as f:
        recog_json = json.load(f)['utts']

    if not torch_is_old:
        # modern torch: disable autograd globally for decoding
        torch.set_grad_enabled(False)

    new_json = {}
    for name in recog_json.keys():
        if args.input_tensor:
            # features stored as a torch7 tensor file
            feat = load_lua(recog_json[name]['input'][0]['feat']).numpy()
        else:
            feat = kaldi_io_py.read_mat(recog_json[name]['input'][0]['feat'])
        nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm=rnnlm)
        # get 1best and remove sos
        y_hat = nbest_hyps[0]['yseq'][1:]
        y_true = map(int, recog_json[name]['output'][0]['tokenid'].split())
        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
        seq_true_text = "".join(seq_true).replace('<space>', ' ')
        logging.info("groundtruth[%s]: " + seq_true_text, name)
        logging.info("prediction [%s]: " + seq_hat_text, name)
        # copy old json info
        new_json[name] = dict()
        new_json[name]['utt2spk'] = recog_json[name]['utt2spk']
        # added recognition results to json
        logging.debug("dump token id")
        out_dic = dict()
        for _key in recog_json[name]['output'][0]:
            out_dic[_key] = recog_json[name]['output'][0][_key]
        # TODO(karita) make consistent to chainer as idx[0] not idx
        out_dic['rec_tokenid'] = " ".join([str(idx) for idx in y_hat])
        logging.debug("dump token")
        out_dic['rec_token'] = " ".join(seq_hat)
        logging.debug("dump text")
        out_dic['rec_text'] = seq_hat_text
        new_json[name]['output'] = [out_dic]
        # TODO(nelson): Modify this part when saving more than 1 hyp is enabled
        # add n-best recognition results with scores
        if args.beam_size > 1 and len(nbest_hyps) > 1:
            for i, hyp in enumerate(nbest_hyps):
                y_hat = hyp['yseq'][1:]
                seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
                seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
                new_json[name]['rec_tokenid' + '[' + '{:05d}'.format(i) + ']'] = " ".join(
                    [str(idx) for idx in y_hat])
                new_json[name]['rec_token' + '[' + '{:05d}'.format(i) + ']'] = " ".join(seq_hat)
                new_json[name]['rec_text' + '[' + '{:05d}'.format(i) + ']'] = seq_hat_text
                new_json[name]['score' + '[' + '{:05d}'.format(i) + ']'] = hyp['score']
    # TODO(watanabe) fix character coding problems when saving it
    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_json
            }, indent=4, sort_keys=True).encode('utf_8'))
def recog(args): '''Run recognition''' # seed setting torch.manual_seed(args.seed) # read training config idim, odim, train_args = get_model_conf(args.model, args.model_conf) # load trained model parameters logging.info('reading model parameters from ' + args.model) e2e = E2E(idim, odim, train_args) model = Loss(e2e, train_args.mtlalpha) torch_load(args.model, model) # read rnnlm if args.rnnlm: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.unit)) torch_load(args.rnnlm, rnnlm) rnnlm.eval() else: rnnlm = None if args.word_rnnlm: if not args.word_dict: logging.error( 'word dictionary file is not specified for the word RNNLM.') sys.exit(1) rnnlm_args = get_model_conf(args.word_rnnlm, args.rnnlm_conf) word_dict = load_labeldict(args.word_dict) char_dict = {x: i for i, x in enumerate(train_args.char_list)} word_rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM(len(word_dict), rnnlm_args.unit)) torch_load(args.word_rnnlm, word_rnnlm) word_rnnlm.eval() if rnnlm is not None: rnnlm = lm_pytorch.ClassifierWithState( extlm_pytorch.MultiLevelLM(word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict)) else: rnnlm = lm_pytorch.ClassifierWithState( extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict, char_dict)) # read json data with open(args.recog_json, 'rb') as f: js = json.load(f)['utts'] # decode each utterance new_js = {} with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info('(%d/%d) decoding ' + name, idx, len(js.keys())) feat = kaldi_io_py.read_mat(js[name]['input'][0]['feat']) nbest_hyps = e2e.recognize(feat, args, train_args.char_list, rnnlm) new_js[name] = add_results_to_json(js[name], nbest_hyps, train_args.char_list) # TODO(watanabe) fix character coding problems when saving it with open(args.result_label, 'wb') as f: f.write( json.dumps({ 'utts': new_js }, indent=4, sort_keys=True).encode('utf_8'))
def __getitem__(self, item):
    """Resolve *item* (a utf-8 encoded bytes key) and read the matrix it maps to."""
    key = item.decode('utf-8')
    return read_mat(self.loader_dict[key])