def gen_post(feat_list, model, skip_frames):
    """Run ``model`` over every listed feature file and save cepstral output.

    For each non-empty line of ``feat_list`` the HTK feature file named on
    that line is read, normalized with the mean/variance returned by
    ``read_mv(stat_file)``, reorganized by ``org_data`` and passed through
    ``model``. The model output is DCT-transformed (type 2, orthonormal, along
    axis 1), coefficients ``1..numcep`` are kept, and the results of
    ``delta(., 2)`` applied once and twice are appended along axis 1 before
    writing the matrix to an HTK file.

    Args:
        feat_list: Path of a text file with one feature-file path per line.
        model: Network invoked as ``model(packed_input, lengths)``.
        skip_frames: Forwarded unchanged to ``org_data``.

    Raises:
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
    """
    model.eval()  # Put the model in test mode (the opposite of model.train(), essentially)

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")

    with open(feat_list) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            print("generating features for file", line)

            io = htk_io.fopen(line)
            utt_feat = io.getall()
            utt_feat -= m  # normalize mean
            utt_feat /= (np.sqrt(v) + eps)  # normalize var
            feat_numpy = org_data(utt_feat, skip_frames)

            x = torch.from_numpy(feat_numpy).type(gpu_dtype)
            input_size_list = [x.size(1)]  # number of time steps

            # ``Variable(..., volatile=True)`` no longer disables autograd on
            # PyTorch >= 0.4; ``torch.no_grad()`` is the supported way to run
            # inference without building a graph.
            with torch.no_grad():
                packed = nn.utils.rnn.pack_padded_sequence(
                    x, input_size_list, batch_first=True)
                out_feat = model(packed, input_size_list)
            out_feat_numpy = out_feat.data.cpu().numpy()

            # Keep DCT coefficients 1..numcep (coefficient 0 is dropped).
            out_feat_numpy = dct(
                out_feat_numpy, type=2, axis=1, norm='ortho')[:, 1:numcep + 1]
            out_feat_delta = delta(out_feat_numpy, 2)
            out_feat_ddelta = delta(out_feat_delta, 2)
            out_feat_numpy = np.concatenate(
                (out_feat_numpy, out_feat_delta, out_feat_ddelta), axis=1)

            # NOTE(review): replace() leaves a path without ".fea" unchanged,
            # in which case the output is written over the input file --
            # confirm every listed file uses the ".fea" extension.
            out_file = line.replace(".fea", ".mfc")
            io = htk_io.fopen(out_file, mode="wb",
                              veclen=out_feat_numpy.shape[1])
            io.writeall(out_feat_numpy)
            print("features saved in %s\n" % out_file)
def get_embed(self, wav_file, cfg_file):
    """Compute frame-level and utterance-level posteriors and embeddings.

    The wav file is converted with ``self._HCopy``; the HTK feature file at
    ``wav_file[:-4] + '.htk'`` is then read, normalized with
    ``self.global_mean_emr`` / ``self.global_var_emr`` and run as a batch of
    one through ``self.rnns``, ``self.batch_norm``, ``self.fc`` and
    ``self.cls``.

    Args:
        wav_file: Path of the input wav file; its last four characters are
            replaced by ``'.htk'`` to locate the feature file.
        cfg_file: Configuration path forwarded to ``self._HCopy``.

    Returns:
        ``(x_prob, x_embed, x_prob_g, x_embed_g)`` as NumPy arrays with
        singleton axes squeezed out: per-frame softmax output and ``fc``
        output, followed by the same two quantities computed from the
        time-averaged activations. ``None`` when ``self._HCopy`` returns
        ``None``.
    """
    htk_feat_file = wav_file[:-4] + '.htk'
    # presumably _HCopy writes the features to htk_feat_file -- confirm
    if self._HCopy(cfg_file, wav_file) is None:
        return None

    utt_feat = htk_io.fopen(htk_feat_file).getall()
    num_frms = utt_feat.shape[0]
    utt_feat -= self.global_mean_emr
    utt_feat /= (np.sqrt(self.global_var_emr) + 1e-8)

    with torch.no_grad():
        # Add a leading batch axis of size one and move the data to the GPU.
        x = torch.FloatTensor(utt_feat[np.newaxis, :, :]).cuda()
        for rnn in self.rnns:
            x = rnn(x, [num_frms])
        x = self.batch_norm(x, [num_frms])

        # Frame-level outputs.
        x_embed = self.fc(x)
        x_prob = nn.Softmax(dim=2)(self.cls(x_embed))

        # Utterance-level outputs from the mean over the time axis.
        x_ave = torch.mean(x[0, 0:num_frms, :], dim=0, keepdim=True)
        x_embed_g = self.fc(x_ave)
        x_prob_g = nn.Softmax(dim=1)(self.cls(x_embed_g))

    return (
        np.squeeze(x_prob.data.cpu().numpy()),
        np.squeeze(x_embed.data.cpu().numpy()),
        np.squeeze(x_prob_g.data.cpu().numpy()),
        np.squeeze(x_embed_g.data.cpu().numpy()),
    )
def proc_frame(feat_list, skip_frames=0):
    """Normalize paired source/target utterances and save them in chunks.

    Every non-empty line of ``feat_list`` must hold two whitespace-separated
    paths: a source and a target HTK feature file. Both are mean/variance
    normalized with the statistics read from ``stat_file_src`` and
    ``stat_file_tgt``. The ``(source, target)`` pairs are buffered and
    written to ``<out_folder_base>/<chunk_idx>.h5`` through ``make_chunk`` /
    ``save_hd5``.

    Args:
        feat_list: Path of the text file listing the feature-file pairs.
        skip_frames: When positive, the source is padded at the end with that
            many copies of its last frame and the same number of leading
            frames is dropped, so the frame count is unchanged.

    Raises:
        Exception: If a mean/variance vector is missing, or a line does not
            contain exactly two fields.
    """
    if not os.path.exists(out_folder_base):
        os.makedirs(out_folder_base)

    m_src, v_src = read_mv(stat_file_src)
    if m_src is None or v_src is None:
        raise Exception(
            "mean or variance vector for the source features does not exist")
    m_tgt, v_tgt = read_mv(stat_file_tgt)
    if m_tgt is None or v_tgt is None:
        raise Exception(
            "mean or variance vector for the target features does not exist")

    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file on every exit path, including
    # the exceptions raised for malformed lines or unreadable feature files.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_seq:
                # Buffer is full: output to hard drive before reading more.
                # make_chunk returns the entries still cached and their count.
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':
                print('All utterances processed')
                break
            line = line.strip()
            if not line:
                continue

            line_split = line.split()
            if len(line_split) != 2:
                raise Exception("target feat file missing")
            src_feat_file, tgt_feat_file = line_split

            utt_feat_tgt = htk_io.fopen(tgt_feat_file).getall()
            frm_num_tgt, _ = utt_feat_tgt.shape
            utt_feat_tgt -= m_tgt  # mean normalization
            utt_feat_tgt /= (np.sqrt(v_tgt) + eps)  # var normalization

            utt_feat_src = htk_io.fopen(src_feat_file).getall()
            frm_num_src, _ = utt_feat_src.shape
            if frm_num_src > frm_num_tgt:
                # Trim surplus source frames to the target length.
                utt_feat_src = utt_feat_src[:frm_num_tgt]
                print("%d source frames, match to %d target frames" %
                      (frm_num_src, frm_num_tgt))
            utt_feat_src -= m_src  # mean normalization
            utt_feat_src /= (np.sqrt(v_src) + eps)  # var normalization

            if skip_frames > 0:
                # pad the ending frames, then drop the same number up front
                utt_feat_src = np.pad(utt_feat_src,
                                      ((0, skip_frames), (0, 0)),
                                      mode='edge')
                utt_feat_src = utt_feat_src[skip_frames:, :]

            data_cache.append((utt_feat_src, utt_feat_tgt))  # fill the buffer
            buffer_len += 1
            print("Processed %d of %d frames for file %s" %
                  (utt_feat_src.shape[0], frm_num_src, src_feat_file))
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_seq:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
def proc_frame(feat_list):
    """Build per-frame context windows with target labels and save in chunks.

    Every non-empty line of ``feat_list`` must hold two whitespace-separated
    paths: a source and a target HTK feature file. Both are mean/variance
    normalized with the statistics read from ``stat_file_src`` and
    ``stat_file_tgt``. The source is edge-padded with ``win_size_before``
    frames in front and ``win_size_after`` frames behind. For each target
    frame ``t`` the window of ``win_size_before + 1 + win_size_after`` source
    frames around ``t`` is transposed, given a leading axis of size one, and
    cached together with target frame ``t``. The cache is written to
    ``<out_folder_base>/<chunk_idx>.h5`` through ``make_chunk`` /
    ``save_hd5``.

    Args:
        feat_list: Path of the text file listing the feature-file pairs.

    Raises:
        Exception: If a mean/variance vector is missing, a line does not
            contain exactly two fields, or a source utterance has fewer
            frames than its target (its trailing windows would be truncated).
    """
    if not os.path.exists(out_folder_base):
        os.makedirs(out_folder_base)

    m_src, v_src = read_mv(stat_file_src)
    if m_src is None or v_src is None:
        raise Exception(
            "mean or variance vector for the source features does not exist")
    m_tgt, v_tgt = read_mv(stat_file_tgt)
    if m_tgt is None or v_tgt is None:
        raise Exception(
            "mean or variance vector for the target features does not exist")

    win_len = win_size_before + win_size_after + 1
    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file on every exit path, including
    # the exceptions raised for malformed lines or unreadable feature files.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_num_frms:
                # Buffer is full: output to hard drive before reading more.
                # make_chunk returns the entries still cached and their count.
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':
                print('All utterances processed')
                break
            line = line.strip()
            if not line:
                continue

            line_split = line.split()
            if len(line_split) != 2:
                raise Exception("target feat file missing")
            src_feat_file, tgt_feat_file = line_split

            utt_feat_tgt = htk_io.fopen(tgt_feat_file).getall()
            frm_num_tgt, _ = utt_feat_tgt.shape
            utt_feat_tgt -= m_tgt  # mean normalization
            utt_feat_tgt /= (np.sqrt(v_tgt) + eps)  # var normalization

            utt_feat_src = htk_io.fopen(src_feat_file).getall()
            frm_num_src, _ = utt_feat_src.shape
            if frm_num_src < frm_num_tgt:
                # Without this guard the slices near the end of the utterance
                # run past the padded array and silently come back shorter
                # than win_len (or empty).
                raise Exception(
                    "%d source frames cannot cover %d target frames in file %s"
                    % (frm_num_src, frm_num_tgt, src_feat_file))
            if frm_num_src > frm_num_tgt:
                print("%d source frames, match to %d target frames" %
                      (frm_num_src, frm_num_tgt))
            utt_feat_src -= m_src  # mean normalization
            utt_feat_src /= (np.sqrt(v_src) + eps)  # var normalization

            # pad the starting and ending frames
            utt_feat_src = np.pad(
                utt_feat_src,
                ((win_size_before, win_size_after), (0, 0)),
                mode='edge')

            # After padding, target frame t is centred at padded index
            # t + win_size_before, so its window starts at padded index t.
            for t in range(frm_num_tgt):  # process one utterance
                block_data = utt_feat_src[t:t + win_len, :].T
                block_data = block_data.reshape(1, block_data.shape[0],
                                                block_data.shape[1])
                data_cache.append((block_data, utt_feat_tgt[t]))  # fill the buffer
                buffer_len += 1

            print("Processed %d of %d frames for file %s" %
                  (frm_num_tgt, frm_num_tgt, src_feat_file))
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_num_frms:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
def proc_seq(mlf_dict, feat_list, out_folder, skip_frames=5):
    """Normalize listed utterances, pair them with labels and save in chunks.

    Each non-empty line of ``feat_list`` names an HTK feature file. Its last
    ``'/'``-separated path component is the key used to look up the label in
    ``mlf_dict``. The features are mean/variance normalized with the
    statistics read from ``stat_file``. The ``(features, label)`` pairs are
    buffered and written to ``<out_folder>/<chunk_idx>.h5`` through
    ``make_chunk`` / ``save_hd5``.

    Args:
        mlf_dict: Mapping from file name to label. It is modified in place:
            each entry is removed once its utterance has been cached.
        feat_list: Path of the text file listing the feature files.
        out_folder: Output directory; created when it does not exist.
        skip_frames: When positive, the features are padded at the end with
            that many copies of the last frame and the same number of leading
            frames is dropped, so the frame count is unchanged.

    Raises:
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
        KeyError: If a listed file has no entry in ``mlf_dict``.
    """
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")

    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file on every exit path, including
    # a KeyError for a missing label or an unreadable feature file.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_seq:
                # Buffer is full: output to hard drive before reading more.
                # make_chunk returns the entries still cached and their count.
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':
                print('All utterances processed')
                break
            line = line.strip()
            if not line:
                continue

            filename_key = line.split('/')[-1]
            label = mlf_dict[filename_key]

            utt_feat_src = htk_io.fopen(line).getall()
            utt_feat_src -= m  # mean normalization
            utt_feat_src /= (np.sqrt(v) + eps)  # var normalization

            if skip_frames > 0:
                # pad the ending frames, then drop the same number up front
                utt_feat_src = np.pad(utt_feat_src,
                                      ((0, skip_frames), (0, 0)),
                                      mode='edge')
                utt_feat_src = utt_feat_src[skip_frames:, :]

            data_cache.append((utt_feat_src, label))  # fill the buffer
            buffer_len += 1
            print("Processed %d frames for file %s" %
                  (utt_feat_src.shape[0], filename_key))
            mlf_dict.pop(filename_key)
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_seq:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
def gen_decoded(feat_list, model_path, skip_frames=5):
    """Decode every listed feature file and write the results as an MLF.

    A ``Layered_RNN`` is built, loaded with the parameters stored at
    ``model_path`` and switched to evaluation mode. A decoder is selected by
    the module-level ``decoder_type`` (``'Greedy'``, ``'Beam'`` or
    ``'Beam_LM'``). Each non-empty line of ``feat_list`` names an HTK feature
    file; it is normalized with the statistics read from ``stat_file``, run
    through the model and decoded. ``out_mlf`` receives the ``#!MLF!#``
    header and then, per utterance, the quoted ``<stem>.rec`` name, one
    decoded label per line and a terminating ``.`` line.

    Args:
        feat_list: Path of the text file listing the feature files.
        model_path: Path of the saved model state dict.
        skip_frames: Forwarded to ``org_data``; defaults to the previously
            hard-coded value of 5.

    Raises:
        ValueError: If ``decoder_type`` is not one of the supported values.
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
    """
    model = set_model_ctc.Layered_RNN(
        rnn_input_size=40,
        nb_layers=layers,
        rnn_hidden_size=hidden_size,
        bidirectional=(num_dirs == 2),
        batch_norm=True,
        num_classes=61)
    model = model.type(gpu_dtype)
    model.load_state_dict(torch.load(model_path))  # load model params
    model.eval()  # Put the model in test mode (the opposite of model.train(), essentially)

    if decoder_type == 'Greedy':
        labels = create_mapping(mapping_file)
        decoder = ctc_decode.GreedyDecoder_test(
            labels, output='char', space_idx=-1)  # setup greedy decoder
    elif decoder_type == 'Beam':
        labels = create_mapping(mapping_file)
        scorer = Scorer()
        decoder = ctc_decode.BeamDecoder_test(
            labels, scorer, top_paths=1, beam_width=200, output='char',
            space_idx=-1)  # setup beam decoder without lm
    elif decoder_type == 'Beam_LM':
        labels_symbol = '_123456789abcde~-hij,.|{ofg?!+u}[x]@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        labels_true = create_mapping(mapping_file)
        # need to use the fake symbols here for consistency with the trie
        scorer = KenLMScorer(labels_symbol, kenlm_path, trie_path,
                             blank_index=0, space_index=-1)
        scorer.set_lm_weight(lm_weight)
        scorer.set_word_weight(lm_beta1)
        scorer.set_valid_word_weight(lm_beta2)
        # need to use the true timit label to convert the decoded position
        # indexes back to phone labels
        decoder = ctc_decode.BeamDecoder_test(
            labels_true, scorer, top_paths=1, beam_width=200, output='char',
            space_idx=-1)  # setup beam decoder with lm
    else:
        # Fail here rather than with a NameError on the first decode call,
        # after the output file has already been opened.
        raise ValueError("unsupported decoder_type: %r" % (decoder_type,))

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")

    with open(feat_list) as f:
        with open(out_mlf, 'w') as fw:
            fw.write('#!MLF!#\n')
            for line in f:
                line = line.strip()
                if not line:
                    continue
                print("recognizing file %s" % line)

                # Strip the text after the last '.'; a path without any '.'
                # is kept whole instead of losing its final character.
                stem, dot, _ = line.rpartition('.')
                if not dot:
                    stem = line
                fw.write('"' + stem + '.rec' + '"' + '\n')

                io = htk_io.fopen(line)
                utt_feat = io.getall()
                utt_feat -= m  # normalize mean
                utt_feat /= (np.sqrt(v) + eps)  # normalize var
                feat_numpy = org_data(utt_feat, skip_frames=skip_frames)

                x = torch.from_numpy(feat_numpy).type(gpu_dtype)
                input_sizes_list = [x.size(1)]

                # ``Variable(..., volatile=True)`` no longer disables autograd
                # on PyTorch >= 0.4; ``torch.no_grad()`` is the supported way
                # to run inference without building a graph.
                with torch.no_grad():
                    packed = nn.utils.rnn.pack_padded_sequence(
                        x, input_sizes_list, batch_first=True)
                    probs = model(packed, input_sizes_list)
                probs = probs.data.cpu()

                decoded = decoder.decode(probs, input_sizes_list)[0]
                for word in decoded:
                    fw.write(word + '\n')
                fw.write('.\n')
                print(' '.join(decoded))