    for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
        # x = x.to("cuda")
        x = x.to(device)
        outs, out_lens = model(x, x_lens)
        outs = F.softmax(outs, 1)
        outs = outs.transpose(1, 2)
        # Targets arrive as one flat tensor; split them back per utterance.
        ys = []
        offset = 0
        for y_len in y_lens:
            ys.append(y[offset : offset + y_len])
            offset += y_len
        out_strings, out_offsets = decoder.decode(outs, out_lens)
        y_strings = decoder.convert_to_strings(ys)
        for pred, truth in zip(out_strings, y_strings):
            trans, ref = pred[0], truth[0]
            cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    model.train()
    return cer


if __name__ == "__main__":
    with open("./labels.json") as f:
        vocabulary = json.load(f)
    vocabulary = "".join(vocabulary)
    model = GatedConv(vocabulary)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    model.to(device)
    train(model)
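# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the repository): the per-sample CER term
# accumulated above is typically the character-level Levenshtein (edit)
# distance between the decoded transcript and the reference, normalized by
# the reference length. How decoder.cer is actually implemented is not shown
# here; the helper below is only an assumption for reference.
def _levenshtein_chars(hyp, ref):
    # Classic dynamic-programming edit distance over characters.
    prev = list(range(len(ref) + 1))
    for i, h in enumerate(hyp, 1):
        cur = [i]
        for j, r in enumerate(ref, 1):
            cur.append(min(prev[j] + 1,                # deletion
                           cur[j - 1] + 1,             # insertion
                           prev[j - 1] + (h != r)))    # substitution
        prev = cur
    return prev[-1]
# Example: _levenshtein_chars("打开欢呼", "打开呼叫") == 2, so the normalized
# CER contribution for that pair would be 2 / 4 = 0.5.
# ---------------------------------------------------------------------------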
import torch
import feature
from models.conv import GatedConv
import torch.nn.functional as F
from ctcdecode import CTCBeamDecoder
from config import lm_path, pretrained_model_path

# Beam-search decoding hyperparameters.
alpha = 0.8          # language model weight
beta = 0.3           # word insertion bonus
cutoff_top_n = 40
cutoff_prob = 1.0
beam_width = 32
num_processes = 4
blank_index = 0

model = GatedConv.load(pretrained_model_path)
model.eval()

decoder = CTCBeamDecoder(
    model.vocabulary,
    lm_path,
    alpha,
    beta,
    cutoff_top_n,
    cutoff_prob,
    beam_width,
    num_processes,
    blank_index,
)
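# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original file): with the
# ctcdecode package, decoder.decode expects probabilities shaped
# (batch, time, vocab) and returns index tensors rather than strings. The
# random `probs` tensor below is only a stand-in for the model's softmax
# output; the function shows how the best beam is turned back into text.
def _demo_decode():
    batch, time_steps, vocab_size = 1, 50, len(model.vocabulary)
    probs = torch.randn(batch, time_steps, vocab_size).softmax(dim=2)
    lengths = torch.full((batch,), time_steps, dtype=torch.int)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(probs, lengths)
    best = beam_results[0][0][: out_lens[0][0]]        # top beam of utterance 0
    return "".join(model.vocabulary[i] for i in best)  # label indices -> characters
# Usage (illustration only): text = _demo_decode()
# ---------------------------------------------------------------------------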
import _init_path
import platform
from models.conv import GatedConv

use_lm = True
if use_lm:
    import beamdecode

system_type = platform.system()
if system_type == 'Windows':
    model = GatedConv.load("AboutDL\\语音识别MASR\\pretrained\\gated-conv.pth")
    # import scipy
    # _, receipt_data = scipy.io.wavfile.read("E:\\打开欢呼比.wav")
    # text = model.predict(receipt_data)  # passing the loaded samples turned out to give the same result
    text = model.predict("E:\\打开欢呼比.wav")
elif system_type == 'Linux':
    model = GatedConv.load('AboutDL/语音识别MASR/pretrained/gated-conv.pth')
    text = model.predict(
        "/media/yangjinming/DATA/Dataset/PrimeWords/d/d2/d25104a2-6be0-4950-9ec0-42e8e1303492.wav"
    )

print("Recognition result:", text)
print("decoding") with torch.no_grad(): for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)): x = x.to(device) outs, out_lens = model(x, x_lens) outs = F.softmax(outs, 1) outs = outs.transpose(1, 2) ys = [] offset = 0 for y_len in y_lens: ys.append(y[offset:offset + y_len]) offset += y_len out_strings, out_offsets = decoder.decode(outs, out_lens) y_strings = decoder.convert_to_strings(ys) for pred, truth in zip(out_strings, y_strings): trans, ref = pred[0], truth[0] cer += decoder.cer(trans, ref) / float(len(ref)) cer /= len(dataloader.dataset) model.train() return cer if __name__ == "__main__": model = GatedConv( json.load(open("../data_aishell/labels.json", encoding='utf-8'))) epoch = 40 model.load_state_dict(torch.load("pretrained/model_{}.pth".format(epoch))) print("reload model: pretrained/model_{}.pth".format(epoch)) model.to(device) train(model, start_epoch=epoch)
import _init_path
from models.conv import GatedConv
import json

# model = GatedConv.load("pretrained/gated-conv.pth")
# model.to_train()
# model.fit("train.manifest", "train.manifest")

with open("data_aishell/labels.json") as f:
    vocabulary = json.load(f)
vocabulary = "".join(vocabulary)

model = GatedConv(vocabulary)
model.to_train()
model.fit("/home/dolan/Desktop/masr/data_aishell/train.index",
          "/home/dolan/Desktop/masr/data_aishell/dev.index",
          "/home/dolan/Desktop/masr/data_aishell/labels.json",
          10)
import _init_path
from models.conv import GatedConv
import pre_transform_2 as pt2
import enhance_speach as es
# import beamdecode

model = GatedConv.load("pretrained/gated-conv.pth")
# model = GatedConv.load("pretrained/model_110.pth")

# with open("train_index", "w") as f:
#     for filename in os.listdir('data/'):
#         # print(filename)
#         s1 = filename[:8]
#         s2 = ""
#         for x in s1:
#             s2 += trans[x]
#         s1 = "data/" + s1 + ".wav"
#         f.write(s1 + "," + s2 + "\n")

# text = model.predict("test.wav")
# text = model.predict("12345_man.wav")
# text = model.predict("678910_man.wav")
# text = model.predict("862409_in.wav")
# text = model.predict("20164239_kuai.wav")
# text = model.predict("20164762_kuai.wav")
# text = model.predict("20164762_man.wav")
# text = model.predict("data/20166565.wav")
# text = model.predict("20164786.wav")

input_file_src = "record.wav"
# output_file_src = "record_out.wav"
# es.denoise(input_file_src, output_file_src)
# import _init_path
from models.conv import GatedConv

# model = GatedConv.load("pretrained/gated-conv.pth")
# model = GatedConv.load("pretrained/model_62.pth")
model = GatedConv.load("pretrained2/model_81.pth")

text = model.predict("./sample_audio/8_16.wav")

print("")
print("Recognition result:")
print(text)
import _init_path
from models.conv import GatedConv

model = GatedConv.load("语音识别MASR/pretrained/gated-conv.pth")
model.to_train()
model.fit("data/train.index", "data/dev.index", train_batch_size=2)
__mtime__ = '20210318'

import os
from models.conv import GatedConv
from config import pretrained_model_path

model = GatedConv.load(os.path.join('..', pretrained_model_path))

text = model.predict("../data_aishell/BAC009S0765W0130.wav")

print("")
print("Recognition result:")
print(text)
    cer = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to("cuda")
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset : offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    model.train()
    return cer


if __name__ == "__main__":
    with open("data_aishell/labels.json") as f:
        vocabulary = json.load(f)
    vocabulary = "".join(vocabulary)
    model = GatedConv(vocabulary)
    model.to("cuda")
    train(model)
import torch
import feature
from models.conv import GatedConv
import torch.nn.functional as F
from ctcdecode import CTCBeamDecoder

alpha = 0.8
beta = 0.3
lm_path = "/home/db/bing/yuyingshibie/masr/lm/zh_giga.no_cna_cmn.prune01244.klm"
cutoff_top_n = 40
cutoff_prob = 1.0
beam_width = 32
num_processes = 4
blank_index = 0

model = GatedConv.load(
    "/home/db/bing/yuyingshibie/masr/pretrained/gated-conv.pth")
model.eval()

decoder = CTCBeamDecoder(
    model.vocabulary,
    lm_path,
    alpha,
    beta,
    cutoff_top_n,
    cutoff_prob,
    beam_width,
    num_processes,
    blank_index,
)
import _init_path
from models.conv import GatedConv

# model = GatedConv.load("pretrained/gated-conv.pth")
model = GatedConv.load("pretrained/model_3.pth")

text = model.predict("./sample_audio/test.wav")

print("")
print("Recognition result:")
print(text)
parser.add_argument('--lm-alpha-from', default=0.0, type=float,
                    help='Language model weight start tuning')
parser.add_argument('--lm-alpha-to', default=3.0, type=float,
                    help='Language model weight end tuning')
parser.add_argument('--lm-beta-from', default=0.0, type=float,
                    help='Language model word bonus (all words) start tuning')
parser.add_argument('--lm-beta-to', default=0.5, type=float,
                    help='Language model word bonus (all words) end tuning')
parser.add_argument('--lm-num-alphas', default=45, type=float,
                    help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas', default=5, type=float,
                    help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)

model = GatedConv.load(args.model_path)
saved_output = np.load(args.saved_output, allow_pickle=True)


def init(beam_width, blank_index, lm_path):
    global decoder, ae_decoder
    decoder = BeamCTCDecoder(model.vocabulary, lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)
    ae_decoder = GreedyDecoder(model.vocabulary)


def decode_dataset(params):
    lm_alpha, lm_beta = params
    global decoder
    decoder._decoder.reset_params(lm_alpha, lm_beta)
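# ---------------------------------------------------------------------------
# Illustrative sketch (assumption; the remainder of this tuning script is not
# shown above): the --lm-alpha-* and --lm-beta-* arguments imply a grid of
# (alpha, beta) candidates, each scored with decode_dataset. A helper like
# the hypothetical one below is one way such a grid is typically built.
def _build_candidate_grid(a_from, a_to, n_alphas, b_from, b_to, n_betas):
    """Cartesian product of evenly spaced alpha and beta values."""
    import itertools
    import numpy
    return list(itertools.product(
        numpy.linspace(a_from, a_to, int(n_alphas)),
        numpy.linspace(b_from, b_to, int(n_betas)),
    ))
# e.g. _build_candidate_grid(args.lm_alpha_from, args.lm_alpha_to,
#                            args.lm_num_alphas, args.lm_beta_from,
#                            args.lm_beta_to, args.lm_num_betas)
# ---------------------------------------------------------------------------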