Example #1
def recognize(args):
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    model = Seq2Seq.load_model(args.model_path, args)
    print(model)
    model.eval()
    model.cuda()
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
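
Every example in this listing hinges on process_dict(args.dict) returning (char_list, sos_id, eos_id). The helper itself never appears here; as a reference point, this is a minimal sketch assuming a Kaldi/ESPnet-style dictionary file with one "<token> <id>" pair per line and explicit '<sos>'/'<eos>' entries (the real helper may differ, e.g. by inserting '<blank>' at index 0):

def process_dict(dict_path):
    # Each line is assumed to look like "<token> <id>"; only the token
    # column is kept, and its position in the file defines its id.
    with open(dict_path, 'r', encoding='utf-8') as f:
        dictionary = f.readlines()
    char_list = [entry.split(' ')[0] for entry in dictionary]
    # sos/eos ids are looked up by token, which is consistent with the
    # "model.decoder.sos_id == sos_id" asserts in the examples below.
    sos_id = char_list.index('<sos>')
    eos_id = char_list.index('<eos>')
    return char_list, sos_id, eos_id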
Example #2
def recognize(args):
    # model
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)
    model.load_state_dict(flow.load(args.model_path))
    device = flow.device("cuda")
    model.eval()
    model.to(device)
    LFR_m = args.LFR_m
    LFR_n = args.LFR_n
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    # decode each utterance
    new_js = {}
    with flow.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print("(%d/%d) decoding %s" % (idx, len(js.keys()), name), flush=True)
            input = kaldi_io.read_mat(js[name]["input"][0]["feat"])
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = flow.tensor(input).to(dtype=flow.float32)
            input_length = flow.tensor([input.size(0)], dtype=flow.int64)
            input = input.to(device)
            input_length = input_length.to(device)
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, "wb") as f:
        f.write(json.dumps({"utts": new_js}, indent=4, sort_keys=True).encode("utf_8"))
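
Several of the decoding examples (#2, #3, #6, #11) first pass the TxD feature matrix through build_LFR_features. A minimal sketch of the usual low frame rate transform, stacking LFR_m consecutive frames and keeping one stacked frame every LFR_n, with the tail padded by repeating the last frame (the project's own implementation may handle edge cases differently):

import numpy as np

def build_LFR_features(inputs, m, n):
    # inputs: TxD feature matrix -> output: ceil(T/n) x (m*D)
    LFR_inputs = []
    T = inputs.shape[0]
    T_lfr = int(np.ceil(T / n))
    for i in range(T_lfr):
        if m <= T - i * n:
            LFR_inputs.append(np.hstack(inputs[i * n:i * n + m]))
        else:
            # tail window: pad by repeating the last frame
            num_padding = m - (T - i * n)
            frame = np.hstack(inputs[i * n:])
            for _ in range(num_padding):
                frame = np.hstack((frame, inputs[-1]))
            LFR_inputs.append(frame)
    return np.vstack(LFR_inputs)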
Example #3
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.to(device)
            input_length = input_length.to(device)
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
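
Examples #3, #6, and #11 call Transformer.load_model, which evidently restores the LFR settings along with the weights. A hedged sketch of such a loader, assuming the checkpoint is a dict bundling constructor kwargs, the LFR settings, and the state dict; all key names here are illustrative, not the repo's exact ones:

import torch

def load_model(path):
    # Hypothetical checkpoint layout:
    # {'encoder_kwargs', 'decoder_kwargs', 'LFR_m', 'LFR_n', 'state_dict'}
    package = torch.load(path, map_location='cpu')
    model = Transformer(Encoder(**package['encoder_kwargs']),
                        Decoder(**package['decoder_kwargs']))
    model.load_state_dict(package['state_dict'])
    return model, package['LFR_m'], package['LFR_n']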
Example #4
File: train.py  Project: JJoving/SMLAT
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput,
                      args.ehidden,
                      args.elayer,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size,
                      args.dembed,
                      sos_id,
                      eos_id,
                      args.dhidden,
                      args.dlayer,
                      bidirectional_encoder=args.ebidirectional)
    model = Seq2Seq(encoder, decoder)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer: %s" % args.optimizer)
        return

    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
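
main() reads its whole configuration off an argparse namespace. A sketch of the entry-point wiring covering exactly the attributes the function touches; all flag defaults are illustrative assumptions:

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Seq2Seq ASR training')
    # data
    parser.add_argument('--train-json', dest='train_json', required=True)
    parser.add_argument('--valid-json', dest='valid_json', required=True)
    parser.add_argument('--dict', required=True)
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=16)
    parser.add_argument('--maxlen-in', dest='maxlen_in', type=int, default=800)
    parser.add_argument('--maxlen-out', dest='maxlen_out', type=int, default=150)
    parser.add_argument('--num-workers', dest='num_workers', type=int, default=4)
    # encoder / decoder
    parser.add_argument('--einput', type=int, default=80)
    parser.add_argument('--ehidden', type=int, default=256)
    parser.add_argument('--elayer', type=int, default=3)
    parser.add_argument('--edropout', type=float, default=0.2)
    parser.add_argument('--ebidirectional', type=int, default=1)
    parser.add_argument('--etype', default='lstm')
    parser.add_argument('--dembed', type=int, default=512)
    parser.add_argument('--dhidden', type=int, default=512)
    parser.add_argument('--dlayer', type=int, default=1)
    # optimizer
    parser.add_argument('--optimizer', default='adam', choices=['sgd', 'adam'])
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--momentum', type=float, default=0.0)
    parser.add_argument('--l2', type=float, default=1e-5)
    main(parser.parse_args())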
Example #5
 def __init__(self, traincfg):
     dir_path = os.path.dirname(os.path.realpath(__file__))
     self.base_path = dir_path
     # recog_json and dict_txt start out as class-level relative paths
     self.recog_json = os.path.join(dir_path, self.recog_json)
     self.dict_txt = os.path.join(dir_path, self.dict_txt)
     # Construct the model; LFR settings come from the training config
     self.model = Transformer(traincfg.encoder, traincfg.decoder)
     self.LFR_m, self.LFR_n = traincfg.LFR_m, traincfg.LFR_n
     self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
     assert self.model.decoder.sos_id == self.sos_id and self.model.decoder.eos_id == self.eos_id
     # Read json data
     with open(self.recog_json, "rb") as f:
         self.js = json.load(f)['utts']
Example #6
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path, args)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'r', encoding='utf-8') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            },
                       indent=4,
                       sort_keys=True,
                       ensure_ascii=False).encode('utf_8'))
Example #7
train_x, train_desc, train_att = getTrain_Img_Desc_Att(
    '../images/train', '../word_c10/train', '../att_per_classes.npy')

iter_ = data_iterator(train_x, train_desc, train_att, device)

# test visual features
val_x, val_pre_proto, val_pre_att_proto, val_x2label, val_proto2label = getTest_Img_proto_labels(
    '../images/val', '../word_c10/val', '../att_per_classes.npy')

# NOTE: the vocab indices start at 1, with 1 being '<END>'
vocab = torchfile.load('../vocab_c10.t7', force_8bytes_long=True)
word2idx_vocab = {str(key, 'utf-8'): value for key, value in vocab.items()}
print('Vocabs Loaded: Total words in vocabulary including pad....{}'.format(
    len(word2idx_vocab)))
# Map each word in GloVe's vocab to its GloVe embedding vector (index 0 is reserved for '<PAD>')
glove_dict = process_dict('../glove.6B.50d.txt', dim=input_dim)

print('Loading embedding layer....')
embed_layer = getEmbed_layer(word2idx=word2idx_vocab,
                             glove_dict=glove_dict,
                             dim=input_dim).to(device)
print('Loaded Embedding!!')

# torch.autograd.Variable is deprecated; plain tensors with
# requires_grad=True behave the same. Note that torch.empty(), like
# torch.FloatTensor before it, returns *uninitialized* memory.
w1 = torch.empty(512, 700, device=device, requires_grad=True)
b1 = torch.empty(700, device=device, requires_grad=True)
w2 = torch.empty(700, 1024, device=device, requires_grad=True)
b2 = torch.empty(1024, device=device, requires_grad=True)
w3 = torch.empty(312, 700, device=device, requires_grad=True)
b3 = torch.empty(700, device=device, requires_grad=True)

# must initialize!
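
The warning above is real: the tensors hold whatever was in memory until they are initialized. One common way to satisfy it, shown as a sketch rather than the original project's actual scheme, is Xavier init for the weight matrices and zeros for the biases:

with torch.no_grad():
    for w in (w1, w2, w3):
        torch.nn.init.xavier_uniform_(w)  # Xavier/Glorot uniform init
    for b in (b1, b2, b3):
        b.zero_()  # biases start at zero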
Example #8
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json,
                              args.batch_size,
                              args.maxlen_in,
                              args.maxlen_out,
                              batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json,
                              args.batch_size,
                              args.maxlen_in,
                              args.maxlen_out,
                              batch_frames=args.batch_frames)
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.d_input * args.LFR_m,
                      args.n_layers_enc,
                      args.n_head,
                      args.d_k,
                      args.d_v,
                      args.d_model,
                      args.d_inner,
                      dropout=args.dropout,
                      pe_maxlen=args.pe_maxlen)
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)
    print(model)
    model.cuda()
    # wrap the model with DataParallel for multi-GPU training
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    # optimizer
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps)

    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
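
Examples #8, #12, and #13 all wrap Adam in a TransformerOptimizer parameterized by k, d_model, and warmup_steps, which matches the Noam warmup schedule from "Attention Is All You Need": lr = k * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). A minimal sketch under that assumption (the real class may track extra state, e.g. the step_num argument seen in Example #12):

class TransformerOptimizer:
    """Noam-style warmup wrapper around an inner optimizer (sketch)."""

    def __init__(self, optimizer, k, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.k = k
        self.init_lr = d_model ** (-0.5)
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def step(self):
        # update the learning rate, then delegate to the inner optimizer
        self.step_num += 1
        lr = self.k * self.init_lr * min(
            self.step_num ** (-0.5),
            self.step_num * self.warmup_steps ** (-1.5))
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()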
Example #9
    parser.add_argument('json', type=str, help='json files')
    parser.add_argument('dict', type=str, help='dict')
    parser.add_argument('ref', type=str, help='ref')
    parser.add_argument('hyp', type=str, help='hyp')
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")

    logging.info("reading %s", args.json)
    with open(args.json, 'r') as f:
        j = json.load(f)

    logging.info("reading %s", args.dict)
    char_list, sos_id, eos_id = process_dict(args.dict)

    logging.info("writing hyp trn to %s", args.hyp)
    logging.info("writing ref trn to %s", args.ref)
    h = open(args.hyp, 'w')
    r = open(args.ref, 'w')

    for x in j['utts']:
        seq = [char_list[int(i)] for i in j['utts'][x]
               ['output'][0]['rec_tokenid'].split()]
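
The snippet breaks off after mapping rec_tokenid back to characters. For orientation, trn files in the sclite convention carry one utterance per line as "tokens (utt_id)"; a hedged sketch of how such a loop typically finishes, reusing only json fields that appear elsewhere in this listing:

    # hedged sketch of the rest of the loop: write hyp and ref trn lines
    for x in j['utts']:
        out = j['utts'][x]['output'][0]
        hyp = ' '.join(char_list[int(i)] for i in out['rec_tokenid'].split())
        h.write(hyp.replace('<eos>', '').strip() + ' (' + x + ')\n')
        r.write(out['text'] + ' (' + x + ')\n')
    h.close()
    r.close()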
Example #10
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput * args.LFR_m,
                      args.ehidden,
                      args.elayer,
                      vocab_size,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size,
                      args.dembed,
                      sos_id,
                      eos_id,
                      args.dhidden,
                      args.dlayer,
                      args.offset,
                      args.atype,
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    if args.ebidirectional:
        eprojs = args.ehidden * 2
    else:
        eprojs = args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)

    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer: %s" % args.optimizer)
        return

    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
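
Example #10 is the only one that gives Seq2Seq a CTC branch, built as CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout). The module itself is not shown in this listing; a minimal sketch assuming it is a linear projection of the encoder outputs plus torch.nn.CTCLoss (the blank id, reduction, and forward signature are assumptions):

import torch.nn as nn
import torch.nn.functional as F

class CTC(nn.Module):
    """Sketch: encoder-side CTC head = dropout + linear + CTCLoss."""

    def __init__(self, odim, eprojs, dropout_rate=0.0):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.ctc_lo = nn.Linear(eprojs, odim)  # project to vocabulary
        self.loss_fn = nn.CTCLoss(blank=0, zero_infinity=True)

    def forward(self, hs_pad, hlens, ys_pad, ylens):
        # hs_pad: (B, T, eprojs) encoder outputs -> (T, B, odim) log-probs
        log_probs = F.log_softmax(self.ctc_lo(self.dropout(hs_pad)), dim=-1)
        return self.loss_fn(log_probs.transpose(0, 1), ys_pad, hlens, ylens)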
Example #11
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id
    test_dataset = AudioDataset('test', args.batch_size)
    path_list = test_dataset.path_lst
    label_list = test_dataset.han_lst
    num_data = test_dataset.path_count
    ran_num = random.randint(0, num_data - 1)

    num = args.count
    words_num = 0
    word_error_num = 0
    seq_error = 0
    data = ''
    with torch.no_grad():
        for index in range(num):
            try:
                print('\nthe ', index + 1, 'th example.')
                data += 'the ' + str(index + 1) + 'th example.\n'
                index = (ran_num + index) % num_data
                standard_label = label_list[index]
                feature, label = get_fbank_and_hanzi_data(
                    index, args.feature_dim, char_list, path_list, label_list)
                if len(feature) > 1600:
                    continue
                input = build_LFR_features(feature, args.LFR_m, args.LFR_n)
                input = torch.from_numpy(input).float()
                input_length = torch.tensor([input.size(0)], dtype=torch.int)
                input = input.cuda()
                nbest_hyps = model.recognize(input, input_length, char_list,
                                             args)
                pred_label = nbest_hyps[0]['yseq'][1:-1]
                pred_res = ''.join([char_list[i] for i in pred_label])
                print("stand:", standard_label)
                print("pred :", pred_res)
                data += "stand:" + str(standard_label) + '\n'
                data += "pred :" + str(pred_res) + '\n'
                words_n = len(label)
                words_num += words_n
                word_distance = GetEditDistance(pred_label, label)
                # cap each utterance's contribution at its reference length
                word_error_num += min(word_distance, words_n)

                if pred_label != label:
                    seq_error += 1
            except ValueError:
                continue
    # token-level and sentence-level error rates
    print('WER = ', word_error_num / words_num * 100, '%')
    print('SER = ', seq_error / args.count * 100, '%')
    data += 'WER = ' + str(word_error_num / words_num * 100) + '%\n'
    data += 'SER = ' + str(seq_error / args.count * 100) + '%\n'
    with open('../../model_log/pred/test_' + str(args.count) + '.txt',
              'w',
              encoding='utf-8') as f:
        f.write(data)
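
GetEditDistance above is presumably a Levenshtein (edit) distance over token sequences, since its value is compared against the reference length. A minimal dynamic-programming sketch consistent with that use (the project's helper may be implemented differently):

def GetEditDistance(pred, target):
    # classic O(len(pred) * len(target)) Levenshtein DP
    m, n = len(pred), len(target)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i  # delete everything
    for j in range(n + 1):
        dp[0][j] = j  # insert everything
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == target[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[m][n]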
Example #12
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(
        args.train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        batch_frames=args.batch_frames,
    )
    cv_dataset = AudioDataset(
        args.valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        batch_frames=args.batch_frames,
    )
    tr_loader = AudioDataLoader(
        tr_dataset,
        batch_size=1,
        num_workers=args.num_workers,
        shuffle=args.shuffle,
        LFR_m=args.LFR_m,
        LFR_n=args.LFR_n,
    )
    cv_loader = AudioDataLoader(
        cv_dataset,
        batch_size=1,
        num_workers=args.num_workers,
        LFR_m=args.LFR_m,
        LFR_n=args.LFR_n,
    )

    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {"tr_loader": tr_loader, "cv_loader": cv_loader}

    # model
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)

    device = flow.device("cuda")
    model.to(device)

    # optimizer
    optimizer = TransformerOptimizer(
        flow.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k,
        args.d_model,
        args.warmup_steps,
        args.step_num,
    )

    # solver
    solver = Solver(data, model, optimizer, device, args)
    solver.train()
Example #13
 def __init__(self):
     dir_path = os.path.dirname(os.path.realpath(__file__))
     self.train_json = os.path.join(dir_path, self.train_json)
     self.valid_json = os.path.join(dir_path, self.valid_json)
     self.dict_txt = os.path.join(dir_path, self.dict_txt)
     self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
     self.vocab_size = len(self.char_list)
     self.tr_dataset = AudioDataset(self.train_json,
                                    self.batch_size,
                                    self.maxlen_in,
                                    self.maxlen_out,
                                    batch_frames=self.batch_frames)
     self.cv_dataset = AudioDataset(self.valid_json,
                                    self.batch_size,
                                    self.maxlen_in,
                                    self.maxlen_out,
                                    batch_frames=self.batch_frames)
     self.tr_loader = AudioDataLoader(self.tr_dataset,
                                      batch_size=1,
                                      num_workers=self.num_workers,
                                      shuffle=self.shuffle,
                                      LFR_m=self.LFR_m,
                                      LFR_n=self.LFR_n)
     self.cv_loader = AudioDataLoader(self.cv_dataset,
                                      batch_size=1,
                                      num_workers=self.num_workers,
                                      LFR_m=self.LFR_m,
                                      LFR_n=self.LFR_n)
     self.data = {'tr_loader': self.tr_loader, 'cv_loader': self.cv_loader}
     self.encoder = Encoder(self.d_input * self.LFR_m,
                            self.n_layers_enc,
                            self.n_head,
                            self.d_k,
                            self.d_v,
                            self.d_model,
                            self.d_inner,
                            dropout=self.dropout,
                            pe_maxlen=self.pe_maxlen)
     self.decoder = Decoder(
         self.sos_id,
         self.eos_id,
         self.vocab_size,
         self.d_word_vec,
         self.n_layers_dec,
         self.n_head,
         self.d_k,
         self.d_v,
         self.d_model,
         self.d_inner,
         dropout=self.dropout,
         tgt_emb_prj_weight_sharing=self.tgt_emb_prj_weight_sharing,
         pe_maxlen=self.pe_maxlen)
     # per-epoch loss history buffers, filled in by the solver
     self.tr_loss = torch.Tensor(self.epochs)
     self.cv_loss = torch.Tensor(self.epochs)
     self.model = Transformer(self.encoder, self.decoder)
     self.optimizer = TransformerOptimizer(
         torch.optim.Adam(self.model.parameters(),
                          betas=(0.9, 0.98),
                          eps=1e-09), self.k, self.d_model,
         self.warmup_steps)
     self._reset()