def load_model_from_package(cls, package, args):
    """Rebuild a serialized model from a checkpoint ``package`` dict.

    Reconstructs the Encoder/Decoder/CTC sub-modules from the hyper-parameters
    stored in the package, restores the weights, and returns the model together
    with the low-frame-rate stacking settings it was trained with.

    Returns:
        (model, LFR_m, LFR_n) tuple.
    """
    encoder = Encoder(
        package['einput'],
        package['ehidden'],
        package['elayer'],
        package['eprojection'],
        dropout=package['edropout'],
        bidirectional=package['ebidirectional'],
        rnn_type=package['etype'],
    )
    decoder = Decoder(
        package['dvocab_size'],
        package['dembed'],
        package['dsos_id'],
        package['deos_id'],
        package['dhidden'],
        package['dlayer'],
        package['eprojection'],
        package['dprojection'],
        package['doffset'],
        package['atype'],
        package['edropout'],
        package['lsm_weight'],
        package['sampling_probability'],
        package['peak_left'],
        package['peak_right'],
        bidirectional_encoder=package['ebidirectional'],
    )
    ctc = CTC(
        package['dvocab_size'],
        eprojs=package['eprojection'],
        dropout_rate=package['edropout'],
    )
    # Compact the freshly built RNN weights before loading the state dict.
    encoder.flatten_parameters()
    model = cls(encoder, decoder, ctc, args)
    model.load_state_dict(package['state_dict'])
    return model, package['LFR_m'], package['LFR_n']
def load_model_from_package(cls, package, args):
    """Rebuild a serialized model from a checkpoint ``package`` dict.

    Reconstructs the Encoder/Decoder/CTC sub-modules from the stored
    hyper-parameters and restores the trained weights.

    Returns:
        The reconstructed model.
    """
    encoder = Encoder(
        package['einput'],
        package['ehidden'],
        package['elayer'],
        dropout=package['edropout'],
        bidirectional=package['ebidirectional'],
        rnn_type=package['etype'],
    )
    # BUG FIX: the original was missing the comma between the
    # package['offset'] and package['dlayer'] arguments, which made this
    # function a SyntaxError.
    # NOTE(review): the other loaders in this file pass the layer count
    # before the offset — confirm this positional order against
    # Decoder.__init__.
    decoder = Decoder(
        package['dvocab_size'],
        package['dembed'],
        package['dsos_id'],
        package['deos_id'],
        package['dhidden'],
        package['offset'],
        package['dlayer'],
        bidirectional_encoder=package['ebidirectional'],
    )
    # Encoder output dimension doubles when the encoder is bidirectional.
    eprojs = package['ehidden'] * 2 if package['ebidirectional'] else package['ehidden']
    ctc = CTC(
        package['dvocab_size'],
        eprojs=eprojs,
        dropout_rate=package['edropout'],
    )
    # Compact the freshly built RNN weights before loading the state dict.
    encoder.flatten_parameters()
    model = cls(encoder, decoder, ctc, args)
    model.load_state_dict(package['state_dict'])
    return model
def main(args):
    """Construct datasets, model, and optimizer, then run the training Solver.

    Args:
        args: parsed command-line namespace with data paths, model
            hyper-parameters (einput/ehidden/..., dembed/dhidden/...),
            and optimizer settings (optimizer, lr, momentum, l2).
    """
    # Data: batching is handled inside AudioDataset, so the loaders use
    # batch_size=1 (each "item" is already a full minibatch).
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # Load the dictionary and derive the char list plus special-symbol ids.
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # Model: input feature dimension grows with the LFR stacking factor.
    encoder = Encoder(args.einput * args.LFR_m, args.ehidden, args.elayer,
                      vocab_size,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size, args.dembed, sos_id, eos_id,
                      args.dhidden, args.dlayer, args.offset, args.atype,
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    # Encoder output dimension doubles when the encoder is bidirectional.
    eprojs = args.ehidden * 2 if args.ebidirectional else args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)
    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()
    # Optimizer selection; unknown names abort without training.
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return
    # Solver drives the train/validate loop.
    solver = Solver(data, model, optimizer, args)
    solver.train()
DeepBiRNN(hidden_size, init=glorot, activation=Rectlinclip(), batch_norm=True, reset_cells=True, depth=depth), Affine(hidden_size, init=glorot, activation=Rectlinclip()), Affine(nout=nout, init=glorot, activation=Identity()) ] model = Model(layers=layers) opt = GradientDescentMomentumNesterov(learning_rate, momentum, gradient_clip_norm=gradient_clip_norm, stochastic_round=False) callbacks = Callbacks(model, eval_set=dev, **args.callback_args) # Print validation set word error rate at the end of every epoch pcb = WordErrorRateCallback(dev, argmax_decoder, max_tscrpt_len, epoch_freq=1) callbacks.add_callback(pcb) cost = GeneralizedCost(costfunc=CTC(max_tscrpt_len, nout=nout)) # Fit the model model.fit(train, optimizer=opt, num_epochs=args.epochs, cost=cost, callbacks=callbacks)
# Project the recurrent states onto the output classes (+1 for the CTC blank),
# then build the CTC cost and edit-distance error measures.
rec_to_o = Linear(name='rec_to_o',
                  input_dim=rec_dim,
                  output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rnn_out)
# y_hat_pre : T x B x C+1  (pre-softmax scores)

# y_hat : T x B x C+1 — softmax over the class axis; flatten to 2-D because
# tensor.nnet.softmax only operates on matrices, then restore the T x B shape.
y_hat = tensor.nnet.softmax(
    y_hat_pre.reshape((-1, num_output_classes + 1))
).reshape((y_hat_pre.shape[0], y_hat_pre.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = input_mask

# Cost: mean per-sequence CTC negative log-likelihood (computed in log domain).
cost = CTC().apply_log_domain(y, y_hat, y_len, y_hat_mask).mean()
cost.name = 'CTC'

# Greedy (best-path) decoding plus edit-distance diagnostics.
dl, dl_length = CTC().best_path_decoding(y_hat, y_hat_mask)

edit_distances = batch_edit_distance(dl.T.astype('int32'), dl_length,
                                     y.T.astype('int32'),
                                     y_len.astype('int32'))
edit_distance = edit_distances.mean()
edit_distance.name = 'edit_distance'
# Mean edit distance normalized by target length.
errors_per_char = (edit_distances / y_len).mean()
errors_per_char.name = 'errors_per_char'

L = y.shape[0]
B = y.shape[1]
# Truncate decoded sequences to the target length for position-wise comparison.
# NOTE(review): here dl is truncated only AFTER the edit-distance computation,
# and dl_length is not clamped to L — the other variant in this file does both
# before computing distances; confirm which ordering is intended.
dl = dl[:L, :]
# Position-wise error indicator, masked to valid target positions only.
is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:, None], y_len[None, :])
# Optionally standardize the pre-softmax outputs (zero mean, unit variance per
# class, statistics taken over the time and batch axes).
if normalize_out:
    y_hat_pre_mean = y_hat_pre.mean(axis=1).mean(axis=0)
    y_hat_pre_var = ((y_hat_pre - y_hat_pre_mean[None, None, :]) ** 2).mean(axis=1).mean(axis=0).sqrt()
    y_hat_pre = (y_hat_pre - y_hat_pre_mean[None, None, :]) / y_hat_pre_var[None, None, :]

# y_hat : T x B x C+1 — softmax over the class axis; flatten to 2-D because
# tensor.nnet.softmax only operates on matrices, then restore the T x B shape.
y_hat = tensor.nnet.softmax(y_hat_pre.reshape((-1, num_output_classes + 1))).reshape(
    (y_hat_pre.shape[0], y_hat_pre.shape[1], -1)
)
y_hat.name = "y_hat"

y_hat_mask = rec_mask

# CTC COST AND ERROR MEASURE
# Mean per-sequence CTC negative log-likelihood (computed in log domain).
cost = CTC().apply_log_domain(y, y_hat, y_len, y_hat_mask).mean()
cost.name = "CTC"

# Greedy (best-path) decoding; truncate/clamp to the target length L
# (L is defined earlier in the enclosing script) before scoring.
dl, dl_length = CTC().best_path_decoding(y_hat, y_hat_mask)
dl = dl[:L, :]
dl_length = tensor.minimum(dl_length, L)

edit_distances = batch_edit_distance(
    dl.T.astype("int32"), dl_length.astype("int32"), y.T.astype("int32"), y_len.astype("int32")
)
edit_distance = edit_distances.mean()
edit_distance.name = "edit_distance"
# Mean edit distance normalized by target length.
errors_per_char = (edit_distances / y_len).mean()
errors_per_char.name = "errors_per_char"

# Position-wise error indicator, masked to valid target positions only.
is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:, None], y_len[None, :])