Example #1
@classmethod
def load_model_from_package(cls, package, args):
    # Rebuild encoder/decoder/CTC from the hyperparameters stored in
    # the checkpoint dict, then restore the trained weights.
    encoder = Encoder(package['einput'],
                      package['ehidden'],
                      package['elayer'],
                      package['eprojection'],
                      dropout=package['edropout'],
                      bidirectional=package['ebidirectional'],
                      rnn_type=package['etype'])
    decoder = Decoder(package['dvocab_size'],
                      package['dembed'],
                      package['dsos_id'],
                      package['deos_id'],
                      package['dhidden'],
                      package['dlayer'],
                      package['eprojection'],
                      package['dprojection'],
                      package['doffset'],
                      package['atype'],
                      package['edropout'],
                      package['lsm_weight'],
                      package['sampling_probability'],
                      package['peak_left'],
                      package['peak_right'],
                      bidirectional_encoder=package['ebidirectional'])
    ctc = CTC(package['dvocab_size'],
              eprojs=package['eprojection'],
              dropout_rate=package['edropout'])
    encoder.flatten_parameters()
    model = cls(encoder, decoder, ctc, args)
    model.load_state_dict(package['state_dict'])
    LFR_m, LFR_n = package['LFR_m'], package['LFR_n']
    return model, LFR_m, LFR_n
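A typical call site for this loader (a sketch: the checkpoint filename is a placeholder, and Seq2Seq as the owning class is inferred from Example #3):

import torch

# 'final.pth.tar' is a placeholder path; the package dict is whatever
# the matching save step wrote with torch.save.
package = torch.load('final.pth.tar', map_location='cpu')
model, LFR_m, LFR_n = Seq2Seq.load_model_from_package(package, args)
model.eval()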
Example #2
@classmethod
def load_model_from_package(cls, package, args):
    # Same pattern as Example #1, but the CTC projection size is
    # derived from the (bi)directional hidden size instead of being
    # stored in the checkpoint.
    encoder = Encoder(package['einput'],
                      package['ehidden'],
                      package['elayer'],
                      dropout=package['edropout'],
                      bidirectional=package['ebidirectional'],
                      rnn_type=package['etype'])
    decoder = Decoder(package['dvocab_size'],
                      package['dembed'],
                      package['dsos_id'],
                      package['deos_id'],
                      package['dhidden'],
                      package['offset'],
                      package['dlayer'],
                      bidirectional_encoder=package['ebidirectional'])
    ctc = CTC(package['dvocab_size'],
              eprojs=package['ehidden'] * 2 if package['ebidirectional']
                     else package['ehidden'],
              dropout_rate=package['edropout'])
    encoder.flatten_parameters()
    model = cls(encoder, decoder, ctc, args)
    model.load_state_dict(package['state_dict'])
    return model
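Both loaders assume a checkpoint dict written by a matching save step. A hypothetical counterpart, with the key set taken from the loader above (the model.encoder.* attribute names are assumptions about the Encoder/Decoder classes, not read from the snippet):

import torch

def serialize_package(model, path):
    # Hypothetical save-side helper: store the weights together with
    # the hyperparameters needed to rebuild the model.
    enc, dec = model.encoder, model.decoder
    package = {
        # encoder hyperparameters (attribute names assumed)
        'einput': enc.input_size, 'ehidden': enc.hidden_size,
        'elayer': enc.num_layers, 'edropout': enc.dropout,
        'ebidirectional': enc.bidirectional, 'etype': enc.rnn_type,
        # decoder hyperparameters (attribute names assumed)
        'dvocab_size': dec.vocab_size, 'dembed': dec.embedding_dim,
        'dsos_id': dec.sos_id, 'deos_id': dec.eos_id,
        'dhidden': dec.hidden_size, 'dlayer': dec.num_layers,
        'offset': dec.offset,
        # trained weights
        'state_dict': model.state_dict(),
    }
    torch.save(package, path)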
Example #3
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput * args.LFR_m,
                      args.ehidden,
                      args.elayer,
                      vocab_size,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size,
                      args.dembed,
                      sos_id,
                      eos_id,
                      args.dhidden,
                      args.dlayer,
                      args.offset,
                      args.atype,
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    if args.ebidirectional:
        eprojs = args.ehidden * 2
    else:
        eprojs = args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)

    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return

    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
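For completeness, a sketch of the flag set main consumes, reconstructed from the args.* attributes used above (types and defaults are assumptions; the remaining encoder/decoder flags follow the same pattern):

import argparse

parser = argparse.ArgumentParser(description='LAS + CTC hybrid training')
# data
parser.add_argument('--train-json', required=True)
parser.add_argument('--valid-json', required=True)
parser.add_argument('--dict', required=True)
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--num-workers', type=int, default=4)
parser.add_argument('--LFR_m', type=int, default=1)   # low frame rate: stack m frames
parser.add_argument('--LFR_n', type=int, default=1)   # low frame rate: skip n frames
# optimizer
parser.add_argument('--optimizer', choices=['sgd', 'adam'], default='adam')
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--momentum', type=float, default=0.0)
parser.add_argument('--l2', type=float, default=0.0)
# ... --maxlen-in, --maxlen-out, --align-trun, --einput, --ehidden,
#     --elayer, --dembed, --dhidden, --dlayer, --offset, --atype,
#     --edropout, etc. follow the same pattern

main(parser.parse_args())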
Example #4
layers = [  # list opener restored; any preceding layers are omitted in this snippet
    DeepBiRNN(hidden_size,
              init=glorot,
              activation=Rectlinclip(),
              batch_norm=True,
              reset_cells=True,
              depth=depth),
    Affine(hidden_size, init=glorot, activation=Rectlinclip()),
    Affine(nout=nout, init=glorot, activation=Identity())
]

model = Model(layers=layers)

opt = GradientDescentMomentumNesterov(learning_rate,
                                      momentum,
                                      gradient_clip_norm=gradient_clip_norm,
                                      stochastic_round=False)
callbacks = Callbacks(model, eval_set=dev, **args.callback_args)

# Print validation set word error rate at the end of every epoch
pcb = WordErrorRateCallback(dev, argmax_decoder, max_tscrpt_len, epoch_freq=1)
callbacks.add_callback(pcb)

cost = GeneralizedCost(costfunc=CTC(max_tscrpt_len, nout=nout))

# Fit the model
model.fit(train,
          optimizer=opt,
          num_epochs=args.epochs,
          cost=cost,
          callbacks=callbacks)
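Rectlinclip above is a clipped ReLU, the activation Deep Speech-style stacks use to keep activations bounded. A minimal NumPy sketch of the function itself; the 20.0 cutoff is an assumption (neon exposes it as a parameter):

import numpy as np

def rectlin_clip(x, cutoff=20.0):
    # Clipped rectifier: min(max(x, 0), cutoff).
    # cutoff=20.0 is an assumed default, not read from the snippet.
    return np.minimum(np.maximum(x, 0.0), cutoff)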
Example #5
rec_to_o = Linear(name='rec_to_o',
                  input_dim=rec_dim,
                  output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rnn_out)
# y_hat_pre : T x B x C+1

# y_hat : T x B x C+1
y_hat = tensor.nnet.softmax(
    y_hat_pre.reshape((-1, num_output_classes + 1))
).reshape((y_hat_pre.shape[0], y_hat_pre.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = input_mask

# Cost
cost = CTC().apply_log_domain(y, y_hat, y_len, y_hat_mask).mean()
cost.name = 'CTC'

dl, dl_length = CTC().best_path_decoding(y_hat, y_hat_mask)

edit_distances = batch_edit_distance(dl.T.astype('int32'), dl_length,
                                     y.T.astype('int32'), y_len.astype('int32'))
edit_distance = edit_distances.mean()
edit_distance.name = 'edit_distance'
errors_per_char = (edit_distances / y_len).mean()
errors_per_char.name = 'errors_per_char'

L = y.shape[0]
B = y.shape[1]
dl = dl[:L, :]
is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:, None], y_len[None, :])
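best_path_decoding above presumably implements greedy (best-path) CTC decoding: take the argmax class at every frame, collapse consecutive repeats, and drop blanks. A NumPy sketch of that idea, assuming the blank symbol sits at one fixed index of the C+1 classes:

import numpy as np

def best_path_decode(y_hat, y_hat_mask, blank):
    # y_hat: T x B x C+1 posteriors; y_hat_mask: T x B (1 = real frame).
    paths = y_hat.argmax(axis=2)          # framewise labels, T x B
    decoded, lengths = [], []
    for b in range(y_hat.shape[1]):
        out, prev = [], blank
        for t in range(y_hat.shape[0]):
            if not y_hat_mask[t, b]:
                continue                  # skip padded frames
            c = paths[t, b]
            if c != blank and c != prev:  # collapse repeats, drop blanks
                out.append(c)
            prev = c
        decoded.append(out)
        lengths.append(len(out))
    return decoded, np.asarray(lengths)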
Example #6
if normalize_out:
    # Standardize the pre-softmax activations per output class
    # (mean over batch and time; the sqrt turns the variance into a
    # standard deviation, so the name reflects what is actually used).
    y_hat_pre_mean = y_hat_pre.mean(axis=1).mean(axis=0)
    y_hat_pre_std = (
        ((y_hat_pre - y_hat_pre_mean[None, None, :]) ** 2)
        .mean(axis=1).mean(axis=0).sqrt()
    )
    y_hat_pre = (y_hat_pre - y_hat_pre_mean[None, None, :]) / y_hat_pre_std[None, None, :]

# y_hat : T x B x C+1
y_hat = tensor.nnet.softmax(y_hat_pre.reshape((-1, num_output_classes + 1))).reshape(
    (y_hat_pre.shape[0], y_hat_pre.shape[1], -1)
)
y_hat.name = "y_hat"

y_hat_mask = rec_mask

#       CTC COST AND ERROR MEASURE
cost = CTC().apply_log_domain(y, y_hat, y_len, y_hat_mask).mean()
cost.name = "CTC"

L = y.shape[0]  # max target length, defined as in Example #5
dl, dl_length = CTC().best_path_decoding(y_hat, y_hat_mask)
dl = dl[:L, :]
dl_length = tensor.minimum(dl_length, L)

edit_distances = batch_edit_distance(
    dl.T.astype("int32"), dl_length.astype("int32"), y.T.astype("int32"), y_len.astype("int32")
)
edit_distance = edit_distances.mean()
edit_distance.name = "edit_distance"
errors_per_char = (edit_distances / y_len).mean()
errors_per_char.name = "errors_per_char"

is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:, None], y_len[None, :])
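batch_edit_distance is not shown here; it presumably applies the standard Levenshtein recurrence to every (prediction, target) pair, which is what both edit_distance and errors_per_char are built on. A single-pair reference implementation:

def edit_distance(ref, hyp):
    # Classic one-row Levenshtein DP; dp[j] = distance(ref[:i], hyp[:j]).
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,                    # deletion
                        dp[j - 1] + 1,                # insertion
                        prev + (0 if r == h else 1))  # substitution
            prev = cur
    return dp[-1]

# edit_distance("abc", "axc") -> 1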