# Imports needed by this excerpt, assuming the pytorch-seq2seq package layout.
import torch
from torch.optim.lr_scheduler import StepLR

from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq
from seq2seq.optim import Optimizer

decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=max_len,
                     hidden_size=hidden_size * (2 if bidirectional else 1),
                     dropout_p=opt.dropout, use_attention=True,
                     bidirectional=bidirectional, n_layers=1, rnn_cell='gru',
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)

# Initialize all parameters uniformly in [-0.08, 0.08].
for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)
    print(param.data[0:3])

# Load pre-trained word vectors into the encoder embedding, then rescale
# them so their norm matches that of the uniform initialization.
_, _, norm_val = encoder.vectors_stats()
encoder.init_vectors(src.vocab.vectors)
# encoder.scale_vectors(0.08)
encoder.normalize_vectors(norm_val)
encoder.vectors_stats()

for param in seq2seq.parameters():
    print(param.data[0:3])

if torch.cuda.is_available():
    seq2seq.cuda()

# The optimizer and learning rate scheduler can be customized by explicitly
# constructing the objects and passing them to the trainer.
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters(), lr=0.001),
                      max_grad_norm=5)
# optimizer = Optimizer(torch.optim.SGD(seq2seq.parameters(), lr=0.01, momentum=0.9),
#                       max_grad_norm=5)
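# For reference, the pre-trained vectors consumed by init_vectors() above must
# be attached to the source vocabulary beforehand. A minimal sketch, assuming
# the legacy torchtext Field API and GloVe 300-d vectors (matching the
# hidden_size = 300 used below); the dataset name `train`, the vocabulary cap,
# and the vector choice are illustrative assumptions, not the author's setup:
#
#   src.build_vocab(train, max_size=50000, vectors='glove.6B.300d')
#   tgt.build_vocab(train, max_size=50000)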
if not opt.resume:
    # Initialize model
    # hidden_size = 128
    hidden_size = 300
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * (2 if bidirectional else 1),
                         dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)

    # Initialize all parameters uniformly in [-0.08, 0.08].
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
        print(param.data)

    encoder.vectors_stats()
    # encoder.init_vectors(src.vocab.vectors)
    # for param in seq2seq.parameters():
    #     print(param.data)

    if torch.cuda.is_available():
        seq2seq.cuda()

    # The optimizer and learning rate scheduler can be customized by explicitly
    # constructing the objects and passing them to the trainer.
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()),
                          max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, step_size=10, gamma=0.5)
    optimizer.set_scheduler(scheduler)

# train
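# A minimal sketch of the training call that follows, assuming pytorch-seq2seq's
# SupervisedTrainer; `loss` (e.g. a Perplexity loss), `train`, and `dev` are
# assumed to be defined earlier in the script, and the batch size, epoch count,
# and teacher forcing ratio below are illustrative, not the author's settings.
from seq2seq.trainer import SupervisedTrainer

t = SupervisedTrainer(loss=loss, batch_size=32, checkpoint_every=50,
                      print_every=100, expt_dir=opt.expt_dir)
seq2seq = t.train(seq2seq, train, num_epochs=6, dev_data=dev,
                  optimizer=optimizer, teacher_forcing_ratio=0.5,
                  resume=opt.resume)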