seq2seq = None
optimizer = None
if not opt.resume:
    # Initialize model
    hidden_size = 128
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()

    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    # The optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and passing them to the trainer:
    #
    # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    # scheduler = StepLR(optimizer.optimizer, 1)
    # optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss, batch_size=32,
                      checkpoint_every=50, print_every=10,
                      expt_dir=opt.expt_dir)

seq2seq = t.train(seq2seq, train,
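The snippet above breaks off inside the `t.train(...)` call. A hedged completion is sketched below, mirroring the full trainer invocation used in the `offline_training` snippet further down; the epoch count and teacher-forcing ratio are illustrative placeholders, and a `dev` dataset loaded the same way as `train` is assumed.

# Hedged completion of the truncated call above. Keyword names follow the
# trainer call in the offline_training snippet below; num_epochs and
# teacher_forcing_ratio are illustrative, not values from the original.
seq2seq = t.train(seq2seq, train,
                  num_epochs=6, dev_data=dev,
                  optimizer=optimizer,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)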
                     rnn_cell=params['rnn_cell'])
decoder = DecoderRNN(len(tgt.vocab), max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
                     rnn_cell=params['rnn_cell'], n_layers=params['n_layers'],
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# The optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)

logging.info(seq2seq)

# train
t = SupervisedTrainer(loss=loss, batch_size=params['batch_size'],
optimizer = None
if not opt.resume:
    # Initialize model
    hidden_size = 128
    bidirectional = opt.bidirectional
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         dropout_p=0.25, input_dropout_p=0.25,
                         bidirectional=bidirectional, n_layers=2,
                         variable_lengths=True, vocab=input_vocab)
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, input_dropout_p=0.25,
                         use_attention=opt.use_attn, bidirectional=bidirectional,
                         n_layers=2, eos_id=tgt.eos_id, sos_id=tgt.sos_id,
                         attn_mode=opt.attn_mode)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()

    for param in seq2seq.parameters():
        param.data.uniform_(-0.1, 0.1)

    # The optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and passing them to the trainer.
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters(), lr=0.001), max_grad_norm=5)
    # scheduler = StepLR(optimizer.optimizer, 1)
    scheduler = ReduceLROnPlateau(optimizer.optimizer, 'min', factor=0.1, verbose=True, patience=9)
    optimizer.set_scheduler(scheduler)

    expt_dir = opt.expt_dir + '_hidden_{}'.format(hidden_size)

# train
t = SupervisedTrainer(loss=loss, batch_size=64,
                      checkpoint_every=1800, print_every=100,
                      expt_dir=expt_dir,
                      input_vocab=input_vocab, output_vocab=output_vocab)
                     att_mlp=args.att_mlp, att_type=args.att_type)
# decoder3 = DecoderRNN(args.decoder3_n_layer, args.vocab_size, max_len,
#                       hidden_size * 2 if bidirectional else hidden_size,
#                       dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
#                       eos_id=eos_id, sos_id=sos_id)
# seq2seq = Seq2seq(args, decoder1, decoder2, decoder3)
seq2seq = Seq2seq(args, decoder1, decoder2)
seq2seq.cuda()
seq2seq = torch.nn.DataParallel(seq2seq)
cudnn.benchmark = True

print('Initialize model parameter ...')
if args.init == 'uniform':
    print('uniform init !')
    for param in seq2seq.parameters():
        param.data.uniform_(-args.init_weight, args.init_weight)
elif args.init == 'mos':
    print('mos init !')
    for m in seq2seq.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    # torch.nn.init.xavier_uniform_(param.data)
                    torch.nn.init.uniform_(param.data, -0.1, 0.1)
                elif 'weight_hh' in name:
                    # torch.nn.init.orthogonal_(param.data)
                    torch.nn.init.uniform_(param.data, -0.1, 0.1)
                elif 'bias' in name:
                    param.data.fill_(0)
else:
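The commented-out calls in the 'mos' branch above point at a different scheme: Xavier-uniform for input-to-hidden weights and orthogonal for hidden-to-hidden weights. A minimal sketch of that variant is given below, using only the `torch.nn.init` functions already referenced in the snippet.

# Hedged sketch of the commented-out alternative: Xavier init for input
# weights, orthogonal init for recurrent weights, zero biases.
for m in seq2seq.modules():
    if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)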
def offline_training(opt, target_file_path):
    # Prepare dataset with torchtext
    src = SourceField(tokenize=treebank_tokenizer)
    tgt = TargetField(tokenize=treebank_tokenizer)

    def sample_filter(sample):
        """Placeholder example filter for future use; currently keeps every sample."""
        return True

    train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=sample_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=sample_filter)
    # Note: the test split is read from the dev path as well.
    test = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                         fields=[('src', src), ('tgt', tgt)],
                                         filter_pred=sample_filter)
    src.build_vocab(train, max_size=opt.src_vocab_size)
    tgt.build_vocab(train, max_size=opt.tgt_vocab_size)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference:
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    if opt.loss == 'perplexity':
        loss = Perplexity(weight, pad)
    else:
        raise TypeError

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        encoder = EncoderRNN(vocab_size=len(src.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             n_layers=opt.n_layers,
                             bidirectional=opt.bidirectional,
                             rnn_cell=opt.rnn_cell,
                             variable_lengths=True,
                             embedding=input_vocab.vectors if opt.use_pre_trained_embedding else None,
                             update_embedding=opt.update_embedding)
        decoder = DecoderRNN(vocab_size=len(tgt.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size * 2 if opt.bidirectional else opt.hidden_size,
                             sos_id=tgt.sos_id,
                             eos_id=tgt.eos_id,
                             n_layers=opt.n_layers,
                             rnn_cell=opt.rnn_cell,
                             bidirectional=opt.bidirectional,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             use_attention=opt.use_attention)
        seq2seq = Seq2seq(encoder=encoder, decoder=decoder)
        if opt.gpu >= 0 and torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

    # train
    trainer = SupervisedTrainer(loss=loss, batch_size=opt.batch_size,
                                checkpoint_every=opt.checkpoint_every,
                                print_every=opt.print_every,
                                expt_dir=opt.expt_dir)
    seq2seq = trainer.train(model=seq2seq, data=train,
                            num_epochs=opt.epochs, resume=opt.resume,
                            dev_data=dev, optimizer=optimizer,
                            teacher_forcing_ratio=opt.teacher_forcing_rate)
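Once training finishes, the natural follow-up is running the model on new input. A minimal sketch is shown below; it assumes `offline_training` is changed to return the trained model together with the vocabularies, and that the pytorch-seq2seq `Predictor` class takes `(model, src_vocab, tgt_vocab)` and exposes `predict()` over a list of source tokens. Verify both against the installed library version.

# Hedged inference sketch under the assumptions stated above.
from seq2seq.evaluator import Predictor

model, input_vocab, output_vocab = offline_training(opt, None)
predictor = Predictor(model, input_vocab, output_vocab)
# predict() is assumed to take a list of source tokens and return the
# decoded target tokens (typically ending with the <eos> marker).
print(predictor.predict("how are you ?".split()))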
                     variable_lengths=True, vectors=vocab.vectors)
decoder = DecoderRNN(len(tgt.vocab), max_len,
                     (hidden_size * 2) if bidirectional else hidden_size,
                     n_layers=num_layers, dropout_p=0.2, use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    # param.data.uniform_(-0.08, 0.08)
    param.data.normal_(0.0, 0.1)

# The optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
#
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
optimizer = Optimizer(torch.optim.SGD(seq2seq.parameters(), lr=0.05, momentum=0.9), max_grad_norm=5)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)

# train