def test_set_parameters(self):
    learning_rate = 1
    optim = Optimizer(torch.optim.SGD, lr=learning_rate)
    params = [torch.nn.Parameter(torch.randn(2, 3, 4))]
    optim.set_parameters(params)
    self.assertTrue(type(optim.optimizer) is torch.optim.SGD)
    self.assertEqual(optim.optimizer.param_groups[0]['lr'], learning_rate)
def test_update(self):
    params = [torch.nn.Parameter(torch.randn(2, 3, 4))]
    optimizer = Optimizer(torch.optim.Adam(params, lr=1), max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, 1, gamma=0.1)
    optimizer.set_scheduler(scheduler)
    optimizer.step()
    optimizer.update(10, 1)
    self.assertEqual(0.1, optimizer.optimizer.param_groups[0]['lr'])
def train(self, model, data, num_epochs=5, resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`,
            it is overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from the latest checkpoint (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, num_epochs, start_epoch, step,
                        dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
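# Usage sketch for the train() method above. This is an illustrative example,
# not taken from the source: it assumes the usual pytorch-seq2seq import paths
# and that `model`, `train_data`, and `dev_data` are already built as in the
# other snippets in this section.
import torch.optim as optim
from seq2seq.loss import NLLLoss
from seq2seq.optim import Optimizer
from seq2seq.trainer import SupervisedTrainer

trainer = SupervisedTrainer(loss=NLLLoss(), batch_size=32,
                            checkpoint_every=500, print_every=100,
                            expt_dir='./experiment')
# Passing a custom Optimizer is optional; omitting it falls back to
# Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) as documented above.
custom_optimizer = Optimizer(optim.Adam(model.parameters(), lr=1e-3),
                             max_grad_norm=5)
model = trainer.train(model, train_data, num_epochs=10, dev_data=dev_data,
                      optimizer=custom_optimizer, teacher_forcing_ratio=0.5,
                      resume=False)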
def train(self, model, data, num_epochs=5, resume=False, dev_data=None,
          monitor_data={}, optimizer=None, teacher_forcing_ratio=0,
          learning_rate=0.001, checkpoint_path=None, top_k=5):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`,
            it is overwritten by the model loaded from the given checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from a checkpoint (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None)
        optimizer (str, optional): name of the optimizer to use
            ('adam', 'adagrad', 'adadelta', 'adamax', 'rmsprop', 'sgd');
            defaults to Adam
        teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0)
        learning_rate (float, optional): learning rate used by the optimizer (default 0.001)
        checkpoint_path (str, optional): path to load a checkpoint from when
            training should be resumed
        top_k (int): how many models should be stored during training
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        resume_checkpoint = Checkpoint.load(checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0

    def get_optim(optim_name):
        optims = {'adam': optim.Adam, 'adagrad': optim.Adagrad,
                  'adadelta': optim.Adadelta, 'adamax': optim.Adamax,
                  'rmsprop': optim.RMSprop, 'sgd': optim.SGD,
                  None: optim.Adam}
        return optims[optim_name]

    self.optimizer = Optimizer(
        get_optim(optimizer)(model.parameters(), lr=learning_rate),
        max_grad_norm=5)

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    logs = self._train_epoches(data, model, num_epochs, start_epoch, step,
                               dev_data=dev_data, monitor_data=monitor_data,
                               teacher_forcing_ratio=teacher_forcing_ratio,
                               top_k=top_k)
    return model, logs
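# Hedged usage sketch for this variant of train(): here the optimizer is chosen
# by name and the call returns (model, logs). `trainer`, `model`, `train_data`
# and `dev_data` are placeholders assumed to be constructed as in the other
# snippets in this section.
model, logs = trainer.train(model, train_data,
                            num_epochs=10,
                            dev_data=dev_data,
                            monitor_data={'dev': dev_data},
                            optimizer='adam',
                            learning_rate=1e-3,
                            teacher_forcing_ratio=0.5,
                            top_k=3)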
def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64,
             random_seed=None, checkpoint_every=100, print_every=100,
             optimizer=Optimizer(optim.Adam, max_grad_norm=5)):
    self._trainer = "Simple Trainer"
    self.random_seed = random_seed
    if random_seed is not None:
        random.seed(random_seed)
        torch.manual_seed(random_seed)
    self.loss = loss
    self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size)
    self.optimizer = optimizer
    self.checkpoint_every = checkpoint_every
    self.print_every = print_every

    if not os.path.isabs(expt_dir):
        expt_dir = os.path.join(os.getcwd(), expt_dir)
    self.expt_dir = expt_dir
    if not os.path.exists(self.expt_dir):
        os.makedirs(self.expt_dir)
    self.batch_size = batch_size
    self.input_vocab_file = os.path.join(self.expt_dir, 'input_vocab')
    self.output_vocab_file = os.path.join(self.expt_dir, 'output_vocab')

    self.logger = logging.getLogger(__name__)
def test_init(self):
    params = [torch.nn.Parameter(torch.randn(2, 3, 4))]
    try:
        optimizer = Optimizer(torch.optim.Adam(params))
    except Exception:
        self.fail("__init__ failed.")
    self.assertEqual(optimizer.max_grad_norm, 0)
def initialize_model(train, input_vocab, output_vocab, max_len=10,
                     hidden_size=256, dropout_p=0.5, bidirectional=True,
                     n_beam=5):
    # Initialize model
    encoder = EncoderRNN(len(input_vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(output_vocab), max_len,
                         hidden_size * (2 if bidirectional else 1),
                         dropout_p=dropout_p, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=train.tgt_field.eos_id,
                         sos_id=train.tgt_field.sos_id)
    # decoder = TopKDecoder(decoder, n_beam)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq = seq2seq.cuda()

    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and passing them to the trainer
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, 1)
    optimizer.set_scheduler(scheduler)

    return seq2seq, optimizer, scheduler
def test_init(self):
    try:
        optimizer = Optimizer(torch.optim.SGD)
    except Exception:
        self.fail("__init__ failed.")
    self.assertEqual(optimizer.max_grad_norm, 0)
    self.assertEqual(optimizer.lr_decay, 1)
    self.assertEqual(optimizer.decay_after_epoch, 0)
def pretrain_generator(model, train, dev):
    # pre-train generator
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    optimizer = Optimizer(torch.optim.Adam(gen.parameters()), max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, 1)
    optimizer.set_scheduler(scheduler)

    supervised = SupervisedTrainer(loss=loss, batch_size=32,
                                   random_seed=random_seed,
                                   expt_dir=expt_gen_dir)
    supervised.train(model, train, num_epochs=20, dev_data=dev,
                     optimizer=optimizer, teacher_forcing_ratio=0,
                     resume=resume)
def test_update(self):
    optim = Optimizer(torch.optim.SGD, lr=1, decay_after_epoch=5, lr_decay=0.5)
    params = [torch.nn.Parameter(torch.randn(2, 3, 4))]
    optim.set_parameters(params)
    optim.update(0, 10)
    self.assertEqual(optim.optimizer.param_groups[0]['lr'], 0.5)
def train(self, model, data, n_epochs=5, resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """Train a given model.

    Args:
        model (seq2seq.models): model to run training on. If resume=True, it
            will be overwritten by the model loaded from the latest checkpoint
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        n_epochs (int): number of epochs to run
        resume (bool): resume training with the latest checkpoint
        dev_data (seq2seq.dataset.dataset.Dataset): dev Dataset
        optimizer (seq2seq.optim.Optimizer): optimizer for training
        teacher_forcing_ratio (float): teacher forcing ratio
    Returns:
        model (seq2seq.models): trained model.
    """
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(
            self.experiment_directory)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A work-around to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0

        if optimizer is None:
            optimizer = Optimizer(
                optim.Adam(model.parameters()), max_grad_norm=5)
        self.optimizer = optimizer

    logger.info('Optimizer: %s, Scheduler: %s',
                self.optimizer.optimizer, self.optimizer.scheduler)

    self._train_epochs(data, model, n_epochs, start_epoch, step,
                       dev_data=dev_data,
                       teacher_forcing_ratio=teacher_forcing_ratio)
    return model
def train(self, encoder, decoder, data, num_epochs=5, resume=False,
          dev_data=None, optimizer=None, is_training=0):
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        decoder = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            decoder.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0

        if optimizer is None:
            optimizer = Optimizer(optim.Adam(decoder.parameters()),
                                  max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, encoder, decoder, num_epochs, start_epoch, step,
                        dev_data=dev_data, is_training=is_training)
    return decoder
def train_model(m, poly, pretraining):
    m.train()
    optimizer = Optimizer(torch.optim.Adam(m.parameters(), amsgrad=True),
                          max_grad_norm=5)
    t = MirrorTrainer(loss=loss,
                      batch_size=args.batch_size,
                      checkpoint_every=100,
                      expt_dir="./experiments",
                      pretraining=pretraining,
                      polyglot=poly,
                      explosion_train=args.explosion_train,
                      explosion_eval=args.explosion_eval)
    m = t.train(m, train_dataset,
                n_epochs=args.n_epochs,
                dev_data=(None if args.no_dev_eval == 1 else dev_dataset),
                test_data=(None if args.no_test_eval == 1 else test_dataset),
                optimizer=optimizer,
                teacher_forcing_ratio=args.teacher_forcing_ratio,
                resume=False)
    return m
def train(self, model, data, start_step=0, dev_data=None, optimizer=None,
          teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        start_step (int, optional): step to start training from (default 0)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    if optimizer is None:
        optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5)
    self.optimizer = optimizer

    if not self.multi_gpu or hvd.rank() == 0:
        self.logger.info("Optimizer: %s, Scheduler: %s" %
                         (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, start_step, dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
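# Hedged sketch of calling this Horovod-aware variant (an illustration, not the
# source's own driver code): it assumes horovod.torch has been initialized with
# hvd.init() and that `trainer`, `model`, `train_data`, and `dev_data` exist as
# in the surrounding snippets. The optimizer is wrapped before being handed to
# train(), mirroring the distributed snippets later in this section.
import horovod.torch as hvd
import torch.optim as optim
from seq2seq.optim import Optimizer

base_optim = optim.Adam(model.parameters(), lr=1e-3)
base_optim = hvd.DistributedOptimizer(base_optim,
                                      named_parameters=model.named_parameters())
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

model = trainer.train(model, train_data,
                      start_step=0,
                      dev_data=dev_data,
                      optimizer=Optimizer(base_optim, max_grad_norm=5),
                      teacher_forcing_ratio=0.5)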
def run_training(opt, default_data_dir, num_epochs=100):
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        input_vocab = checkpoint.input_vocab
        output_vocab = checkpoint.output_vocab
    else:
        # Prepare dataset
        src = SourceField()
        tgt = TargetField()
        max_len = 50

        data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt')
        logging.info("Starting new Training session on %s", data_file)

        def len_filter(example):
            return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \
                and (len(example.src) > 0) and (len(example.tgt) > 0)

        train = torchtext.data.TabularDataset(
            path=data_file, format='json',
            fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
            filter_pred=len_filter
        )

        dev = None
        if opt.no_dev is False:
            dev_data_file = os.path.join(default_data_dir, opt.train_path,
                                         'dev-data.txt')
            dev = torchtext.data.TabularDataset(
                path=dev_data_file, format='json',
                fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
                filter_pred=len_filter
            )

        src.build_vocab(train, max_size=50000)
        tgt.build_vocab(train, max_size=50000)
        input_vocab = src.vocab
        output_vocab = tgt.vocab

        # NOTE: If the source field name and the target field name
        # are different from 'src' and 'tgt' respectively, they have
        # to be set explicitly before any training or inference
        # seq2seq.src_field_name = 'src'
        # seq2seq.tgt_field_name = 'tgt'

        # Prepare loss
        weight = torch.ones(len(tgt.vocab))
        pad = tgt.vocab.stoi[tgt.pad_token]
        loss = Perplexity(weight, pad)
        if torch.cuda.is_available():
            logging.info("Yayyy We got CUDA!!!")
            loss.cuda()
        else:
            logging.info("No CUDA device found; running on CPU")

        seq2seq = None
        optimizer = None
        if not opt.resume:
            hidden_size = 128
            decoder_hidden_size = hidden_size * 2
            logging.info("EncoderRNN Hidden Size: %s", hidden_size)
            logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size)
            bidirectional = True
            encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                                 bidirectional=bidirectional, rnn_cell='lstm',
                                 variable_lengths=True)
            decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size,
                                 dropout_p=0, use_attention=True,
                                 bidirectional=bidirectional, rnn_cell='lstm',
                                 eos_id=tgt.eos_id, sos_id=tgt.sos_id)

            seq2seq = Seq2seq(encoder, decoder)
            if torch.cuda.is_available():
                seq2seq.cuda()

            for param in seq2seq.parameters():
                param.data.uniform_(-0.08, 0.08)

            # Optimizer and learning rate scheduler can be customized by
            # explicitly constructing the objects and passing them to the trainer
            optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()),
                                  max_grad_norm=5)
            scheduler = StepLR(optimizer.optimizer, 1)
            optimizer.set_scheduler(scheduler)

        # train
        batch_size = 32
        checkpoint_every = num_epochs // 10
        print_every = num_epochs // 100

        properties = dict(batch_size=batch_size,
                          checkpoint_every=checkpoint_every,
                          print_every=print_every, expt_dir=opt.expt_dir,
                          num_epochs=num_epochs,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)
        logging.info("Starting training with the following Properties %s",
                     json.dumps(properties, indent=2))

        t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                              checkpoint_every=checkpoint_every,
                              print_every=print_every, expt_dir=opt.expt_dir)

        seq2seq = t.train(seq2seq, train, num_epochs=num_epochs,
                          dev_data=dev, optimizer=optimizer,
                          teacher_forcing_ratio=0.5, resume=opt.resume)

        evaluator = Evaluator(loss=loss, batch_size=batch_size)

        if opt.no_dev is False:
            dev_loss, accuracy = evaluator.evaluate(seq2seq, dev)
            logging.info("Dev Loss: %s", dev_loss)
            logging.info("Accuracy: %s", accuracy)

    beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))
    predictor = Predictor(beam_search, input_vocab, output_vocab)

    while True:
        try:
            seq_str = input("Type in a source sequence:")
            seq = seq_str.strip().split()
            results = predictor.predict_n(seq, n=3)
            for i, res in enumerate(results):
                print('option %s: %s\n' % (i + 1, res))
        except KeyboardInterrupt:
            logging.info("Bye Bye")
            exit(0)
def test_step(self, mock_clip_grad_norm):
    params = [torch.nn.Parameter(torch.randn(2, 3, 4))]
    optim = Optimizer(torch.optim.Adam(params), max_grad_norm=5)
    optim.step()
    mock_clip_grad_norm.assert_called_once()
# ... (preceding sampler definition truncated in the source) if multi_gpu else None
dev = DataLoader(dev_set, batch_size=opt.batch_size, shuffle=False,
                 sampler=dev_sampler, collate_fn=trans_data.collate_fn)

# Prepare optimizer
# optimizer = Optimizer(optim.Adam(seq2seq.parameters(), lr=opt.learning_rate),
#                       max_grad_norm=opt.clip_grad)
optimizer = optim.Adam(seq2seq.parameters(), lr=opt.learning_rate)
if multi_gpu:
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=seq2seq.named_parameters())
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    hvd.broadcast_parameters(seq2seq.state_dict(), root_rank=0)
optimizer = Optimizer(optimizer, max_grad_norm=opt.clip_grad)
if opt.decay_factor:
    optimizer.set_scheduler(
        torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer.optimizer, 'min',
            factor=opt.decay_factor, patience=1))

# Prepare trainer and train
t = SupervisedTrainer(loss=loss, model_dir=opt.model_dir,
                      best_model_dir=opt.best_model_dir,
                      batch_size=opt.batch_size,
                      checkpoint_every=opt.checkpoint_every,
                      print_every=opt.print_every,
# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
#
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss, batch_size=32,
                      checkpoint_every=1000,
                      print_every=10, expt_dir=opt.expt_dir)
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
seq2seq = t.train(seq2seq, train,
                  num_epochs=20, dev_data=dev,
                  optimizer=optimizer,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)

predictor = Predictor(seq2seq, input_vocab, output_vocab)

while True:
    seq_str = input("Type in a source sequence:")
    seq = seq_str.strip().split()
    print(predictor.predict(seq))
print(f"\nLoad from {opt.load_checkpoint}\n") else: for param in seq2seq.parameters(): param.data.uniform_(-opt.init_weight, opt.init_weight) if opt.beam_width > 1 and opt.phase == "infer": print(f"Beam Width {opt.beam_width}") seq2seq.decoder = TopKDecoder(seq2seq.decoder, opt.beam_width) if opt.phase == "train": # train # optimizer = Optimizer(optim.Adam(seq2seq.parameters(), lr=opt.learning_rate), max_grad_norm=opt.clip_grad) optimizer = optim.Adam(seq2seq.parameters(), lr=opt.learning_rate) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=seq2seq.named_parameters()) optimizer = Optimizer(optimizer, max_grad_norm=opt.clip_grad) hvd.broadcast_optimizer_state(optimizer.optimizer, root_rank=0) hvd.broadcast_parameters(seq2seq.state_dict(), root_rank=0) t = SupervisedTrainer(loss=loss, model_dir=opt.model_dir, best_model_dir=opt.best_model_dir, batch_size=opt.batch_size, checkpoint_every=opt.checkpoint_every, print_every=opt.print_every, max_epochs=opt.max_epochs, max_steps=opt.max_steps, max_checkpoints_num=opt.max_checkpoints_num, best_ppl=opt.best_ppl, device=device)
if torch.cuda.is_available():
    seq2tree.cuda()

for param in seq2tree.parameters():
    param.data.uniform_(-0.08, 0.08)
# encoder.embedding.weight.data.set_(input_vocab.vectors)
# encoder.embedding.weight.data.set_(output_vocab.vectors)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
#
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)
optimizer = Optimizer(optim.Adam(seq2tree.parameters(), lr=1e-4), max_grad_norm=5)

# train
t = SupervisedTrainer(loss=loss, batch_size=1,
                      checkpoint_every=50,
                      print_every=10, expt_dir=opt.expt_dir)
seq2tree = t.train(seq2tree, train,
                   num_epochs=20, dev_data=dev,
                   optimizer=optimizer,
                   teacher_forcing_ratio=0.5,
                   resume=opt.resume)

predictor = Predictor(seq2tree, input_vocab, input_vocab)

while True:
    seq_str = input("Type in a source sequence:")
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    # param.data.uniform_(-0.08, 0.08)
    param.data.normal_(0.0, 0.1)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
#
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
optimizer = Optimizer(torch.optim.SGD(seq2seq.parameters(), lr=0.05, momentum=0.9),
                      max_grad_norm=5)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)

# train
t = MultiLabelTrainer(loss=loss, batch_size=64,
                      checkpoint_every=10000,
                      print_every=100, ckpt_dir=opt.ckpt_dir)
print('Start training')
seq2seq = t.train(seq2seq, train, num_epochs=opt.epochs,
                     rnn_cell='lstm', dropout_p=0.25,
                     use_attention=True, bidirectional=bidirectional,
                     n_layers=2,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq_model = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq_model.cuda()

for param in seq2seq_model.parameters():
    param.data.uniform_(-0.1, 0.1)

optimizer = Optimizer(torch.optim.Adam(seq2seq_model.parameters()), max_grad_norm=5)

seq2seq_model = torch.nn.DataParallel(seq2seq_model)

# train
t = SupervisedTrainer(loss=loss, batch_size=8,
                      checkpoint_every=200,
                      print_every=10000,
                      expt_dir='./lstm_model/' + data_tuple[0] + '/Deepregex')
def train_iters(self, pairs, n_iters, batch_size=64, print_every=1000,
                learning_rate=0.0002, teacher_forcing_ratio=0.5):
    """Train for some number of iterations, choosing randomly from the list
    of tensor pairs."""
    print("Initializing training.")

    if self.optimizer is None:
        adam = optim.Adam(self.decoder.parameters(), lr=learning_rate)
        self.optimizer = Optimizer(adam, max_grad_norm=5)
    else:
        print("Using existing optimizer.")

    random.shuffle(pairs)
    if len(pairs) < batch_size:
        print("Not enough examples for one batch.")
        return

    # Turn the pairs into big tensors.
    # TODO: instead of saving pairs, save tensors directly. Otherwise this
    # operation takes too much space.
    # Input: num_layers x num_examples x embedding_size
    # Target: num_examples x max_output_length+1
    input_tensors = [torch.reshape(i, (1, 1, -1)) for i, j in pairs]
    input_tensor = torch.cat(input_tensors, 1)
    input_tensor = self._create_init_hidden(input_tensor)

    target_tensors = [j for i, j in pairs]
    targets = []
    for target in target_tensors:
        target_tensor = torch.reshape(target, (1, -1))
        if target_tensor.size(1) >= self.max_output_length:
            target_tensor = target_tensor[0][0:self.max_output_length]
            target_tensor = torch.reshape(target_tensor, (1, -1))
        else:
            pad = torch.zeros(
                1, self.max_output_length - target_tensor.size(1)).long()
            for i in range(self.max_output_length - target_tensor.size(1)):
                pad[0][i] = self.mask_token
            target_tensor = torch.cat((target_tensor, pad), 1)
        # Add the start token.
        start_tensor = torch.zeros(1, 1).long()
        start_tensor[0][0] = self.SOS_token
        target_tensor = torch.cat((start_tensor, target_tensor), 1)
        targets.append(target_tensor)
    target_tensor = torch.cat(targets, 0)

    if torch.cuda.is_available():
        target_tensor = target_tensor.cuda()
        input_tensor = input_tensor.cuda()

    print("Starting training.")
    print_loss_total = 0  # Reset every print_every.
    batch = 0
    for iter in range(n_iters):
        # Create the batch.
        if (batch + 1) * batch_size > len(pairs):
            print("Finished an epoch!")
            batch = 0
        batch_input = input_tensor[:, batch * batch_size:(batch + 1) * batch_size, :].contiguous()
        batch_target = target_tensor[batch * batch_size:(batch + 1) * batch_size, :].contiguous()
        if self.rnn_cell == 'lstm':
            batch_input = (batch_input, batch_input)

        loss = self.train(batch_input, batch_target,
                          teacher_forcing_ratio=teacher_forcing_ratio)
        print_loss_total += loss

        if iter % print_every == print_every - 1:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Steps: {0}\nAverage loss: {1}'.format(iter, print_loss_avg))

        batch += 1
def train(opt):
    LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT,
                        level=getattr(logging, opt.log_level.upper()))
    logging.info(opt)

    if int(opt.GPU) >= 0:
        torch.cuda.set_device(int(opt.GPU))

    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2tree = checkpoint.model
        input_vocab = checkpoint.input_vocab
    else:
        # Prepare dataset
        src = SourceField()
        nt = NTField()
        pos = PosField()
        tgt_tree = TreeField()
        comp = CompField()
        max_len = opt.max_len

        def len_filter(example):
            return len(example.src) <= max_len

        train = torchtext.data.TabularDataset(
            path=opt.train_path, format='tsv',
            fields=[('src', src), ('nt', nt), ('pos', pos), ('tree', tgt_tree)],
            filter_pred=len_filter)
        dev = torchtext.data.TabularDataset(
            path=opt.dev_path, format='tsv',
            fields=[('src', src), ('nt', nt), ('pos', pos), ('tree', tgt_tree)],
            filter_pred=len_filter)

        src.build_vocab(train, max_size=50000)
        comp.build_vocab(train, max_size=50000)
        nt.build_vocab(train, max_size=50000)
        pos.build_vocab(train, max_size=50000)
        # src_tree.build_vocab(train, max_size=50000)

        pos_in_nt = set()
        for Pos in pos.vocab.stoi:
            if nt.vocab.stoi[Pos] > 1:
                pos_in_nt.add(nt.vocab.stoi[Pos])

        hidden_size = opt.hidden_size
        input_vocab = src.vocab
        nt_vocab = nt.vocab

        def tree_to_id(tree):
            tree.set_label(nt_vocab.stoi[tree.label()])
            if len(tree) == 1 and str(tree[0])[0] != '(':
                tree[0] = input_vocab.stoi[tree[0]]
                return
            else:
                for subtree in tree:
                    tree_to_id(subtree)
                tree.append(Tree(nt_vocab.stoi['<eos>'], []))
            return tree

        # train.examples = [str(tree_to_id(ex.tree)) for ex in train.examples]
        # dev.examples = [str(tree_to_id(ex.tree)) for ex in dev.examples]
        for ex in train.examples:
            ex.tree = str(tree_to_id(Tree.fromstring(ex.tree)))
        for ex in dev.examples:
            ex.tree = str(tree_to_id(Tree.fromstring(ex.tree)))
        # train.examples = [tree_to_id(Tree.fromstring(ex.tree)) for ex in train.examples]
        # dev.examples = [str(tree_to_id(Tree.fromstring(ex.tree))) for ex in dev.examples]

        if opt.word_embedding is not None:
            input_vocab.load_vectors([opt.word_embedding])

        loss = NLLLoss()
        if torch.cuda.is_available():
            loss.cuda()
        loss.reset()

        seq2tree = None
        optimizer = None
        if not opt.resume:
            # Initialize model
            bidirectional = opt.bidirectional_encoder
            encoder = EncoderRNN(len(src.vocab), opt.word_embedding_size,
                                 max_len, hidden_size,
                                 bidirectional=bidirectional,
                                 variable_lengths=True)
            decoder = DecoderTree(len(src.vocab), opt.word_embedding_size,
                                  opt.nt_embedding_size, len(nt.vocab), max_len,
                                  hidden_size * 2 if bidirectional else hidden_size,
                                  sos_id=nt_vocab.stoi['<sos>'],
                                  eos_id=nt_vocab.stoi['<eos>'],
                                  dropout_p=0.2, use_attention=True,
                                  bidirectional=bidirectional,
                                  pos_in_nt=pos_in_nt)

            seq2tree = Seq2tree(encoder, decoder)
            if torch.cuda.is_available():
                seq2tree.cuda()

            for param in seq2tree.parameters():
                param.data.uniform_(-0.08, 0.08)
            # encoder.embedding.weight.data.set_(input_vocab.vectors)
            # encoder.embedding.weight.data.set_(output_vocab.vectors)

        # Optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and passing them to the trainer.
        #
        # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        # scheduler = StepLR(optimizer.optimizer, 1)
        # optimizer.set_scheduler(scheduler)
        optimizer = Optimizer(optim.Adam(seq2tree.parameters(), lr=opt.lr),
                              max_grad_norm=5)

        # train
        t = SupervisedTrainer(loss=loss, batch_size=opt.batch_size,
                              checkpoint_every=opt.checkpoint_every,
                              print_every=10, expt_dir=opt.expt_dir,
                              lr=opt.lr)
        seq2tree = t.train(seq2tree, train,
                           num_epochs=opt.epoch, dev_data=dev,
                           optimizer=optimizer,
                           teacher_forcing_ratio=0,
                           resume=opt.resume)

        predictor = Predictor(seq2tree, input_vocab, nt_vocab)
        return predictor, dev, train
encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                     n_layers=2, bidirectional=bidirectional,
                     variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab), max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     n_layers=2, dropout_p=0.5, use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters(), amsgrad=True,
#                                        weight_decay=0.0005), max_grad_norm=10)
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=10)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss, batch_size=128,
                      checkpoint_every=200,
                      print_every=200, expt_dir=opt.expt_dir)

seq2seq = t.train(seq2seq, train,
                  num_epochs=5, dev_data=dev,
                  optimizer=optimizer,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)

predictor = Predictor(seq2seq, input_vocab, output_vocab)
def main(option):
    random.seed(option.random_seed)
    torch.manual_seed(option.random_seed)

    LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT, level='INFO', stream=sys.stdout)

    glove = Glove(option.emb_file)
    logging.info('loaded embeddings from ' + option.emb_file)

    src_vocab = Vocab.build_from_glove(glove)
    tgt_vocab = Vocab.load(option.intent_vocab)

    train_dataset = load_intent_prediction_dataset(option.train_dataset,
                                                   src_vocab, tgt_vocab,
                                                   device=option.device)
    dev_dataset = load_intent_prediction_dataset(option.dev_dataset,
                                                 src_vocab, tgt_vocab,
                                                 device=option.device)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=option.batch_size, shuffle=True)
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=len(dev_dataset), shuffle=False)

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # Prepare loss
    weight = torch.ones(tgt_vocab_size)
    pad = tgt_vocab.stoi[tgt_vocab.pad_token]
    loss = Perplexity(weight, pad)
    loss.criterion.to(option.device)

    # Initialize model
    encoder = NeuralTensorNetwork(nn.Embedding(src_vocab_size, option.emb_dim),
                                  option.em_k)
    decoder = DecoderRNN(tgt_vocab_size, option.im_max_len,
                         option.im_hidden_size,
                         use_attention=False, bidirectional=False,
                         eos_id=tgt_vocab.stoi[tgt_vocab.eos_token],
                         sos_id=tgt_vocab.stoi[tgt_vocab.bos_token])
    encoder.to(option.device)
    decoder.to(option.device)

    init_model(encoder)
    init_model(decoder)
    encoder.embeddings.weight.data.copy_(torch.from_numpy(glove.embd).float())

    optimizer_params = [{'params': encoder.parameters()},
                        {'params': decoder.parameters()}]
    optimizer = Optimizer(optim.Adam(optimizer_params, lr=option.lr),
                          max_grad_norm=5)

    trainer = NTNTrainer(loss, print_every=option.report_every,
                         device=option.device)
    encoder, decoder = trainer.train(
        encoder, decoder, optimizer, train_data_loader,
        num_epochs=option.epochs,
        dev_data_loader=dev_data_loader,
        teacher_forcing_ratio=option.im_teacher_forcing_ratio)

    predictor = NTNPredictor(encoder, decoder, src_vocab, tgt_vocab,
                             option.device)
    samples = [
        ("PersonX", "eventually told", "___"),
        ("PersonX", "tells", "PersonY 's tale"),
        ("PersonX", "always played", " ___"),
        ("PersonX", "would teach", "PersonY"),
        ("PersonX", "gets", "a ride"),
    ]
    for sample in samples:
        subj, verb, obj = sample
        subj = subj.lower().split(' ')
        verb = verb.lower().split(' ')
        obj = obj.lower().split(' ')
        print(sample, predictor.predict(subj, verb, obj))
SpeakerDataset.concat(args.num_sentence, (train, dev, test))

################ define model ##################
model, input_vocab, output_vocab = init_model()

# Define loss
weight = torch.ones(len(output_vocab))
pad = output_vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()

# Define Optimizer
optimizer = Optimizer(optim.Adam(model.parameters()),
                      max_grad_norm=args.max_grad_norm)

############### train model ################
t = SpkTrainer(args=args, loss=loss,
               batch_size=args.batch_size,
               checkpoint_every=args.ckpt_every,
               random_seed=args.seed,
               print_every=args.verbose,
               expt_dir=args.expt_dir)
discrim = t.train(model=model, data=train,
                  num_epochs=args.epochs,
                  dev_data=dev,
    param.data.uniform_(-0.08, 0.08)
    print(param.data[0:3])

_, _, norm_val = encoder.vectors_stats()
encoder.init_vectors(src.vocab.vectors)
# encoder.scale_vectors(0.08)
encoder.normalize_vectors(norm_val)
encoder.vectors_stats()

for param in seq2seq.parameters():
    print(param.data[0:3])

if torch.cuda.is_available():
    seq2seq.cuda()

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters(), lr=0.001),
                      max_grad_norm=5)
# optimizer = Optimizer(torch.optim.SGD(seq2seq.parameters(), lr=0.01, momentum=0.9),
#                       max_grad_norm=5)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer.optimizer, step_size=10, gamma=0.5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer.optimizer, mode='min', factor=0.5, patience=5,
    verbose=True, threshold=0.0001, threshold_mode='rel',
    cooldown=0, min_lr=0, eps=1e-08)
optimizer.set_scheduler(scheduler)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=3)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_model', type=int, default=1024)
    parser.add_argument('-n_layer', type=int, default=1)
    parser.add_argument('-dropout', type=float, default=0)
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-teacher_forcing_ratio', type=float, default=0.5)
    opt = parser.parse_args()

    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.log = opt.save_model

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.cuda:
        torch.cuda.manual_seed_all(opt.seed)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    print(opt)
    device = torch.device('cuda' if opt.cuda else 'cpu')

    # model
    opt.bidirectional = True
    encoder = EncoderRNN(opt.src_vocab_size, opt.max_token_seq_len, opt.d_model,
                         bidirectional=opt.bidirectional, variable_lengths=True)
    decoder = DecoderRNN(opt.tgt_vocab_size, opt.max_token_seq_len,
                         opt.d_model * 2 if opt.bidirectional else opt.d_model,
                         n_layers=opt.n_layer, dropout_p=opt.dropout,
                         use_attention=True, bidirectional=opt.bidirectional,
                         eos_id=Constants.BOS, sos_id=Constants.EOS)
    seq2seq = Seq2seq(encoder, decoder).to(device)
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    seq2seq = nn.DataParallel(seq2seq)

    # loss
    weight = torch.ones(opt.tgt_vocab_size)
    pad = Constants.PAD
    loss = Perplexity(weight, pad)
    if opt.cuda:
        loss.cuda()

    # optimizer
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)

    train(seq2seq, training_data, validation_data, loss, optimizer, device, opt)