def test_set_eval_mode(self, mock_eval, mock_call): """ Make sure that evaluation is done in evaluation mode. """ mock_mgr = MagicMock() mock_mgr.attach_mock(mock_eval, 'eval') mock_mgr.attach_mock(mock_call, 'call') evaluator = Evaluator() evaluator.evaluate(self.seq2seq, self.dataset) expected_calls = [call.eval()] + \ self.dataset.num_batches(evaluator.batch_size) * [call.call(ANY, ANY, volatile=ANY)] self.assertEqual(expected_calls, mock_mgr.mock_calls)
def test_set_eval_mode(self, mock_eval, mock_call): """ Make sure that evaluation is done in evaluation mode. """ mock_mgr = MagicMock() mock_mgr.attach_mock(mock_eval, 'eval') mock_mgr.attach_mock(mock_call, 'call') evaluator = Evaluator(batch_size=64) with patch('seq2seq.evaluator.evaluator.torch.stack', return_value=None), \ patch('seq2seq.loss.NLLLoss.eval_batch', return_value=None): evaluator.evaluate(self.seq2seq, self.dataset) num_batches = int(math.ceil(len(self.dataset) / evaluator.batch_size)) expected_calls = [call.eval()] + num_batches * [call.call(ANY, ANY, ANY)] self.assertEqual(expected_calls, mock_mgr.mock_calls)
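# The two tests above rely on unittest.mock's attach_mock pattern: child mocks
# attached to a manager record their calls in one ordered list, which is what
# lets the assertion verify that eval() runs before any forward call.
# A minimal, self-contained illustration (all names here are illustrative):
from unittest.mock import MagicMock, call

manager = MagicMock()
model = MagicMock()
manager.attach_mock(model.eval, 'eval')
manager.attach_mock(model.forward, 'call')

model.eval()
model.forward(1, 2)

# Calls are recorded on the manager in the order they happened.
assert manager.mock_calls == [call.eval(), call.call(1, 2)]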
def _evaluate(checkpoint_path, test_paths, metric_names=[ "word accuracy", "sequence accuracy", "final target accuracy" ], loss_names=["nll"], max_len=50, batch_size=32, is_predict_eos=True, content_method=None): """Evaluates the models saved in a checkpoint.""" results = [] print("loading checkpoint from {}".format(checkpoint_path)) checkpoint = Checkpoint.load(checkpoint_path) seq2seq = checkpoint.model tabular_data_fields = get_tabular_data_fields( content_method=content_method, is_predict_eos=is_predict_eos) dic_data_fields = dict(tabular_data_fields) src = dic_data_fields["src"] tgt = dic_data_fields["tgt"] src.vocab = checkpoint.input_vocab tgt.vocab = checkpoint.output_vocab tgt.eos_id = tgt.vocab.stoi[tgt.SYM_EOS] tgt.sos_id = tgt.vocab.stoi[tgt.SYM_SOS] for test_path in test_paths: test = get_data(test_path, max_len, tabular_data_fields) metrics = get_metrics(metric_names, src, tgt, is_predict_eos) losses, loss_weights = get_losses(loss_names, tgt, is_predict_eos) evaluator = Evaluator(loss=losses, batch_size=batch_size, metrics=metrics) data_func = SupervisedTrainer.get_batch_data losses, metrics = evaluator.evaluate(model=seq2seq, data=test, get_batch_data=data_func) total_loss, log_msg, _ = SupervisedTrainer.get_losses( losses, metrics, 0) dataset = test_path.split('/')[-1].split('.')[0] results.append([dataset, total_loss] + [metric.get_val() for metric in metrics]) results_df = pd.DataFrame(results, columns=["Dataset", "Loss"] + [metric.name for metric in metrics]) results_df = results_df.melt(id_vars=['Dataset'], var_name="Metric", value_name='Value') return results_df
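# Hypothetical driver for _evaluate() above; the checkpoint directory and the
# test file paths are placeholders. The returned frame is in long (melted)
# form, so pivoting gives one row per dataset and one column per metric.
results_df = _evaluate(checkpoint_path="checkpoints/run1",
                       test_paths=["data/test_easy.tsv", "data/test_hard.tsv"],
                       batch_size=64)
print(results_df.pivot(index="Dataset", columns="Metric", values="Value"))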
optimizer_new = Optimizer( torch.optim.Adadelta(seq2seq_model.parameters(), lr=0.05)) # To train against the oracle instead, set mode to None sc_t = SelfCriticalTrainer(loss=PositiveLoss(mode='prob', prob_model=compare_regex_model, loss_vocab=sc_loss_vocab), batch_size=32, checkpoint_every=100, print_every=100, expt_dir='./lstm_model/' + data_tuple[0] + '/SoftRegex', output_vocab=output_vocab) seq2seq_model = sc_t.train(seq2seq_model, train, num_epochs=30, dev_data=dev, optimizer=optimizer_new, teacher_forcing_ratio=0.5, resume=False) evaluator = Evaluator() evaluator.evaluate(seq2seq_model, dev) # (5.799417234628771, 0.6468332123976366)
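# The trailing comment records evaluate() returning a (loss, accuracy) pair.
# A minimal sketch of acting on that return value; the 0.65 threshold is an
# illustrative assumption, not part of the original experiment:
dev_loss, dev_accuracy = evaluator.evaluate(seq2seq_model, dev)
if dev_accuracy < 0.65:
    print("dev accuracy {:.4f} below target; consider training longer".format(dev_accuracy))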
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, state_loss=NLLLoss(), checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.state_loss = state_loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio, concept=None, vocabs=None, use_concept=False): loss = self.loss if use_concept: state_loss = self.state_loss state_loss.reset() # Forward propagation if use_concept: (decoder_outputs, decoder_hidden, other), (state, response_concept) = model( input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio, concept=concept, vocabs=vocabs, use_concept=use_concept) else: decoder_outputs, decoder_hidden, other = model( input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio, concept=concept, vocabs=vocabs, use_concept=use_concept) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) """ if use_concept: for i in range(len(response_concept)): for j in range(len(response_concept[i])): arg1 = state[i].unsqueeze(0) arg2 = torch.tensor([response_concept[i][j]]) if torch.cuda.is_available(): arg2 = arg2.cuda() state_loss.acc_loss += state_loss.criterion(arg1, arg2) state_loss.norm_term += 1 """ """ for i in range(response_concept.shape[1]): state_loss.eval_batch(state.contiguous().view(batch_size, -1), response_concept[:, i]) """ # Backward propagation model.zero_grad() lvalue = loss.get_loss() if use_concept: #state_value = state_loss.get_loss() state_value = 0 if lvalue >= 0: if use_concept: # loss.backward(retain_graph=True) loss.backward() # state_loss.backward() else: loss.backward() self.optimizer.step() else: raise AssertionError("NAN Triggered!") if use_concept: return lvalue, state_value else: return lvalue def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, save_file=False, dev_data=None, teacher_forcing_ratio=0, vocabs=None, use_concept=False, log_dir=None, embed_file=None): log = self.logger # embed = Embed(embed_file) embed = [] print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch device = torch.device('cuda', 0) if torch.cuda.is_available() else None batch_iterator = torchtext.data.BucketIterator( dataset=data,
batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) print("Steps per epoch: ", steps_per_epoch) total_steps = steps_per_epoch * n_epochs """ dev_loss, accuracy = self.evaluator.evaluate(model, dev_data, vocabs=vocabs, use_concept=use_concept, log_dir=log_dir, cur_step=0) """ step = start_step step_elapsed = 0 for epoch in range(start_epoch, n_epochs + 1): print("Epoch: %d, Step: %d" % (epoch, step)) #if epoch > 20: # teacher_forcing_ratio = 0 batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) """ # for debugging dev_loss, accuracy = self.evaluator.evaluate(model, dev_data, vocabs=vocabs, use_concept=use_concept, cur_step=step, log_dir=log_dir) """ model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 input_variables, input_lengths = getattr( batch, seq2seq.src_field_name) if use_concept: concepts, _ = getattr(batch, seq2seq.cpt_field_name) else: concepts = [] target_variables = getattr(batch, seq2seq.tgt_field_name) if use_concept: loss, state_loss = self._train_batch( input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio, concept=concepts, vocabs=vocabs, use_concept=use_concept) else: loss = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio, concept=concepts, vocabs=vocabs, use_concept=use_concept) state_loss = 0 # FOR NAN DEBUG if not loss >= 0: Checkpoint( model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[ seq2seq.tgt_field_name].vocab).save(self.expt_dir) print("Nan Triggered! 
Model has been saved.") exit(0) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Step %d, Progress: %d%%, Train %s: %.4f, State loss: %.4f' % ( step, step / total_steps * 100, self.loss.name, print_loss_avg, state_loss) log.info(log_msg) if step % 200 == 0: dev_loss, accuracy = self.evaluator.evaluate( model, dev_data, vocabs=vocabs, use_concept=use_concept, cur_step=step, log_dir=log_dir) # self.optimizer.update(dev_loss, epoch) log_msg = "Step %d, Dev %s: %.4f, Accuracy: %.4f" % ( step, self.loss.name, dev_loss, accuracy) log.info(log_msg) model.train(mode=True) # Checkpoint """ if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir) """ if epoch % 5 == 0 and save_file: Checkpoint( model=model, optimizer=self.optimizer, epoch=n_epochs, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[ seq2seq.tgt_field_name].vocab).save(self.expt_dir) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f" % ( epoch, self.loss.name, epoch_loss_avg) with open(log_dir + '/log.txt', 'a+', encoding='utf-8') as file: file.write("Step {}, avg loss: {}\n".format( step, epoch_loss_avg)) if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate( model, dev_data, vocabs=vocabs, use_concept=use_concept, log_dir=log_dir, cur_step=step) self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % ( self.loss.name, dev_loss, accuracy) model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) log.info(log_msg) Checkpoint( model=model, optimizer=self.optimizer, epoch=n_epochs, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save( self.expt_dir) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0, src_vocab=None, cpt_vocab=None, tgt_vocab=None, use_concept=False, vocabs=None, save_file=False, log_dir=None, embed_file=None, full_matrix=None): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teaching forcing ratio (default 0) Returns: model (seq2seq.models): trained model. 
""" # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint( self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model model.full_matrix = full_matrix self.optimizer = resume_checkpoint.optimizer # A workaround to set optimizing parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters(), weight_decay=0), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio, log_dir=log_dir, embed_file=embed_file, vocabs=vocabs, use_concept=use_concept, save_file=save_file) return model
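# A minimal end-to-end sketch of driving the trainer above. All data and
# model names (model, train_data, dev_data, vocabs) are hypothetical
# placeholders; the keyword arguments mirror SupervisedTrainer.train() as
# defined above, and log_dir must point at an existing directory because
# _train_epoches appends to '<log_dir>/log.txt'.
trainer = SupervisedTrainer(expt_dir='experiment', batch_size=64,
                            checkpoint_every=100, print_every=100)
model = trainer.train(model, train_data,
                      num_epochs=10,
                      dev_data=dev_data,
                      teacher_forcing_ratio=0.5,
                      vocabs=vocabs,
                      use_concept=False,
                      save_file=True,
                      log_dir='experiment/logs')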
src.vocab = input_vocab tgt.vocab = output_vocab max_len = opt.max_len def len_filter(example): return len(example.src) <= max_len and len(example.tgt) <= max_len # generate test set test = torchtext.data.TabularDataset( path=opt.test_data, format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=len_filter ) # Prepare loss weight = torch.ones(len(output_vocab)) pad = output_vocab.stoi[tgt.pad_token] loss = Perplexity(weight, pad) if torch.cuda.is_available(): loss.cuda() ################################################################################# # Evaluate model on test set evaluator = Evaluator(loss=loss, batch_size=opt.batch_size) losses, metrics = evaluator.evaluate(seq2seq, test) total_loss, log_msg, _ = SupervisedTrainer.print_eval(losses, metrics, 0) print(log_msg)
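# The weight/pad setup above makes the Perplexity loss ignore padding
# positions when scoring. A self-contained analog with plain PyTorch
# primitives (vocab size, logits and targets are illustrative):
import torch
import torch.nn.functional as F

vocab_size, pad_idx = 10, 0
logits = torch.randn(4, vocab_size)         # four decoded time steps
targets = torch.tensor([3, 7, pad_idx, 2])  # one step is padding
log_probs = F.log_softmax(logits, dim=-1)
nll = F.nll_loss(log_probs, targets, ignore_index=pad_idx)
print(torch.exp(nll))                       # per-token perplexity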
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5)) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100, optimizer=Optimizer(optim.Adam, max_grad_norm=5)): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = optimizer self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.input_vocab_file = os.path.join(self.expt_dir, 'input_vocab') self.output_vocab_file = os.path.join(self.expt_dir, 'output_vocab') self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model( input_variable, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() targets = other['inputs'] lengths = other['length'] for batch in range(len(targets)): # Batch wise loss batch_target = targets[batch] batch_len = lengths[batch] # Crop output and target to batch length batch_output = torch.stack( [output[batch] for output in decoder_outputs[:batch_len]]) batch_target = batch_target[:batch_len] # Evaluate loss loss.eval_batch(batch_output, batch_target) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _train_epoches(self, data, model, n_epochs, batch_size, resume, dev_data=None, teacher_forcing_ratio=0): start = time.time() print_loss_total = 0 # Reset every print_every steps_per_epoch = data.num_batches(batch_size) total_steps = steps_per_epoch * n_epochs # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint( self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer.set_parameters(model.parameters()) self.optimizer.load_state_dict( resume_checkpoint.optimizer_state_dict) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 self.optimizer.set_parameters(model.parameters()) for epoch in range(start_epoch, n_epochs + 1): data.shuffle(self.random_seed) batch_generator = data.make_batches(batch_size) # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 input_variables = batch[0] target_variables = batch[1] loss = self._train_batch(input_variables, target_variables, model, teacher_forcing_ratio) # Record average loss
print_loss_total += loss if step % self.print_every == 0: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Time elapsed: %s, Progress: %d%%, Train %s: %.4f' % ( pretty_interval(start), float(step) / total_steps * 100, self.loss.name, print_loss_avg) self.logger.info(log_msg) # Checkpoint if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint( model=model, optimizer_state_dict=self.optimizer.state_dict(), epoch=epoch, step=step, input_vocab=data.input_vocab, output_vocab=data.output_vocab).save(self.expt_dir) log_msg = "Finished epoch {0}".format(epoch) if dev_data is not None: dev_loss = self.evaluator.evaluate(model, dev_data) self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f" % (self.loss.name, dev_loss) model.train(mode=True) self.logger.info(log_msg) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, teacher_forcing_ratio=0): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it will be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) """ # Make Checkpoint Directories data.input_vocab.save(self.input_vocab_file) data.output_vocab.save(self.output_vocab_file) self._train_epoches(data, model, num_epochs, self.batch_size, resume=resume, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio)
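# Hypothetical driver for the trainer variant above. Unlike the
# torchtext-based versions, this one expects a dataset object exposing
# num_batches()/make_batches()/shuffle() and input_vocab/output_vocab;
# all names below are placeholders.
trainer = SupervisedTrainer(expt_dir='experiment', batch_size=32,
                            checkpoint_every=50, print_every=10)
trainer.train(model, train_dataset,
              num_epochs=5,
              dev_data=dev_dataset,
              teacher_forcing_ratio=0.5)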
# train t = SupervisedTrainer( loss=loss, batch_size=32, checkpoint_every=50, print_every=10, expt_dir=opt.expt_dir, ) seq2seq = t.train( seq2seq, train, num_epochs=6, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=opt.resume, ) evaluator = Evaluator(loss=loss, batch_size=32) dev_loss, accuracy = evaluator.evaluate(seq2seq, dev) assert dev_loss < 1.5 beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 3)) predictor = Predictor(beam_search, input_vocab, output_vocab) inp_seq = "1 3 5 7 9" seq = predictor.predict(inp_seq.split()) assert " ".join(seq[:-1]) == inp_seq[::-1]
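# Sanity check of the final assertion above: for single-digit, space-separated
# tokens, reversing the raw string is the same as reversing the token list.
inp_seq = "1 3 5 7 9"
assert inp_seq[::-1] == " ".join(reversed(inp_seq.split())) == "9 7 5 3 1"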
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (list, optional): list of seq2seq.loss.Loss objects for training (default: [seq2seq.loss.NLLLoss]) metrics (list, optional): list of seq2seq.metric.metric objects to be computed during evaluation batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) print_every (int, optional): number of iterations to print after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=[NLLLoss()], loss_weights=None, metrics=[], batch_size=64, eval_batch_size=128, random_seed=None, checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.metrics = metrics self.loss_weights = loss_weights or len(loss) * [1.] self.evaluator = Evaluator(loss=self.loss, metrics=self.metrics, batch_size=eval_batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model( input_variable, input_lengths, target_variable['decoder_output'], teacher_forcing_ratio=teacher_forcing_ratio) losses = self.evaluator.compute_batch_loss(decoder_outputs, decoder_hidden, other, target_variable) # Backward propagation for i, loss in enumerate(losses[:-1], 0): loss.scale_loss(self.loss_weights[i]) loss.backward(retain_graph=True) losses[-1].scale_loss(self.loss_weights[-1]) losses[-1].backward() self.optimizer.step() model.zero_grad() return losses def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, monitor_data=[], teacher_forcing_ratio=0, top_k=5): log = self.logger print_loss_total = defaultdict(float) # Reset every print_every epoch_loss_total = defaultdict(float) # Reset every epoch epoch_loss_avg = defaultdict(float) print_loss_avg = defaultdict(float) device = None if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs step = start_step step_elapsed = 0 # store initial model to be sure at least one model is stored val_data = dev_data or data losses, metrics = self.evaluator.evaluate(model, val_data, self.get_batch_data) total_loss, log_msg, model_name = self.get_losses( losses, metrics, step) loss_best = top_k * [total_loss] best_checkpoints = top_k * [None] best_checkpoints[0] = model_name Checkpoint( model=model, optimizer=self.optimizer, epoch=start_epoch, step=start_step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save( self.expt_dir,
name=model_name) for epoch in range(start_epoch, n_epochs + 1): log.debug("Epoch: %d, Step: %d" % (epoch, step)) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 input_variables, input_lengths, target_variables = self.get_batch_data( batch) # compute batch loss losses = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio) # Record average loss for loss in losses: name = loss.log_name print_loss_total[name] += loss.get_loss() epoch_loss_total[name] += loss.get_loss() # print log info according to print_every param if step % self.print_every == 0 and step_elapsed > self.print_every: for loss in losses: name = loss.log_name print_loss_avg[name] = print_loss_total[name] / self.print_every print_loss_total[name] = 0 train_log_msg = ' '.join([ '%s: %.4f' % (loss.log_name, loss.get_loss()) for loss in losses ]) m_logs = {} # compute vals for all monitored sets for m_data in monitor_data: losses, metrics = self.evaluator.evaluate( model, monitor_data[m_data], self.get_batch_data) total_loss, log_msg, model_name = self.get_losses( losses, metrics, step) m_logs[m_data] = ' '.join([ '%s: %.4f' % (loss.log_name, loss.get_loss()) for loss in losses ]) all_losses = ' '.join( ['%s %s' % (name, m_logs[name]) for name in m_logs]) log_msg = 'Progress %d%%, Train %s, %s' % ( step / total_steps * 100, train_log_msg, all_losses) log.info(log_msg) # check if new model should be saved if step % self.checkpoint_every == 0 or step == total_steps: # compute dev loss losses, metrics = self.evaluator.evaluate( model, val_data, self.get_batch_data) total_loss, log_msg, model_name = self.get_losses( losses, metrics, step) max_eval_loss = max(loss_best) if total_loss < max_eval_loss: index_max = loss_best.index(max_eval_loss) # rm prev model if best_checkpoints[index_max] is not None: shutil.rmtree( os.path.join(self.expt_dir, best_checkpoints[index_max])) best_checkpoints[index_max] = model_name loss_best[index_max] = total_loss # save model Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[ seq2seq.src_field_name].vocab, output_vocab=data.fields[ seq2seq.tgt_field_name].vocab).save( self.expt_dir, name=model_name) if step_elapsed == 0: continue for loss in losses: epoch_loss_avg[loss.log_name] = epoch_loss_total[loss.log_name] / min( steps_per_epoch, step - start_step) epoch_loss_total[loss.log_name] = 0 loss_msg = ' '.join([ '%s: %.4f' % (loss.log_name, loss.get_loss()) for loss in losses ]) log_msg = "Finished epoch %d: Train %s" % (epoch, loss_msg) if dev_data is not None: losses, metrics = self.evaluator.evaluate( model, dev_data, self.get_batch_data) loss_total, log_, model_name = self.get_losses( losses, metrics, step) self.optimizer.update(loss_total, epoch) log_msg += ", Dev " + log_ model.train(mode=True) else: self.optimizer.update(sum(epoch_loss_avg.values()), epoch) # TODO check if this makes sense! log.info(log_msg) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, monitor_data={}, optimizer=None, teacher_forcing_ratio=0, learning_rate=0.001, checkpoint_path=None, top_k=5): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it will be overwritten by the model loaded from the latest checkpoint.
data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) learning_rate (float, optional): learning rate used by the optimizer (default 0.001) checkpoint_path (str, optional): path to load checkpoint from in case training should be resumed top_k (int): how many models should be stored during training Returns: model (seq2seq.models): trained model. """ # If training is set to resume if resume: resume_checkpoint = Checkpoint.load(checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A workaround to set optimizing parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 def get_optim(optim_name): optims = { 'adam': optim.Adam, 'adagrad': optim.Adagrad, 'adadelta': optim.Adadelta, 'adamax': optim.Adamax, 'rmsprop': optim.RMSprop, 'sgd': optim.SGD, None: optim.Adam } return optims[optim_name] self.optimizer = Optimizer(get_optim(optimizer)(model.parameters(), lr=learning_rate), max_grad_norm=5) self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, num_epochs, start_epoch, step, dev_data=dev_data, monitor_data=monitor_data, teacher_forcing_ratio=teacher_forcing_ratio, top_k=top_k) return model @staticmethod def get_batch_data(batch): input_variables, input_lengths = getattr(batch, seq2seq.src_field_name) target_variables = { 'decoder_output': getattr(batch, seq2seq.tgt_field_name) } return input_variables, input_lengths, target_variables @staticmethod def get_losses(losses, metrics, step): total_loss = 0 model_name = '' log_msg = '' for metric in metrics: val = metric.get_val() log_msg += '%s %.4f ' % (metric.name, val) model_name += '%s_%.2f_' % (metric.log_name, val) for loss in losses: val = loss.get_loss() log_msg += '%s %.4f ' % (loss.name, val) model_name += '%s_%.2f_' % (loss.log_name, val) total_loss += val model_name += 's%d' % step return total_loss, log_msg, model_name
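# Example call into train() above showing the string-keyed optimizer
# selection; model/data names and hyperparameters are illustrative.
trainer = SupervisedTrainer(expt_dir='experiment', batch_size=64)
model = trainer.train(model, train_data,
                      num_epochs=5,
                      dev_data=dev_data,
                      optimizer='adam',      # resolved through get_optim()
                      learning_rate=0.001,
                      top_k=3)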
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: model_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, model_dir='experiment', best_model_dir='experiment/best', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100, max_epochs=5, max_steps=10000, max_checkpoints_num=5, best_ppl=100000.0, device=None): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every self.max_steps = max_steps self.max_epochs = max_epochs self.batch_size = batch_size self.best_ppl = best_ppl self.max_checkpoints_num = max_checkpoints_num self.device = device self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size, device=device) if not os.path.isabs(model_dir): model_dir = os.path.join(os.getcwd(), model_dir) self.model_dir = model_dir if not os.path.exists(self.model_dir): os.makedirs(self.model_dir) if not os.path.isabs(best_model_dir): best_model_dir = os.path.join(os.getcwd(), best_model_dir) self.best_model_dir = best_model_dir if not os.path.exists(self.best_model_dir): os.makedirs(self.best_model_dir) self.model_checkpoints = [] self.best_model_checkpoints = [] self.logger = logging.getLogger(__name__) def save_model(self, model, steps, dev_ppl=None): model_fn = f"{steps}.pt" model_fp = os.path.join(self.model_dir, model_fn) # save model checkpoints while len(self.model_checkpoints) >= self.max_checkpoints_num: os.system(f"rm {self.model_checkpoints[0]}") self.model_checkpoints = self.model_checkpoints[1:] torch.save(model.state_dict(), model_fp) self.model_checkpoints.append(model_fp) # update checkpoints file with open(os.path.join(self.model_dir, "checkpoints"), 'w') as f: f.write('\n'.join(self.model_checkpoints[::-1])) # save best model checkpoints if dev_ppl and dev_ppl < self.best_ppl: self.logger.info(f"Best model dev ppl {dev_ppl}.") self.best_ppl = dev_ppl while len(self.best_model_checkpoints) >= self.max_checkpoints_num: os.system(f"rm {self.best_model_checkpoints[0]}") self.best_model_checkpoints = self.best_model_checkpoints[1:] best_model_fp = os.path.join(self.best_model_dir, model_fn) os.system(f"cp {model_fp} {best_model_fp}") self.best_model_checkpoints.append(best_model_fp) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model( input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _train_epoches(self, data, model, start_step, dev_data=None, teacher_forcing_ratio=0): device = self.device
log = self.logger max_epochs = self.max_epochs max_steps = self.max_steps print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch step = 0 steps_per_epoch = len(data) start_epoch = (start_step - step) // steps_per_epoch step = start_epoch * steps_per_epoch for batch in data: if step >= start_step: break step += 1 if start_epoch or start_step: logging.info(f"Resume from Epoch {start_epoch}, Step {start_step}") for epoch in range(start_epoch, max_epochs): model.train(True) for batch in data: step += 1 src_variables = batch['src'].to(device) tgt_variables = batch['tgt'].to(device) src_lens = batch['src_len'].view(-1).to(device) tgt_lens = batch['tgt_len'].view(-1).to(device) # print(src_variables, src_lens, tgt_variables) # exit(0) loss = self._train_batch(src_variables, src_lens.tolist(), tgt_variables, model, teacher_forcing_ratio) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = f"Process {100.0*(step%steps_per_epoch)/steps_per_epoch:.2f}% of Epoch {epoch}, Total step {step}, Train {self.loss.name} {print_loss_avg:.4f}" if hvd.rank() == 0: log.info(log_msg) # Checkpoint if step % self.checkpoint_every == 0: dev_loss = None if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate( model, dev_data) log_msg = f"Dev {self.loss.name}: {dev_loss:.4f}, Accuracy: {accuracy:.4f}" if hvd.rank() == 0: log.info(log_msg) model.train(mode=True) if hvd.rank() == 0: self.save_model(model, step, dev_ppl=dev_loss) if step >= max_steps: break if step >= max_steps: if hvd.rank() == 0: log.info(f"Finish max steps {max_steps} at Epoch {epoch}.") break epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = f"Finished Epoch {epoch}, Train {self.loss.name} {epoch_loss_avg:.4f}" if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate(model, dev_data) self.optimizer.update(dev_loss, epoch) log_msg += f", Dev {self.loss.name}: {dev_loss:.4f}, Accuracy: {accuracy:.4f}" model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) if hvd.rank() == 0: self.save_model(model, step, dev_ppl=dev_loss) log.info(log_msg) log.info(f"Finish Epoch {epoch}, Total steps {step}.") def train(self, model, data, start_step=0, dev_data=None, optimizer=None, teacher_forcing_ratio=0): """ Run training for a given model. Args: model (seq2seq.models): model to run training on data: iterable of training batches with 'src'/'tgt'/'src_len'/'tgt_len' entries start_step (int, optional): global step to resume training from (default 0) dev_data (optional): dev dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) Returns: model (seq2seq.models): trained model. """ if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, start_step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio) return model
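# Hypothetical driver for the Horovod-aware trainer above; assumes hvd.init()
# has been called and that train_loader yields dict batches with
# 'src'/'tgt'/'src_len'/'tgt_len' keys, as _train_epoches expects.
trainer = SupervisedTrainer(model_dir='experiment',
                            best_model_dir='experiment/best',
                            batch_size=64,
                            max_epochs=5,
                            max_steps=10000,
                            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
model = trainer.train(model, train_loader, dev_data=dev_loader)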
class SupervisedTrainer(object): """The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: experiment_directory (optional, str): directory to store experiments in loss (seq2seq.loss.loss.Loss, optional): loss for training batch_size (int, optional): batch size for experiment checkpoint_every (int, optional): number of batches to checkpoint after """ def __init__(self, experiment_directory='./experiment', loss=None, batch_size=64, random_seed=None, checkpoint_every=100, print_every=100): if loss is None: loss = NLLLoss() if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every self.batch_size = batch_size self.experiment_directory = experiment_directory if not os.path.exists(self.experiment_directory): os.makedirs(self.experiment_directory) def train(self, model, data, n_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0): """Train a given model. Args: model (seq2seq.models): model to run training on. If resume=True, it will be overwritten by the model loaded from the latest checkpoint data (seq2seq.dataset.dataset.Dataset): dataset object to train on n_epochs (int): number of epochs to run resume(bool): resume training with the latest checkpoint dev_data (seq2seq.dataset.dataset.Dataset): dev Dataset optimizer (seq2seq.optim.Optimizer): optimizer for training teacher_forcing_ratio (float): teacher forcing ratio Returns: model (seq2seq.models): trained model. """ if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint( self.experiment_directory) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A work-around to set optimizing parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer( optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer logger.info('Optimizer: %s, Scheduler: %s', self.optimizer.optimizer, self.optimizer.scheduler) self._train_epochs(data, model, n_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio) return model def _train_epochs(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, teacher_forcing_ratio=0): print_loss_total = epoch_loss_total = 0 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=device, repeat=False, ) steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs step = start_step step_elapsed = 0 for epoch in range(start_epoch, n_epochs + 1): logger.debug('Epoch: %d, Step: %d', epoch, step) batch_generator = iter(batch_iterator) # Consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train() progress_bar = tqdm( batch_generator, total=steps_per_epoch, desc='Train {}: '.format(self.loss.name), ) for
batch in progress_bar: step += 1 step_elapsed += 1 loss = self._train_batch( batch, model, teacher_forcing_ratio, data, ) print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 \ and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 progress_bar.set_description('Train {}: {:.4f}'.format( self.loss.name, print_loss_avg, )) # Checkpoint if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint( model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab, ).save(self.experiment_directory) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min( steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = 'Finished epoch {:d}: Train {}: {:.4f}'.format( epoch, self.loss.name, epoch_loss_avg) if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate(model, dev_data) self.optimizer.update(dev_loss, epoch) log_msg += ', Dev {}: {:.4f}, Accuracy: {:.4f}'.format( self.loss.name, dev_loss, accuracy) model.train() else: self.optimizer.update(epoch_loss_avg, epoch) logger.info(log_msg) def _train_batch(self, batch, model, teacher_forcing_ratio, dataset): # Forward propagation output, _, _ = model( batch, dataset=dataset, teacher_forcing_ratio=teacher_forcing_ratio, ) # Get loss self.loss.reset() self.loss.eval_batch(output, batch) # Backward propagation model.zero_grad() self.loss.backward() self.optimizer.step() return self.loss.get_loss()
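# The skip loop at the top of each epoch consumes batches that were already
# seen before a resume. A tiny self-contained check of that arithmetic: with
# 100 steps per epoch, resuming at step 250 inside epoch 3 skips 50 batches.
steps_per_epoch = 100
epoch, step = 3, 250
skipped = range((epoch - 1) * steps_per_epoch, step)
assert len(skipped) == 50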
logging.info('Start adversarial training') g_step, d_step = 0, 0 for epoch in range(1, 20): logging.info('[Epoch %d]: train generator' % epoch) # train generator g_step = g_trainer.train(gen, dis, adv_train_iter, dev_data=adv_dev, optimizer=g_optimizer, step=g_step) # evaluate generator dev_loss, accuracy = g_evaluator.evaluate(gen, adv_dev) logging.info('Dev %s: %.4f, Accuracy: %.4f' % ('gen NLLloss', dev_loss, accuracy)) logger.scalar_summary('G-NLLloss', dev_loss, epoch) # train discriminator logging.info('[Epoch %d]: train discriminator' % epoch) samples = [ sample for sample, _, _, _ in g_trainer.gen_sample( gen, adv_train_iter, num_src=256, src2sample=1) ] batch = next(iter(real_iter)) reals = batch.tgt.data[:, 1:] # strip the leading <sos> token for _epoch in range(1, 20 + 1): _train_iter = helper.batch_gen(samples,
format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=len_filter) # Prepare loss weight = torch.ones(len(output_vocab)) pad = output_vocab.stoi[tgt.pad_token] loss = Perplexity(weight, pad) if torch.cuda.is_available(): loss.cuda() ################################################################################# # Evaluate model on test set evaluator = Evaluator(loss=loss, batch_size=opt.batch_size) losses, accuracy, totals = evaluator.evaluate(seq2seq, test) prec_tot = 0 rec_tot = 0 f1_tot = 0 for key in totals: precision = totals[key]['tp'] / (totals[key]['tp'] + totals[key]['fp']) prec_tot += precision recall = totals[key]['tp'] / (totals[key]['tp'] + totals[key]['fn']) rec_tot += recall f1 = 2 * (precision * recall) / (precision + recall) f1_tot += f1 # print(key + " precision: " + str(precision) + " recall: " + str(recall) + " f1: " + str(f1)) print(key + " & " + "{0:.3f}".format(precision) + " & " + "{0:.3f}".format(recall) + " & " + "{0:.3f}".format(f1) + " \\\\")
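# The per-class loop above assumes every class has at least one predicted and
# one actual positive. A guarded variant of the same computation, with
# illustrative counts (not data from the experiment above):
def prf1(tp, fp, fn):
    # Return (precision, recall, F1), treating empty denominators as 0.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1

print(prf1(tp=8, fp=2, fn=4))  # (0.8, 0.666..., 0.727...)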
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (list, optional): list of seq2seq.loss.Loss objects for training (default: [seq2seq.loss.NLLLoss]) metrics (list, optional): list of seq2seq.metric.metric objects to be computed during evaluation batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) print_every (int, optional): number of iterations to print after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=[NLLLoss()], loss_weights=None, metrics=[], batch_size=64, eval_batch_size=128, random_seed=None, checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.metrics = metrics self.loss_weights = loss_weights or len(loss)*[1.] self.evaluator = Evaluator(loss=self.loss, metrics=self.metrics, batch_size=eval_batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model(input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) losses = self.evaluator.compute_batch_loss(decoder_outputs, decoder_hidden, other, target_variable) # Backward propagation for i, loss in enumerate(losses, 0): loss.scale_loss(self.loss_weights[i]) loss.backward(retain_graph=True) self.optimizer.step() model.zero_grad() return losses def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, callbacks=[], monitor_data=[], teacher_forcing_ratio=0, top_k=5): self.set_callbacks(callbacks, top_k=top_k, data=data, dev_data=dev_data) iterator_device = torch.cuda.current_device() if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=iterator_device, repeat=False) val_data = dev_data or data steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs info = dict(zip(['steps_per_epoch', 'total_steps'],\ [steps_per_epoch, total_steps])) info['step'] = start_step info['start_epoch'] = start_epoch info['epoch'] = start_epoch info['start_step'] = start_step info['step_elapsed'] = 0 info['model'] = model # TODO I find this also a bit hacky # store initial model to be sure at least one model is stored val_data = dev_data or data losses, metrics = self.evaluator.evaluate(model, val_data, self.get_batch_data) info['losses'] = losses info['metrics'] = metrics self.callbacks.on_train_begin(info) # TODO this should also be in a callback logs = Log() for epoch in range(start_epoch, n_epochs + 1): self.callbacks.on_epoch_begin(epoch, info) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training
for _ in range((epoch - 1) * steps_per_epoch, info['step']): next(batch_generator) model.train(True) for batch in batch_generator: self.callbacks.on_batch_begin(info, batch) info['step'] += 1 info['step_elapsed'] += 1 input_variables, input_lengths, target_variables = self.get_batch_data(batch) losses = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio) info['losses'] = losses # print log info according to print_every param if info['step'] % self.print_every == 0 and info['step_elapsed'] >= self.print_every: m_logs = {} train_losses, train_metrics = self.evaluator.evaluate(model, data, self.get_batch_data) train_loss, train_log_msg, model_name = self.get_losses(train_losses, train_metrics, info['step']) logs.write_to_log('Train', train_losses, train_metrics, info['step']) logs.update_step(info['step']) m_logs['Train'] = train_log_msg # compute vals for all monitored sets for m_data in monitor_data: m_losses, m_metrics = self.evaluator.evaluate(model, monitor_data[m_data], self.get_batch_data) total_loss, log_msg, model_name = self.get_losses(m_losses, m_metrics, info['step']) m_logs[m_data] = log_msg logs.write_to_log(m_data, m_losses, m_metrics, info['step']) all_losses = ' '.join(['%s:\t %s\n' % (os.path.basename(name), m_logs[name]) for name in m_logs]) log_msg = 'Progress %d%%, %s' % ( info['step'] / total_steps * 100, all_losses) info['log_msg'] = log_msg # check if new model should be saved if info['step'] % self.checkpoint_every == 0 or info['step'] == total_steps: # compute dev loss losses, metrics = self.evaluator.evaluate(model, val_data, self.get_batch_data) info['val_losses'] = losses self.callbacks.on_batch_end(batch, info) if info['step_elapsed'] == 0: continue log_msg = '' if dev_data is not None: losses, metrics = self.evaluator.evaluate(model, dev_data, self.get_batch_data) loss_total, log_, model_name = self.get_losses(losses, metrics, info['step']) self.optimizer.update(loss_total, epoch) # TODO check if this makes sense! log_msg += ", Dev set: " + log_ model.train(mode=True) else: # TODO THIS IS SUPER HACKY, UPDATE IT!!! self.optimizer.update(self.callbacks.callbacks[0].epoch_loss_avg, epoch) self.callbacks.on_epoch_end(epoch, info) info['epoch'] += 1 self.callbacks.on_train_end(info) return logs def train(self, model, data, num_epochs=5, resume=False, dev_data=None, monitor_data={}, optimizer=None, teacher_forcing_ratio=0, learning_rate=0.001, checkpoint_path=None, top_k=5): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it will be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) learning_rate (float, optional): learning rate used by the optimizer (default 0.001) checkpoint_path (str, optional): path to load checkpoint from in case training should be resumed top_k (int): how many models should be stored during training Returns: model (seq2seq.models): trained model.
""" # If training is set to resume if resume: resume_checkpoint = Checkpoint.load(checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A walk around to set optimizing parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 def get_optim(optim_name): optims = {'adam': optim.Adam, 'adagrad': optim.Adagrad, 'adadelta': optim.Adadelta, 'adamax': optim.Adamax, 'rmsprop': optim.RMSprop, 'sgd': optim.SGD, None:optim.Adam} return optims[optim_name] self.optimizer = Optimizer(get_optim(optimizer)(model.parameters(), lr=learning_rate), max_grad_norm=5) self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) logs = self._train_epoches(data, model, num_epochs, start_epoch, step, dev_data=dev_data, monitor_data=monitor_data, teacher_forcing_ratio=teacher_forcing_ratio, top_k=top_k) return model, logs def set_callbacks(self, callbacks, data, dev_data, top_k): """ Create a callback collection and associate it with the current trainer. """ # every training outcomes are logged and models callbacks = [Logger(), ModelCheckpoint(data, dev_data, top_k=top_k)] + callbacks self.callbacks = CallbackContainer(callbacks) self.callbacks.set_trainer(self) @staticmethod def get_batch_data(batch): input_variables, input_lengths = getattr(batch, seq2seq.src_field_name) target_variables = {'decoder_output': getattr(batch, seq2seq.tgt_field_name), 'encoder_input': input_variables} # The k-grammar metric needs to have access to the inputs # If available, also get provided attentive guidance data if hasattr(batch, seq2seq.attn_field_name): attention_target = getattr(batch, seq2seq.attn_field_name) # When we ignore output EOS, the sequence target will not contain the EOS, but if present # in the data, the attention indices might still. We should remove this. target_length = target_variables['decoder_output'].size(1) attn_length = attention_target.size(1) # If the attention sequence is exactly 1 longer than the output sequence, the EOS attention # index is present. if attn_length == target_length + 1: # First we replace each of these indices with a -1. This makes sure that the hard # attention method will not attend to an input that might not be present (the EOS) # We need this if there are attentions of multiple lengths in a bath attn_eos_indices = input_lengths.unsqueeze(1) + 1 attention_target = attention_target.scatter_(dim=1, index=attn_eos_indices, value=-1) # Next we also make sure that the longest attention sequence in the batch is truncated attention_target = attention_target[:, :-1] target_variables['attention_target'] = attention_target return input_variables, input_lengths, target_variables @staticmethod def get_losses(losses, metrics, step): total_loss = 0 model_name = '' log_msg= '' for metric in metrics: val = metric.get_val() log_msg += '%s %.4f ' % (metric.name, val) model_name += '%s_%.2f_' % (metric.log_name, val) for loss in losses: val = loss.get_loss() log_msg += '%s %.4f ' % (loss.name, val) model_name += '%s_%.2f_' % (loss.log_name, val) total_loss += val model_name += 's%d' % step return total_loss, log_msg, model_name
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (str, optional): directory in which details of the experiment are stored; created under the current working directory if it does not exist (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100, input_vocab=None, output_vocab=None): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size, input_vocab=input_vocab) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every self.input_vocab = input_vocab self.output_vocab = output_vocab if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model( input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, teacher_forcing_ratio=0): log = self.logger print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch device = torch.device('cuda:0') if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=False, device=device, repeat=False, shuffle=True) steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs step = start_step step_elapsed = 0 best_acc = 0 # to track the training loss as the model trains train_losses = [] # to track the average training loss per epoch as the model trains avg_train_losses = [] # to track the average validation loss per epoch as the model trains avg_valid_losses = [] early_stopping = EarlyStopping(patience=7, verbose=True) for epoch in range(start_epoch, n_epochs + 1): log.debug("Epoch: %d, Step: %d" % (epoch, step)) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 target_variables = getattr(batch, 'tgt') pos_input_variables = [[] for i in range(batch.batch_size)] pos_input_lengths = [[] for i in range(batch.batch_size)] neg_input_variables = [[] for i in range(batch.batch_size)] neg_input_lengths = [[] for i in range(batch.batch_size)] set_size = len(batch.fields) - 1 max_len_within_batch = -1
for idx in range(batch.batch_size): for src_idx in range(1, int(set_size / 2) + 1): src, src_len = getattr(batch, 'pos{}'.format(src_idx)) pos_input_variables[idx].append(src[idx]) pos_input_lengths[idx].append(src_len[idx]) for src_idx in range(1, int(set_size / 2) + 1): src, src_len = getattr(batch, 'neg{}'.format(src_idx)) neg_input_variables[idx].append(src[idx]) neg_input_lengths[idx].append(src_len[idx]) pos_input_lengths[idx] = torch.stack( pos_input_lengths[idx], dim=0) neg_input_lengths[idx] = torch.stack( neg_input_lengths[idx], dim=0) if max_len_within_batch < torch.max( pos_input_lengths[idx].view(-1)).item(): max_len_within_batch = torch.max( pos_input_lengths[idx].view(-1)).item() if max_len_within_batch < torch.max( neg_input_lengths[idx].view(-1)).item(): max_len_within_batch = torch.max( neg_input_lengths[idx].view(-1)).item() # pad_tensor is sketched after this class for batch_idx in range(len(pos_input_variables)): for set_idx in range(int(set_size / 2)): pos_input_variables[batch_idx][set_idx] = pad_tensor( pos_input_variables[batch_idx][set_idx], max_len_within_batch, self.input_vocab) neg_input_variables[batch_idx][set_idx] = pad_tensor( neg_input_variables[batch_idx][set_idx], max_len_within_batch, self.input_vocab) pos_input_variables[batch_idx] = torch.stack( pos_input_variables[batch_idx], dim=0) neg_input_variables[batch_idx] = torch.stack( neg_input_variables[batch_idx], dim=0) pos_input_variables = torch.stack(pos_input_variables, dim=0) pos_input_lengths = torch.stack(pos_input_lengths, dim=0) neg_input_variables = torch.stack(neg_input_variables, dim=0) neg_input_lengths = torch.stack(neg_input_lengths, dim=0) input_variables = (pos_input_variables, neg_input_variables) input_lengths = (pos_input_lengths, neg_input_lengths) loss = self._train_batch(input_variables, input_lengths, target_variables, model, teacher_forcing_ratio) train_losses.append(loss) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Progress: %d%%, Train %s: %.4f' % ( step / total_steps * 100, self.loss.name, print_loss_avg) log.info(log_msg) train_loss = np.average(train_losses) avg_train_losses.append(train_loss) # clear lists to track next epoch train_losses = [] if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f, Avg: %.4f" % ( epoch, self.loss.name, epoch_loss_avg, train_loss) if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate(model, dev_data) avg_valid_losses.append(dev_loss) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % ( self.loss.name, dev_loss, accuracy) early_stopping(dev_loss, model, self.optimizer, epoch, step, self.input_vocab, self.output_vocab, self.expt_dir) self.optimizer.update(dev_loss, epoch) if accuracy > best_acc: log.info( 'accuracy increased: previous best {:.4f}, current {:.4f}' .format(best_acc, accuracy)) best_acc = accuracy model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) if early_stopping.early_stop: print("Early stopping") break log.info(log_msg) return avg_train_losses, avg_valid_losses def train(self, model, data, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0): """ Run training for a given model.
Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(torch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) Returns: model (seq2seq.models): trained model. """ # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint( self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A workaround to set the optimizer parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) train_loss, valid_loss = self._train_epoches( data, model, num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio) visualize_loss(train_loss, valid_loss, self.expt_dir) return model
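Two helpers used by the trainer above are defined elsewhere. First, pad_tensor: a plausible minimal implementation, assuming the vocabulary object exposes a torchtext-style stoi map with a '<pad>' token, could look like this:

import torch

def pad_tensor(tensor, max_len, vocab):
    """Right-pad a 1-D token tensor with the pad index up to max_len."""
    pad_idx = vocab.stoi['<pad>']  # assumption: torchtext-style vocab
    if tensor.size(0) >= max_len:
        return tensor[:max_len]
    padding = tensor.new_full((max_len - tensor.size(0),), pad_idx)
    return torch.cat([tensor, padding], dim=0)

Second, EarlyStopping: a minimal sketch consistent with the call signature and the early_stop flag used in _train_epoches above. The Checkpoint call inside it is an assumption modeled on the other trainers in this document:

class EarlyStopping(object):
    """Stop training when the dev loss has not improved for patience epochs."""
    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, dev_loss, model, optimizer, epoch, step,
                 input_vocab, output_vocab, expt_dir):
        if self.best_loss is None or dev_loss < self.best_loss:
            # Improvement: reset the counter and checkpoint the model.
            self.best_loss = dev_loss
            self.counter = 0
            Checkpoint(model=model, optimizer=optimizer, epoch=epoch,
                       step=step, input_vocab=input_vocab,
                       output_vocab=output_vocab).save(expt_dir)
        else:
            self.counter += 1
            if self.verbose:
                print('EarlyStopping counter: %d out of %d' % (self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True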
tgt = TargetField() src.vocab = input_vocab tgt.vocab = output_vocab max_len = opt.max_len def len_filter(example): return len(example.src) <= max_len and len(example.tgt) <= max_len # generate test set test = torchtext.data.TabularDataset( path=opt.test_data, format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=len_filter ) # Prepare loss weight = torch.ones(len(output_vocab)) pad = output_vocab.stoi[tgt.pad_token] loss = Perplexity(weight, pad) if torch.cuda.is_available(): loss.cuda() ################################################################################# # Evaluate model on test set evaluator = Evaluator(loss=loss, batch_size=opt.batch_size) loss, accuracy = evaluator.evaluate(seq2seq, test) print("Loss: %f, accuracy: %f" % (loss, accuracy))
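Why the pad index is passed alongside the all-ones weight vector above: the loss zeroes the weight of the pad class so padding positions do not inflate the perplexity. The snippet below mirrors that masking idea in plain PyTorch; it is a sketch of the principle, not the library's internal code:

import torch
import torch.nn as nn

vocab_size, pad = 10, 1             # illustrative values
weight = torch.ones(vocab_size)
weight[pad] = 0                     # pad tokens contribute nothing to the NLL
criterion = nn.NLLLoss(weight=weight, reduction='sum')
# perplexity = exp(total NLL / number of non-pad tokens)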
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (optional, str): experiment Directory to store details of the experiment, by default it makes a folder in the current directory to store the details (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model(input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, teacher_forcing_ratio=0): log = self.logger print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch device = None if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=True, sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs step = start_step step_elapsed = 0 for epoch in range(start_epoch, n_epochs + 1): log.debug("Epoch: %d, Step: %d" % (epoch, step)) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 input_variables, input_lengths = getattr(batch, seq2seq.src_field_name) target_variables = getattr(batch, seq2seq.tgt_field_name) loss = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Progress: %d%%, Train %s: %.4f' % ( step / total_steps * 100, self.loss.name, print_loss_avg) log.info(log_msg) # Checkpoint if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab,
output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss.name, epoch_loss_avg) if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate(model, dev_data) self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (self.loss.name, dev_loss, accuracy) model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) log.info(log_msg) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(torch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) Returns: model (seq2seq.models): trained model. """ # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A workaround to set the optimizer parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio) return model
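Why the resume workaround above pops 'params' (and 'initial_lr'): param_groups[0] holds the hyperparameter defaults together with the parameter list, and re-instantiating the optimizer class with the stale 'params' entry, or the 'initial_lr' key that LR schedulers add, passed as keyword arguments would fail. A toy demonstration, independent of this library:

import torch
import torch.optim as optim

model = torch.nn.Linear(4, 2)
opt = optim.Adam(model.parameters(), lr=1e-3)

defaults = dict(opt.param_groups[0])  # copy, so the live group is untouched
defaults.pop('params', None)          # a tensor list, not an Adam.__init__ keyword
defaults.pop('initial_lr', None)      # added by LR schedulers, also not a keyword

restored = opt.__class__(model.parameters(), **defaults)
print(type(restored).__name__, restored.param_groups[0]['lr'])  # Adam 0.001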
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (optional, str): experiment Directory to store details of the experiment, by default it makes a folder in the current directory to store the details (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, model, batch, vocab, num_antecedents=400, teacher_forcing_ratio=0): loss = self.loss # Forward propagation result, target, _ = model(batch, vocab, num_antecedents) # Get loss loss.reset() result = [torch.log(sample) for sample in result] for i in range(len(result)): for j in range(len(result[i])): if 0 < target[i][j] < num_antecedents: arg1 = result[i][j].unsqueeze(0) arg2 = torch.tensor([target[i][j]]) if torch.cuda.is_available(): arg2 = arg2.cuda() loss.acc_loss += loss.criterion(arg1, arg2) loss.norm_term += 1 # Backward propagation model.zero_grad() lvalue = loss.get_loss() if lvalue >= 0: loss.backward() self.optimizer.step() else: raise AssertionError("NAN Triggered!") return lvalue def _train_epoches(self, data_train, data_valid, model, n_epochs, start_epoch, start_step, save_file=False, dev_data=None, teacher_forcing_ratio=0, log_dir=None, embed_file=None, num_antecedents=400): log = self.logger # embed = Embed(embed_file) embed = [] print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch plan_loss_total = 0 construct_loss_total = 0 device = torch.device('cuda', 0) if torch.cuda.is_available() else None batch_generator = data_train.__iter__() steps_per_epoch = len(batch_generator) print("Steps per epoch: ", steps_per_epoch) total_steps = steps_per_epoch * n_epochs """ dev_loss, recall, precision, F = self.evaluator.evaluate(model, data_train, log_dir=log_dir, cur_step=0) exit(0) """ step = start_step step_elapsed = 0 for epoch in range(start_epoch, n_epochs + 1): log_file = open(log_dir + "/log.txt", "a+") print("Epoch: %d, Step: %d" % (epoch, step)) # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 all_loss = self._train_batch(model, batch, data_train.vocab, num_antecedents=num_antecedents) loss = all_loss # FOR NAN DEBUG if not loss >= 0: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data_train.vocab).save( self.expt_dir) print("Nan Triggered!
Model has been saved.") exit(0) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Step %d, Progress: %d%%, Train %s: %.2f, ' % ( step, step / total_steps * 100, self.loss.name, print_loss_avg) log.info(log_msg) if step % 200 == 0: if log_file: log_file.write("Step " + str(step) + '\n') dev_loss, recall, precision, F = self.evaluator.evaluate( model, data_valid, cur_step=step, log_dir=log_dir, log_file=log_file) # self.optimizer.update(dev_loss, epoch) # log_msg = "Step %d, Dev %s: %.4f, Accuracy: %.4f" % (step, self.loss.name, dev_loss, accuracy) log_msg = "Step %d, Dev %s: %.4f, Recall: %.4f, Precision: %.4f, F: %.4f" % ( step, self.loss.name, dev_loss, recall, precision, F) log.info(log_msg) model.train(mode=True) # Checkpoint """ if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir) """ if save_file: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data_train.vocab).save(self.expt_dir) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) plan_loss_avg = plan_loss_total / min(steps_per_epoch, step - start_step) construct_loss_avg = construct_loss_total / min( steps_per_epoch, step - start_step) epoch_loss_total = 0 plan_loss_total = 0 construct_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f Plan Loss: %.4f" % ( epoch, self.loss.name, epoch_loss_avg, plan_loss_avg) if dev_data is not None: if log_file: log_file.write("Step " + str(step) + '\n') log_file.write("Train Average Loss: " + str(epoch_loss_avg) + " Plan Loss: " + str(plan_loss_avg) + " Construct Loss: " + str(construct_loss_avg) + '\n') dev_loss, accuracy = self.evaluator.evaluate( model, dev_data, log_dir=log_dir, cur_step=step, log_file=log_file) self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % ( self.loss.name, dev_loss, accuracy) model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) log.info(log_msg) log_file.close() Checkpoint(model=model, optimizer=self.optimizer, epoch=n_epochs, step=step, input_vocab=data_train.vocab).save(self.expt_dir) def train(self, model, data_train, data_valid, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0, src_vocab=None, cpt_vocab=None, tgt_vocab=None, use_concept=False, vocabs=None, save_file=False, log_dir=None, embed_file=None, num_antecedents=400): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint.
data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(torch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) log_dir (str, optional): directory to write evaluation logs to (default None) Returns: model (seq2seq.models): trained model. """ # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint( self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A workaround to set the optimizer parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters(), weight_decay=0), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data_train, data_valid, model, num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio, log_dir=log_dir, embed_file=embed_file, save_file=save_file, num_antecedents=num_antecedents) return model
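A toy illustration of the manual loss accumulation in _train_batch above: each valid antecedent contributes one NLL term through the criterion, and norm_term averages them. nn.NLLLoss stands in for loss.criterion here, which is an assumption about the loss object:

import torch
import torch.nn as nn

criterion = nn.NLLLoss()
log_probs = torch.log(torch.tensor([[0.7, 0.2, 0.1]]))  # one predicted distribution
target = torch.tensor([0])                              # gold antecedent index

acc_loss = criterion(log_probs, target)  # -log(0.7)
norm_term = 1
print(acc_loss.item() / norm_term)       # ~0.3567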
label_word2index = checkpoint.label_word2index label_index2word = checkpoint.label_index2word if not os.path.isfile(opt.gaussian_dict_path): print('calculating means and stds of box positions and sizes...') get_class_sta(opt.train_path, opt.gaussian_dict_path) gaussian_dict = np.load(opt.gaussian_dict_path, allow_pickle=True).item() hidden_size = opt.embedding_dim encoder = PreEncoderRNN(len(cap_word2index), nhidden=opt.embedding_dim) state_dict = torch.load(opt.encoder_path, map_location=lambda storage, loc: storage) encoder.load_state_dict(state_dict) encoder.eval() if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # prepare dataset dev_cap_lang, dev_label_lang, dev_tuples, x_mean_std, y_mean_std, w_mean_std, r_mean_std, \ keys = prepare_test_data(opt.dev_path, opt.mean_std_path, opt.max_len, opt.min_len, cap_word2index, cap_index2word, label_word2index, label_index2word, opt.dev_filename_path) evaluator = Evaluator(opt.batch_size, opt.early_stop_len, opt.expt_dir, dev_cap_lang, dev_label_lang, x_mean_std, y_mean_std, w_mean_std, r_mean_std, gaussian_dict, opt.box_saving_folder, opt.output_opt) evaluator.evaluate(encoder, decoder, dev_tuples, keys)
def run_training(opt, default_data_dir, num_epochs=100): if opt.load_checkpoint is not None: logging.info("loading checkpoint from {}".format( os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint))) checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint) checkpoint = Checkpoint.load(checkpoint_path) seq2seq = checkpoint.model input_vocab = checkpoint.input_vocab output_vocab = checkpoint.output_vocab else: # Prepare dataset src = SourceField() tgt = TargetField() max_len = 50 data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt') logging.info("Starting new Training session on %s", data_file) def len_filter(example): return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \ and (len(example.src) > 0) and (len(example.tgt) > 0) train = torchtext.data.TabularDataset( path=data_file, format='json', fields={'src': ('src', src), 'tgt': ('tgt', tgt)}, filter_pred=len_filter ) dev = None if opt.no_dev is False: dev_data_file = os.path.join(default_data_dir, opt.train_path, 'dev-data.txt') dev = torchtext.data.TabularDataset( path=dev_data_file, format='json', fields={'src': ('src', src), 'tgt': ('tgt', tgt)}, filter_pred=len_filter ) src.build_vocab(train, max_size=50000) tgt.build_vocab(train, max_size=50000) input_vocab = src.vocab output_vocab = tgt.vocab # NOTE: If the source field name and the target field name # are different from 'src' and 'tgt' respectively, they have # to be set explicitly before any training or inference # seq2seq.src_field_name = 'src' # seq2seq.tgt_field_name = 'tgt' # Prepare loss weight = torch.ones(len(tgt.vocab)) pad = tgt.vocab.stoi[tgt.pad_token] loss = Perplexity(weight, pad) if torch.cuda.is_available(): logging.info("CUDA device found; moving loss to GPU") loss.cuda() else: logging.info("No CUDA device found; running on CPU") seq2seq = None optimizer = None if not opt.resume: hidden_size = 128 decoder_hidden_size = hidden_size * 2 logging.info("EncoderRNN Hidden Size: %s", hidden_size) logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size) bidirectional = True encoder = EncoderRNN(len(src.vocab), max_len, hidden_size, bidirectional=bidirectional, rnn_cell='lstm', variable_lengths=True) decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size, dropout_p=0, use_attention=True, bidirectional=bidirectional, rnn_cell='lstm', eos_id=tgt.eos_id, sos_id=tgt.sos_id) seq2seq = Seq2seq(encoder, decoder) if torch.cuda.is_available(): seq2seq.cuda() for param in seq2seq.parameters(): param.data.uniform_(-0.08, 0.08) # Optimizer and learning rate scheduler can be customized by # explicitly constructing the objects and passing them to the trainer.
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5) scheduler = StepLR(optimizer.optimizer, 1) optimizer.set_scheduler(scheduler) # train batch_size = 32 checkpoint_every = max(1, num_epochs // 10) print_every = max(1, num_epochs // 100) properties = dict(batch_size=batch_size, checkpoint_every=checkpoint_every, print_every=print_every, expt_dir=opt.expt_dir, num_epochs=num_epochs, teacher_forcing_ratio=0.5, resume=opt.resume) logging.info("Starting training with the following Properties %s", json.dumps(properties, indent=2)) t = SupervisedTrainer(loss=loss, batch_size=batch_size, checkpoint_every=checkpoint_every, print_every=print_every, expt_dir=opt.expt_dir) seq2seq = t.train(seq2seq, train, num_epochs=num_epochs, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=opt.resume) evaluator = Evaluator(loss=loss, batch_size=batch_size) if opt.no_dev is False: dev_loss, accuracy = evaluator.evaluate(seq2seq, dev) logging.info("Dev Loss: %s", dev_loss) logging.info("Accuracy: %s", accuracy) beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4)) predictor = Predictor(beam_search, input_vocab, output_vocab) while True: try: seq_str = raw_input("Type in a source sequence:") seq = seq_str.strip().split() results = predictor.predict_n(seq, n=3) for i, res in enumerate(results): print('option %s: %s\n' % (i + 1, res)) except KeyboardInterrupt: logging.info("Bye Bye") exit(0)
class SupervisedAdversarialTrainer(object): """ Args: expt_dir (optional, str): experiment Directory to store details of the experiment, by default it makes a folder in the current directory to store the details (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 1000) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=1000, print_every=100, tensorboard=True, batch_adv_loss=NLLLoss()): self._trainer = "Adversarial Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) self.writer = SummaryWriter(log_dir=expt_dir) if tensorboard else None self.batch_adv_loss = batch_adv_loss def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model( input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _get_best_attack(self, batch, model, attacks): if attacks is None or len(attacks) == 0: return seq2seq.src_field_name, -1, {} else: model.eval() loss = self.batch_adv_loss d = {} with torch.no_grad(): for attack in attacks: input_variables, input_lengths = getattr(batch, attack) target_variables = getattr(batch, seq2seq.tgt_field_name) decoder_outputs, decoder_hidden, other = model( input_variables, input_lengths.tolist(), target_variables) loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variables.size(0) loss.eval_batch( step_output.contiguous().view(batch_size, -1), target_variables[:, step + 1]) d[attack] = loss.get_loss() model.train() best_loss = max(d.values()) best_attack = max(d, key=d.get) return best_attack, best_loss, d def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, teacher_forcing_ratio=0, attacks=None, lamb=0.0): # Train adversarially with lamb*normal loss + (1-lamb)*adv_loss # lamb should either be a float or a list of floats of length (n_epochs+1-start_epoch) log = self.logger if isinstance(lamb, float): lamb = [lamb] * (n_epochs + 1 - start_epoch) print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # device = None if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) total_steps =
steps_per_epoch * n_epochs self.print_every = max(1, steps_per_epoch // 25) log.info('Steps per epoch: %d' % steps_per_epoch) log.info('Total steps: %d' % total_steps) step = start_step step_elapsed = 0 # num_checkpoints = 25 # self.checkpoint_every = (total_steps+1)//num_checkpoints if attacks is not None: chosen_attack_counts = {x: 0 for x in attacks} else: chosen_attack_counts = {} if start_step > 0 and dev_data is not None: d = self.evaluator.evaluate(model, dev_data) dev_loss = d['metrics']['Loss'] accuracy = d['metrics']['accuracy (torch)'] other_metrics = d['metrics'] best_f1 = other_metrics['f1'] best_acc = accuracy else: best_f1 = 0.0 best_acc = 0.0 lidx = 0 for epoch in range(start_epoch, n_epochs + 1): lamb_epoch = lamb[lidx] if lidx < len(lamb) else lamb[-1] lidx += 1 log.info("Epoch: %d, Step: %d, Lambda: %.2f" % (epoch, step, lamb_epoch)) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 chosen_src_field_name, max_loss, d = self._get_best_attack( batch, model, attacks) if attacks is not None and len(attacks) > 0: chosen_attack_counts[chosen_src_field_name] += 1 # print(chosen_src_field_name, max_loss, d) # exit() self.loss.reset() if lamb_epoch > 0: # normal training term input_variables, input_lengths = getattr( batch, seq2seq.src_field_name) target_variables = getattr(batch, seq2seq.tgt_field_name) decoder_outputs, decoder_hidden, other = model( input_variables, input_lengths, target_variables, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss for step1, step_output in enumerate(decoder_outputs): batch_size = target_variables.size(0) self.loss.eval_batch(step_output.contiguous().view( batch_size, -1), target_variables[:, step1 + 1], weight=lamb_epoch) # adversarial training term input_variables, input_lengths = getattr( batch, chosen_src_field_name) target_variables = getattr(batch, seq2seq.tgt_field_name) decoder_outputs, decoder_hidden, other = model( input_variables, input_lengths, target_variables, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss for step2, step_output in enumerate(decoder_outputs): batch_size = target_variables.size(0) self.loss.eval_batch(step_output.contiguous().view( batch_size, -1), target_variables[:, step2 + 1], weight=(1 - lamb_epoch)) model.zero_grad() self.loss.backward() self.optimizer.step() loss_adv = self.loss.get_loss() # loss = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio) # Record average loss print_loss_total += loss_adv epoch_loss_total += loss_adv if step % self.print_every == 0 and step_elapsed >= self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Epoch: %d, Step: %d, Progress: %d%%, Train %s: %.4f' % ( epoch, step, step / total_steps * 100, self.loss.name, print_loss_avg) log.info(log_msg) if self.writer: self.writer.add_scalar('Train/loss_step', print_loss_avg, step) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f" % ( epoch, self.loss.name, epoch_loss_avg) if self.writer: self.writer.add_scalar('Train/loss_epoch', epoch_loss_avg, epoch) other_metrics = {} if dev_data is not None: d = self.evaluator.evaluate(model, dev_data) dev_loss = d['metrics']['Loss'] accuracy = d['metrics']['accuracy (torch)'] other_metrics = d['metrics']
self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % ( self.loss.name, dev_loss, accuracy) if self.writer: self.writer.add_scalar('Val/loss', dev_loss, epoch) self.writer.add_scalar('Val/acc', accuracy, epoch) for metric in other_metrics: try: log_msg += ", %s: %.4f" % (metric.replace( ' ', '_').replace('-', '_'), other_metrics[metric]) if self.writer: self.writer.add_scalar( 'Val/%s' % metric.replace(' ', '_').replace('-', '_'), other_metrics[metric], epoch) except (TypeError, ValueError): continue log.info(log_msg) if other_metrics['f1'] > best_f1: Checkpoint( model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir, name='Best_F1') log_msg = 'Checkpoint saved, Epoch %d, Prev Val F1: %.4f, New Val F1: %.4f' % ( epoch, best_f1, other_metrics['f1']) log.info(log_msg) best_f1 = other_metrics['f1'] # if accuracy > best_acc: # Checkpoint(model=model, # optimizer=self.optimizer, # epoch=epoch, step=step, # input_vocab=data.fields[seq2seq.src_field_name].vocab, # output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir, name='Best_Acc') # log_msg = 'Checkpoint saved, Epoch %d, Prev Val Acc: %.4f, New Val Acc: %.4f' % (epoch, best_acc, accuracy) # log.info(log_msg) # best_acc = accuracy model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) log.info(log_msg) Checkpoint( model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save( self.expt_dir, name='Latest') log_msg = 'Latest Checkpoint saved, Epoch %d, %s' % ( epoch, str(other_metrics)) log.info(log_msg) log.info(str(chosen_attack_counts)) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0, load_checkpoint=None, attacks=None, lamb=0.5): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(torch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) Returns: model (seq2seq.models): trained model.
""" # If training is set to resume if resume: if load_checkpoint is None: load_checkpoint = Checkpoint.get_latest_checkpoint( self.expt_dir) resume_checkpoint = Checkpoint.load(load_checkpoint) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A walk around to set optimizing parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__( model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step self.logger.info("Resuming training from %d epoch, %d step" % (start_epoch, step)) else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, start_epoch + num_epochs if resume else num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio, attacks=attacks, lamb=lamb) return model
class SupervisedTrainer(object): """ The SupervisedTrainer class helps in setting up a training framework in a supervised setting. Args: expt_dir (optional, str): experiment Directory to store details of the experiment, by default it makes a folder in the current directory to store the details (default: `experiment`). loss (seq2seq.loss.loss.Loss, optional): loss for training, (default: seq2seq.loss.NLLLoss) batch_size (int, optional): batch size for experiment, (default: 64) checkpoint_every (int, optional): number of batches to checkpoint after, (default: 100) """ def __init__(self, expt_dir='experiment', loss=NLLLoss(), batch_size=64, random_seed=None, checkpoint_every=100, print_every=100): self._trainer = "Simple Trainer" self.random_seed = random_seed if random_seed is not None: random.seed(random_seed) torch.manual_seed(random_seed) self.loss = loss self.evaluator = Evaluator(loss=self.loss, batch_size=batch_size) self.optimizer = None self.checkpoint_every = checkpoint_every self.print_every = print_every if not os.path.isabs(expt_dir): expt_dir = os.path.join(os.getcwd(), expt_dir) self.expt_dir = expt_dir if not os.path.exists(self.expt_dir): os.makedirs(self.expt_dir) self.batch_size = batch_size self.logger = logging.getLogger(__name__) def _train_batch(self, input_variable, input_lengths, target_variable, model, teacher_forcing_ratio): loss = self.loss # Forward propagation decoder_outputs, decoder_hidden, other = model(input_variable, input_lengths, target_variable, teacher_forcing_ratio=teacher_forcing_ratio) # Get loss loss.reset() for step, step_output in enumerate(decoder_outputs): batch_size = target_variable.size(0) loss.eval_batch(step_output.contiguous().view(batch_size, -1), target_variable[:, step + 1]) # Backward propagation model.zero_grad() loss.backward() self.optimizer.step() return loss.get_loss() def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, teacher_forcing_ratio=0): log = self.logger print_loss_total = 0 # Reset every print_every epoch_loss_total = 0 # Reset every epoch device = None if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=False, sort_within_batch=True, sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) total_steps = steps_per_epoch * n_epochs step = start_step step_elapsed = 0 for epoch in range(start_epoch, n_epochs + 1): log.debug("Epoch: %d, Step: %d" % (epoch, step)) batch_generator = batch_iterator.__iter__() # consuming seen batches from previous training for _ in range((epoch - 1) * steps_per_epoch, step): next(batch_generator) model.train(True) for batch in batch_generator: step += 1 step_elapsed += 1 input_variables, input_lengths = getattr(batch, seq2seq.src_field_name) target_variables = getattr(batch, seq2seq.tgt_field_name) loss = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio) # Record average loss print_loss_total += loss epoch_loss_total += loss if step % self.print_every == 0 and step_elapsed > self.print_every: print_loss_avg = print_loss_total / self.print_every print_loss_total = 0 log_msg = 'Progress: %d%%, Train %s: %.4f' % ( step / total_steps * 100, self.loss.name, print_loss_avg) log.info(log_msg) # Checkpoint if step % self.checkpoint_every == 0 or step == total_steps: Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step, input_vocab=data.fields[seq2seq.src_field_name].vocab, 
output_vocab=data.fields[seq2seq.tgt_field_name].vocab).save(self.expt_dir) if step_elapsed == 0: continue epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step) epoch_loss_total = 0 log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss.name, epoch_loss_avg) if dev_data is not None: dev_loss, accuracy = self.evaluator.evaluate(model, dev_data) self.optimizer.update(dev_loss, epoch) log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (self.loss.name, dev_loss, accuracy) model.train(mode=True) else: self.optimizer.update(epoch_loss_avg, epoch) log.info(log_msg) def train(self, model, data, num_epochs=5, resume=False, dev_data=None, optimizer=None, teacher_forcing_ratio=0): """ Run training for a given model. Args: model (seq2seq.models): model to run training on, if `resume=True`, it would be overwritten by the model loaded from the latest checkpoint. data (seq2seq.dataset.dataset.Dataset): dataset object to train on num_epochs (int, optional): number of epochs to run (default 5) resume(bool, optional): resume training with the latest checkpoint, (default False) dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None) optimizer (seq2seq.optim.Optimizer, optional): optimizer for training (default: Optimizer(torch.optim.Adam, max_grad_norm=5)) teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0) Returns: model (seq2seq.models): trained model. """ # If training is set to resume if resume: latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir) resume_checkpoint = Checkpoint.load(latest_checkpoint_path) model = resume_checkpoint.model self.optimizer = resume_checkpoint.optimizer # A workaround to set the optimizer parameters properly resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch step = resume_checkpoint.step else: start_epoch = 1 step = 0 if optimizer is None: optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5) self.optimizer = optimizer self.logger.info("Optimizer: %s, Scheduler: %s" % (self.optimizer.optimizer, self.optimizer.scheduler)) self._train_epoches(data, model, num_epochs, start_epoch, step, dev_data=dev_data, teacher_forcing_ratio=teacher_forcing_ratio) return model
# input_vocab=input_vocab, # output_vocab=output_vocab, # use_output_eos=output_eos_used, # input_pad_symbol=src.pad_token, # output_sos_symbol=tgt.SYM_SOS, # output_pad_symbol=tgt.pad_token, # output_eos_symbol=tgt.SYM_EOS, # output_unk_symbol=tgt.unk_token)) data_func = SupervisedTrainer.get_batch_data ########################################################################## # Evaluate model on test set evaluator = Evaluator(batch_size=opt.batch_size, loss=losses, metrics=metrics) losses, metrics = evaluator.evaluate( model=seq2seq, data=test, get_batch_data=data_func) total_loss, log_msg, _ = SupervisedTrainer.get_losses(losses, metrics, 0) logging.info(log_msg)