def train(self, model, data, num_epochs=5,
          resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`, it is
            overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from the latest checkpoint (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimizer's parameter groups properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, num_epochs,
                        start_epoch, step, dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
def train(self, model, data, teacher_model=None, num_epochs=5,
          resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimizer's parameter groups properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, teacher_model, num_epochs,
                        start_epoch, step, dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
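# Hedged usage sketch (not taken from this project): how a trainer exposing the
# train() methods above might be driven. The names SupervisedTrainer, Perplexity
# and Optimizer follow the upstream pytorch-seq2seq API that these snippets are
# based on; this fork's actual class and module names may differ.
import torch
from seq2seq.trainer import SupervisedTrainer
from seq2seq.loss import Perplexity

def run_training(seq2seq_model, train_data, dev_data, expt_dir, pad_id, vocab_size):
    # perplexity (NLL-based) loss with the padding token masked out
    weight = torch.ones(vocab_size)
    loss = Perplexity(weight, pad_id)
    if torch.cuda.is_available():
        loss.cuda()
    trainer = SupervisedTrainer(loss=loss, batch_size=32,
                                checkpoint_every=50, print_every=100,
                                expt_dir=expt_dir)
    # optimizer=None falls back to Adam with max_grad_norm=5, as in train() above
    return trainer.train(seq2seq_model, train_data,
                         num_epochs=6, dev_data=dev_data,
                         optimizer=None, teacher_forcing_ratio=0.5,
                         resume=False)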
validation_float_data = scaler.transform(validation_data)
validation_array_list.append(validation_float_data)

seqence_len = 72
output_dim = 3
delay = 36

if opt.load_checkpoint is not None:
    logging.info("loading checkpoint from {}".format(
        os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)))
    checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                                   opt.load_checkpoint)
    checkpoint = Checkpoint.load(checkpoint_path)
    seq2seq = checkpoint.model.to(device)
else:
    # Prepare dataset
    train = WFDataset(train_array_list, delay=delay, seq_len=seqence_len, outdim=3,
                      transform=transforms.Compose([ToTensor()]))
    dev = WFDataset(validation_array_list, delay=delay, seq_len=seqence_len, outdim=3,
                    transform=transforms.Compose([ToTensor()]), begin_index=100)

    if opt.use_custome_loss:
def _train_epoches(self, data, model, n_epochs, dev_data=None, test_data=None):
    labeled_dataset = torchtext.data.Dataset(
        data, fields=[('text', self.TEXT_field), ('label', self.LABEL_field)])
    label_batch_iter = torchtext.data.BucketIterator(
        dataset=labeled_dataset, batch_size=128,
        sort_key=lambda x: len(x.text), sort_within_batch=True,
        device=self.device, repeat=False, shuffle=True)

    log = self.logger
    early_stopping = EarlyStopping(patience=2, verbose=True)
    best_accuracy = 0

    for epoch in range(0, n_epochs):
        model.train()
        loss_total = 0
        step = 0
        for batch in label_batch_iter:
            input_variables, input_lengths = batch.text
            target_variables = batch.label
            loss = self._train_batch(input_variables, input_lengths.tolist(),
                                     target_variables, model)
            loss_total += loss.item()
            step += 1
            del loss, batch

        epoch_loss_avg = loss_total / step
        log_msg = "Finished epoch %d: SSL Train %s: %.4f" % (
            epoch, 'Cross_Entropy', epoch_loss_avg)

        with torch.no_grad():
            if dev_data is not None:
                model.eval()
                dev_loss, dev_acc = self.evaluator.evaluate(model, dev_data)
                self.dev_acc = dev_acc
                log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (
                    'Cross_Entropy', dev_loss, dev_acc)
                log.info(log_msg)

                early_stopping(dev_loss, model, self.optimizer, epoch, step,
                               self.input_vocab, self.expt_dir)
                print('early stopping : ', early_stopping.counter)

                if self.dev_acc > best_accuracy:
                    # dev_acc serves as the global best-accuracy variable
                    best_accuracy = self.dev_acc
                    Checkpoint(model=model, optimizer=self.optimizer,
                               epoch=epoch, step=step,
                               input_vocab=self.input_vocab).save(
                                   self.expt_dir + '/best_accuracy')
                    print('*' * 100)
                    print('SAVE MODEL (BEST DEV ACC)')

            if test_data is not None:
                model.eval()
                test_loss, accuracy = self.evaluator.evaluate(model, test_data)
                log_msg += ", Test %s: %.4f, Accuracy: %.4f" % (
                    'Cross_Entropy', test_loss, accuracy)
                log.info(log_msg)

            if early_stopping.early_stop:
                print("-------------------Early Stopping---------------------")
                # reload the best-accuracy checkpoint before the final evaluation
                checkpoint = Checkpoint.get_latest_checkpoint(
                    self.expt_dir + '/best_accuracy')
                checkpoint = Checkpoint.load(checkpoint)
                model = checkpoint.model  # deep copy
                for param_tensor in model.state_dict():
                    print(param_tensor, '\t', model.state_dict()[param_tensor].size())

                # configure the optimizer from the checkpointed parameter groups
                optimizer = checkpoint.optimizer
                resume_optim = checkpoint.optimizer.optimizer
                del checkpoint
                defaults = resume_optim.param_groups[0]
                defaults.pop('params', None)
                defaults.pop('initial_lr', None)
                optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)
                self.optimizer = optimizer

                loss, accuracy = self.evaluator.evaluate(model, test_data)
                print('LOAD BEST ACCURACY MODEL ::: loss > {} accuracy {}'.format(
                    loss, accuracy))
                break

    return model
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step,
                   dev_data=None, test_data=None):
    log = self.logger

    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch

    device = torch.device('cuda:0') if torch.cuda.is_available() else -1
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.text),
        device=device, repeat=False, shuffle=True)

    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0
    best_accuracy = 0
    early_stopping = EarlyStopping(patience=10, verbose=True)

    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()
        # consuming seen batches from previous training
        for idx in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1

            input_variables, input_lengths = getattr(batch, 'text')
            target_variables = getattr(batch, 'label')

            loss = self._train_batch(input_variables, input_lengths.tolist(),
                                     target_variables, model)

            # Record average loss
            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Progress: %d%%, Train %s: %.4f' % (
                    step / total_steps * 100, self.loss.name, print_loss_avg)
                log.info(log_msg)

        # intersections = self.get_intersection(self.pos_lexicons, self.neg_lexicons, epoch)
        # if intersections is not None:
        #     self.filter_common_word(intersections, self.neg_lexicons, epoch)
        # self.save_lexicons(self.lexicon_dir + '/neg_epoch:{}'.format(epoch), self.neg_lexicons)
        # self.save_lexicons(self.lexicon_dir + '/pos_epoch:{}'.format(epoch), self.pos_lexicons)

        # reset neg/pos/intersection lexicons
        self.neg_lexicons = []
        self.pos_lexicons = []

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss.name, epoch_loss_avg)

        if dev_data is not None:
            model.eval()
            dev_loss, dev_accuracy = self.evaluator.evaluate(model, dev_data)
            # self.optimizer.update(dev_loss, epoch)
            early_stopping(dev_loss, model, self.optimizer, epoch, step,
                           self.input_vocab, self.expt_dir)

            if dev_accuracy > best_accuracy:
                best_accuracy = dev_accuracy
                Checkpoint(model=model, optimizer=self.optimizer, epoch=epoch, step=step,
                           input_vocab=data.fields['text'].vocab).save(
                               self.expt_dir + '/best_accuracy')
                print(self.expt_dir + '/best_accuracy')

            test_loss, test_acc = self.evaluator.evaluate(model, test_data)
            log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (self.loss.name, dev_loss, dev_accuracy)
            log_msg += ", Test %s: %.4f, Test Accuracy: %.4f" % (self.loss.name, test_loss, test_acc)
            model.train(mode=True)
        else:
            self.optimizer.update(epoch_loss_avg, epoch)

        log.info(log_msg)

        if early_stopping.early_stop:
            print("Early Stopping")
            break
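# Hedged sketch of the EarlyStopping helper the two loops above rely on. The
# call signature is inferred from the call sites here; the project's real
# implementation may checkpoint differently or track a different metric.
class EarlyStopping(object):
    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, dev_loss, model, optimizer, epoch, step, input_vocab, expt_dir):
        if self.best_loss is None or dev_loss < self.best_loss:
            # dev loss improved: remember it and checkpoint the current model
            self.best_loss = dev_loss
            self.counter = 0
            Checkpoint(model=model, optimizer=optimizer, epoch=epoch, step=step,
                       input_vocab=input_vocab).save(expt_dir)
            if self.verbose:
                print('Validation loss improved to %.4f' % dev_loss)
        else:
            # no improvement: count towards the patience limit
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True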
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step,
                   dev_data=None, teacher_forcing_ratio=0):
    log = self.logger

    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)

    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()
        # consuming seen batches from previous training
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1

            input_variables, input_lengths = getattr(batch, "src")
            target_variables = getattr(batch, "tgt")

            loss = self._train_batch(input_variables, input_lengths.tolist(),
                                     target_variables, model, teacher_forcing_ratio)

            # Record average loss
            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Progress: %d%%, Train %s: %.4f' % (
                    step / total_steps * 100, self.loss.name, print_loss_avg)
                log.info(log_msg)

                n_iter = step / self.print_every
                # TensorBoard: scalar loss curve
                self.writer.add_scalar("train_loss", print_loss_avg, n_iter)
                # log network parameters and gradients as histograms
                for name, param in model.named_parameters():
                    name = name.replace('.', '/')
                    self.writer.add_histogram(
                        name, param.clone().cpu().data.numpy(), n_iter)
                    self.writer.add_histogram(
                        name + '/grad', param.grad.data.cpu().numpy(), n_iter)

                if dev_data is not None:
                    dev_loss, accuracy = self.evaluator.evaluate(
                        model, dev_data, self.writer, n_iter)
                    self.optimizer.update(dev_loss, epoch)
                    model.train(mode=True)

            # Checkpoint
            if step % self.checkpoint_every == 0 or step == total_steps:
                Checkpoint(model=model,
                           optimizer=self.optimizer,
                           epoch=epoch, step=step,
                           input_vocab=data.fields["src"].vocab,
                           output_vocab=data.fields["tgt"].vocab).save(self.expt_dir)

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (
            epoch, self.loss.name, epoch_loss_avg)
        if dev_data is not None:
            dev_loss, accuracy = self.evaluator.evaluate(model, dev_data)
            self.optimizer.update(dev_loss, epoch)
            log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (
                self.loss.name, dev_loss, accuracy)
            model.train(mode=True)
        else:
            self.optimizer.update(epoch_loss_avg, epoch)

        log.info(log_msg)
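# Hedged sketch of the self.writer setup assumed by the TensorBoard logging in
# the loop above. Assumption: torch.utils.tensorboard; the original code may use
# tensorboardX instead, whose SummaryWriter exposes the same add_scalar /
# add_histogram calls.
import os
from torch.utils.tensorboard import SummaryWriter

def make_writer(expt_dir):
    # one event-file directory per experiment, viewable with:
    #   tensorboard --logdir <expt_dir>/tensorboard
    return SummaryWriter(log_dir=os.path.join(expt_dir, 'tensorboard'))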
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None):
    log = self.logger

    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch

    dataloader = DataLoader(dataset=data, batch_size=self.batch_size,
                            shuffle=True, num_workers=0)

    steps_per_epoch = len(dataloader)
    total_steps = steps_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        model.train(True)
        for batch in dataloader:
            step += 1
            step_elapsed += 1

            input_variables = batch['X'].to(self.device)
            target_variables = batch['y'].to(self.device)
            day_ago_data = None
            # print(batch.keys())
            if model.use_day_ago_info:
                day_ago_data = batch['one_day_ago'].to(self.device)

            loss = self._train_batch(input_variables, target_variables, model, day_ago_data)

            # Record average loss
            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Progress: %d%%, Train %s: %.4f' % (
                    step / total_steps * 100, self.loss, print_loss_avg)
                log.info(log_msg)

            # Checkpoint
            if step % self.checkpoint_every == 0 or step == total_steps:
                Checkpoint(model=model, optimizer=self.optimizer,
                           epoch=epoch, step=step).save(self.expt_dir)

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss, epoch_loss_avg)
        if dev_data is not None:
            dev_loss, rmse = self.evaluator.evaluate(model, dev_data, self.device)
            train_loss, _ = self.evaluator.evaluate(model, data, self.device)
            self.optimizer.update(train_loss, epoch)
            self.optimizer.update(dev_loss, epoch)
            log_msg += ", Dev %s: %.4f, Train: %.4f" % (self.loss, dev_loss, train_loss)
            model.train(mode=True)
        else:
            self.optimizer.update(epoch_loss_avg, epoch)

        log.info(log_msg)
def _train_epoches(self, data, model, teacher_model, n_epochs, start_epoch, start_step,
                   dev_data, teacher_forcing_ratio=0):
    log = self.logger
    print_loss_total = 0
    epoch_loss_total = 0

    device = None if torch.cuda.is_available() else -1
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)

    step_per_epoch = len(batch_iterator)
    total_steps = step_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()
        # consuming seen batches from previous training
        for _ in range((epoch - 1) * step_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1

            input_var, input_length = getattr(batch, 'src')
            target_var = getattr(batch, 'tgt')

            loss = self._train_batch(input_variable=input_var,
                                     input_lengths=input_length,
                                     target_variable=target_var,
                                     model=model,
                                     teacher_model=teacher_model)

            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Progress: %d%%, Train %s: %.4f' % (
                    step / total_steps * 100, self.loss.name, print_loss_avg)
                log.info(log_msg)

            # Checkpoint
            if step % self.checkpoint_every == 0 or step == total_steps:
                Checkpoint(model=model, optimizer=self.optimizer,
                           epoch=epoch, step=step,
                           input_vocab=data.fields['src'].vocab,
                           output_vocab=data.fields['tgt'].vocab).save(self.export_dir)

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(step_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (
            epoch, self.loss.name, epoch_loss_avg)
        if dev_data is not None:
            dev_loss, accuracy = self.evaluator.evaluate(model, dev_data)
            self.optimizer.update(dev_loss, epoch)
            log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (
                self.loss.name, dev_loss, accuracy)
            model.train(mode=True)
        else:
            self.optimizer.update(epoch_loss_avg, epoch)

        log.info(log_msg)
dst_dir = '../result/seq2seq_feature48'
if not os.path.exists(dst_dir):
    os.mkdir(dst_dir)

if __name__ == '__main__':
    seqence_len = 72
    output_dim = 3
    delay = 36

    t2m_checkpoint_path = os.path.join('../checkpoints/seq72_feature48_global_t2m_best')
    rh2m_checkpoint_path = os.path.join('../checkpoints/seq72_feature48_global_rh2m_best')
    w10m_checkpoint_path = os.path.join('../checkpoints/seq72_feature48_global_w10m_best')

    t2m_checkpoint = Checkpoint.load(t2m_checkpoint_path)
    rh2m_checkpoint = Checkpoint.load(rh2m_checkpoint_path)
    w10m_checkpoint = Checkpoint.load(w10m_checkpoint_path)

    t2m_predictor = Predictor(t2m_checkpoint.model.to(device))
    rh2m_predictor = Predictor(rh2m_checkpoint.model.to(device))
    w10m_predictor = Predictor(w10m_checkpoint.model.to(device))

    foretimes = 37

    for begin_date, dst_date, end_date in zip(begin_dates, dst_dates, end_dates):
        submit_csv = None
        end_date = end_date + ' 12-00-00'
        for i in range(90001, 90011):
            df = pd.read_csv(os.path.join(data_dir,