def setup_training(model, train_loader, valid_loader, hps):
    """Prepare the training directory, optionally restore weights, then train.

    Args:
        model: Model handed to ``ModelLoader.load_pytorch`` (when restoring)
            and to ``run_training``.
        train_loader: Training data, forwarded unchanged to ``run_training``.
        valid_loader: Validation data, forwarded unchanged to ``run_training``.
        hps: Hyper-parameter namespace. ``hps.save_root`` is the parent of the
            ``train`` directory; ``hps.restore_model`` is a checkpoint file
            name inside that directory, or the literal string ``'None'`` to
            start from the model as given.
    """
    train_dir = os.path.join(hps.save_root, "train")
    # exist_ok=True replaces the racy "check, then create" sequence: the
    # directory may appear between the two calls when several jobs start
    # at the same time.
    os.makedirs(train_dir, exist_ok=True)

    if hps.restore_model != 'None':
        logger.info("[INFO] Restoring %s for training...", hps.restore_model)
        bestmodel_file = os.path.join(train_dir, hps.restore_model)
        loader = ModelLoader()
        loader.load_pytorch(model, bestmodel_file)
    else:
        logger.info("[INFO] Create new model for training...")

    # Per the original note: this call loops until interrupted.
    run_training(model, train_loader, valid_loader, hps)
def on_epoch_end(self):
    """Log the epoch summary and write the end-of-epoch checkpoints.

    The mean training loss is ``self.train_loss / self.n_steps``. If it is
    higher than the previously stored mean, the model is saved as
    ``earlystop.pkl``; otherwise the stored mean is updated. The running
    loss is then reset and the model is saved as ``epoch_<n>.pkl``.
    """
    mean_loss = self.train_loss / self.n_steps
    elapsed = time.time() - self.epoch_start_time
    summary = ' | end of epoch {:3d} | time: {:5.2f}s | train loss: {:5.6f}'.format(
        self.epoch, elapsed, mean_loss)
    logger.info(summary)

    loss_increased = self.prev_train_avg_loss < mean_loss
    if loss_increased:
        self.save_model(os.path.join(self.train_dir, "earlystop.pkl"))
    else:
        self.prev_train_avg_loss = mean_loss

    # Start the next epoch with a clean accumulator.
    self.train_loss = 0.0

    # Per-epoch snapshot, written regardless of how the loss moved.
    epoch_file = "epoch_%d.pkl" % self.epoch
    self.save_model(os.path.join(self.train_dir, epoch_file))
def get_metric(self, reset=True):
    """Compute accuracy, precision, recall and F-score from the running counts.

    Reads ``self.match``, ``self.pred``, ``self.true``, ``self.match_true``
    and ``self.total``. The first four are converted with ``.float()``, so
    they are expected to be tensors (assumed scalar counts -- TODO confirm
    against ``evaluate``).

    Args:
        reset: When true, all five counters are set back to ``0`` after the
            scores have been computed.

    Returns:
        dict with keys ``"accu"``, ``"p"``, ``"r"`` and ``"f"``; each value is
        the result of ``.cpu()`` on the corresponding score. A score whose
        denominator is zero is reported as ``0`` instead of ``inf``/``nan``.
    """
    self.match = self.match.float()
    self.pred = self.pred.float()
    self.true = self.true.float()
    self.match_true = self.match_true.float()
    logger.debug((self.match, self.pred, self.true, self.match_true, self.total))

    def _ratio(numerator, denominator):
        # Tensor division never raises ZeroDivisionError (it yields inf/nan),
        # so the zero case has to be detected explicitly.
        if denominator == 0:
            logger.error("[Error] float division by zero")
            return numerator.new_zeros(numerator.shape)
        return numerator / denominator

    accu = _ratio(self.match, self.total)
    precision = _ratio(self.match_true, self.pred)
    recall = _ratio(self.match_true, self.true)
    F = _ratio(2 * precision * recall, precision + recall)

    if reset:
        self.pred, self.true, self.match_true, self.match, self.total = 0, 0, 0, 0, 0

    ret = {"accu": accu.cpu(), "p": precision.cpu(), "r": recall.cpu(), "f": F.cpu()}
    logger.info(ret)
    return ret
def run_test(model, loader, hps):
    """Load a saved checkpoint into ``model`` and evaluate it on ``loader``.

    Args:
        model: Model that receives the restored weights and is then tested.
        loader: Test data passed to ``Tester`` as ``data``.
        hps: Hyper-parameter namespace. Reads ``save_root``, ``test_model``
            (``"evalbestmodel"`` or ``"earlystop"``), ``use_pyrouge`` and
            ``batch_size``; the whole namespace is also handed to the ROUGE
            metric.

    Raises:
        Exception: If ``<save_root>/eval`` does not exist.
        ValueError: If ``hps.test_model`` is not one of the supported names.
    """
    test_dir = os.path.join(hps.save_root, "test")
    eval_dir = os.path.join(hps.save_root, "eval")
    # exist_ok=True avoids the check-then-create race of the original.
    os.makedirs(test_dir, exist_ok=True)

    if not os.path.exists(eval_dir):
        message = ("[Error] eval_dir %s doesn't exist. Run in train mode to create it."
                   % (eval_dir))
        # logger.exception() is only meaningful inside an ``except`` block;
        # outside of one it appends a bogus "NoneType: None" traceback.
        logger.error(message)
        raise Exception(message)

    if hps.test_model == "evalbestmodel":
        # this is where checkpoints of best models are saved
        bestmodel_load_path = os.path.join(eval_dir, 'bestmodel.pkl')
    elif hps.test_model == "earlystop":
        train_dir = os.path.join(hps.save_root, "train")
        bestmodel_load_path = os.path.join(train_dir, 'earlystop.pkl')
    else:
        logger.error(
            "None of such model! Must be one of evalbestmodel/earlystop")
        raise ValueError(
            "None of such model! Must be one of evalbestmodel/earlystop")

    logger.info("[INFO] Restoring %s for testing...The path is %s",
                hps.test_model, bestmodel_load_path)
    modelloader = ModelLoader()
    modelloader.load_pytorch(model, bestmodel_load_path)

    # The two original branches differed only in the ROUGE metric class, so
    # pick the metric first and build the Tester once.
    if hps.use_pyrouge:
        logger.info("[INFO] Use PyRougeMetric for testing")
        rouge_metric = PyRougeMetric(hps, pred="prediction")
    else:
        logger.info("[INFO] Use FastRougeMetric for testing")
        rouge_metric = FastRougeMetric(hps, pred="prediction")

    tester = Tester(data=loader,
                    model=model,
                    metrics=[LabelFMetric(pred="prediction"), rouge_metric],
                    batch_size=hps.batch_size)
    test_info = tester.test()
    logger.info(test_info)
def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
    """Apply early stopping and optional learning-rate descent after validation.

    :param eval_result: not used by this method
    :param metric_key: not used by this method
    :param optimizer: optimizer whose ``param_groups`` receive the new rate
    :param is_better_eval: truthy when this validation improved on the best
    :raises EarlyStopError: when validation has failed to improve while
        ``self.wait`` equals ``self.patience``; the model is saved as
        ``earlystop.pkl`` under ``<save_root>/train`` first
    """
    valid_seconds = time.time() - self.valid_start_time
    logger.info(' | end of valid {:3d} | time: {:5.2f}s | '.format(
        self.epoch, valid_seconds))

    # early stop
    if is_better_eval:
        self.wait = 0
    elif self.wait == self.patience:
        checkpoint = os.path.join(
            os.path.join(self._hps.save_root, "train"), "earlystop.pkl")
        self.save_model(checkpoint)
        raise EarlyStopError("Early stopping raised.")
    else:
        self.wait += 1

    # lr descent
    if not self._hps.lr_descent:
        return
    # Rate shrinks with the epoch number, floored at 5e-6.
    new_lr = max(5e-6, self._hps.lr / (self.epoch + 1))
    for group in optimizer.param_groups:
        group['lr'] = new_lr
    logger.info("[INFO] The learning rate now is %f", new_lr)
def on_backward_begin(self, loss):
    """Abort on a non-finite loss; otherwise add it to the running total.

    :param loss: loss object exposing ``.data`` (the original docstring
        gives its shape as ``[]``, i.e. a scalar)
    :raises Exception: if the loss is not finite. Before raising, the name
        and gradient sum of every trainable parameter are logged.
    :return: None
    """
    if not (np.isfinite(loss.data)).numpy():
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss)
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                # A parameter's ``grad`` can be None (e.g. no backward pass
                # has populated it yet). Dereferencing it would raise
                # AttributeError and hide the real error raised below.
                if param.grad is not None:
                    logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")

    self.train_loss += loss.data
def get_metric(self, reset=True):
    """Score the collected hypotheses against the collected references.

    When ``self.refers[0]`` is a list, ``pyrouge_score_all_multi`` is used;
    otherwise ``pyrouge_score_all``.

    Args:
        reset: When true, ``self.hyps`` and ``self.refers`` are emptied after
            scoring.

    Returns:
        The value returned by the scoring function, or ``None`` when either
        list is empty.
    """
    hyp_count = len(self.hyps)
    ref_count = len(self.refers)
    logger.info("[INFO] Hyps and Refer number is %d, %d", hyp_count, ref_count)

    if not (hyp_count and ref_count):
        logger.error("During testing, no hyps or refers is selected!")
        return None

    has_multiple_refs = isinstance(self.refers[0], list)
    if has_multiple_refs:
        logger.info("Multi Reference summaries!")
    score_fn = pyrouge_score_all_multi if has_multiple_refs else pyrouge_score_all
    scores_all = score_fn(self.hyps, self.refers)

    if reset:
        self.hyps, self.refers = [], []

    logger.info(scores_all)
    return scores_all
def save_model(self, save_file):
    """Save ``self.model`` through ``ModelSaver`` and log the destination.

    :param save_file: target path handed to ``ModelSaver``
    """
    ModelSaver(save_file).save_pytorch(self.model)
    logger.info('[INFO] Saving model to %s', save_file)
def _build_arg_parser():
    """Return the command-line parser for the summarization train/test script."""
    parser = argparse.ArgumentParser(description='Summarization Model')

    # Where to find data
    parser.add_argument('--data_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl',
                        help='Path expression to pickle datafiles.')
    parser.add_argument('--valid_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl',
                        help='Path expression to pickle valid datafiles.')
    parser.add_argument('--vocab_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/vocab',
                        help='Path expression to text vocabulary file.')

    # Important settings
    parser.add_argument('--mode', choices=['train', 'test'], default='train',
                        help='must be one of train/test')
    parser.add_argument('--embedding', type=str, default='glove',
                        choices=['word2vec', 'glove', 'elmo', 'bert'],
                        help='must be one of word2vec/glove/elmo/bert')
    parser.add_argument('--sentence_encoder', type=str, default='transformer',
                        choices=['bilstm', 'deeplstm', 'transformer'],
                        help='must be one of LSTM/Transformer')
    parser.add_argument('--sentence_decoder', type=str, default='SeqLab',
                        choices=['PN', 'SeqLab'],
                        help='must be one of PN/SeqLab')
    parser.add_argument(
        '--restore_model', type=str, default='None',
        help='Restore model for further training. [bestmodel/bestFmodel/earlystop/None]')

    # Where to save output
    parser.add_argument('--save_root', type=str, default='save/',
                        help='Root directory for all model.')
    parser.add_argument('--log_root', type=str, default='log/',
                        help='Root directory for all logging.')

    # Hyperparameters
    # The help texts of --gpu, --batch_size and --word_emb_dim used to state
    # defaults that differed from the actual ``default=`` values.
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use cuda')
    parser.add_argument(
        '--vocab_size', type=int, default=100000,
        help='Size of vocabulary. These will be read from the vocabulary file in order. '
             'If the vocabulary file contains fewer words than this number, or if this '
             'number is set to 0, will take all words in the vocabulary file.')
    parser.add_argument('--n_epochs', type=int, default=20,
                        help='Number of epochs [default: 20]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    # NOTE(review): store_true combined with default=True means this flag can
    # never be switched off from the command line (same for
    # --use_orthnormal_init below). Left unchanged to keep the CLI stable.
    parser.add_argument('--word_embedding', action='store_true', default=True,
                        help='whether to use Word embedding')
    parser.add_argument('--embedding_path', type=str,
                        default='/remote-home/dqwang/Glove/glove.42B.300d.txt',
                        help='Path expression to external word embedding.')
    parser.add_argument('--word_emb_dim', type=int, default=300,
                        help='Word embedding size [default: 300]')
    parser.add_argument('--embed_train', action='store_true', default=False,
                        help='whether to train Word embedding [default: False]')
    parser.add_argument('--min_kernel_size', type=int, default=1,
                        help='kernel min length for CNN [default:1]')
    parser.add_argument('--max_kernel_size', type=int, default=7,
                        help='kernel max length for CNN [default:7]')
    parser.add_argument('--output_channel', type=int, default=50,
                        help='output channel: repeated times for one kernel')
    parser.add_argument('--use_orthnormal_init', action='store_true', default=True,
                        help='use orthnormal init for lstm [default: true]')
    parser.add_argument('--sent_max_len', type=int, default=100,
                        help='max length of sentences (max source text sentence tokens)')
    parser.add_argument('--doc_max_timesteps', type=int, default=50,
                        help='max length of documents (max timesteps of documents)')
    parser.add_argument('--save_label', action='store_true', default=False,
                        help='require multihead attention')

    # Training
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate')
    parser.add_argument('--lr_descent', action='store_true', default=False,
                        help='learning rate descent')
    parser.add_argument('--grad_clip', action='store_true', default=False,
                        help='for gradient clipping')
    parser.add_argument('--max_grad_norm', type=float, default=10,
                        help='for gradient clipping max gradient normalization')

    # test
    parser.add_argument('-m', type=int, default=3,
                        help='decode summary length')
    parser.add_argument(
        '--test_model', type=str, default='evalbestmodel',
        help='choose different model to test '
             '[evalbestmodel/evalbestFmodel/trainbestmodel/trainbestFmodel/earlystop]')
    parser.add_argument('--use_pyrouge', action='store_true', default=False,
                        help='use_pyrouge')
    return parser


def main():
    """Parse the command line, build data, embedding and model, then train or test.

    Only the ``glove`` embedding and the ``transformer``/``deeplstm`` encoders
    combined with the ``SeqLab`` decoder are implemented here; any other
    combination logs an error and exits with status 1.

    Raises:
        Exception: If the log directory is missing while not in train mode.
        ValueError: If ``mode`` is neither ``train`` nor ``test``.
    """
    args = _build_arg_parser().parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    torch.set_printoptions(threshold=50000)

    # File paths
    DATA_FILE = args.data_path
    VALID_FILE = args.valid_path
    VOCAL_FILE = args.vocab_path
    LOG_PATH = args.log_root

    # train_log setting
    if not os.path.exists(LOG_PATH):
        if args.mode == "train":
            os.makedirs(LOG_PATH)
        else:
            raise Exception(
                "[Error] Logdir %s doesn't exist. Run in train mode to create it."
                % (LOG_PATH))
    nowTime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    # Only the (currently disabled) file-logging setup below consumes this path.
    log_path = os.path.join(LOG_PATH, args.mode + "_" + nowTime)
    # logger = _init_logger(path=log_path)
    # file_handler = logging.FileHandler(log_path)
    # file_handler.setFormatter(formatter)
    # logger.addHandler(file_handler)

    logger.info("Pytorch %s", torch.__version__)

    # dataset
    hps = args
    dbPipe = ExtCNNDMPipe(vocab_size=hps.vocab_size,
                          vocab_path=VOCAL_FILE,
                          sent_max_len=hps.sent_max_len,
                          doc_max_timesteps=hps.doc_max_timesteps)
    if hps.mode == 'test':
        # NOTE(review): these values are set before the JSON config is merged
        # into hps further down; if the config defines the same keys it will
        # overwrite them -- confirm against the config files.
        hps.recurrent_dropout_prob = 0.0
        hps.atten_dropout_prob = 0.0
        hps.ffn_dropout_prob = 0.0
        logger.info(hps)
        paths = {"test": DATA_FILE}
        db = dbPipe.process_from_file(paths)
    else:
        paths = {"train": DATA_FILE, "valid": VALID_FILE}
        db = dbPipe.process_from_file(paths)

    # embedding
    if args.embedding == "glove":
        vocab = db.get_vocab("vocab")
        embed = torch.nn.Embedding(len(vocab), hps.word_emb_dim)
        if hps.word_embedding:
            embed_loader = EmbedLoader()
            # unfound with random init
            pretrained_weight = embed_loader.load_with_vocab(
                hps.embedding_path, vocab)
            embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
            embed.weight.requires_grad = hps.embed_train
    else:
        logger.error("[ERROR] embedding To Be Continued!")
        sys.exit(1)

    # model
    if args.sentence_encoder == "transformer" and args.sentence_decoder == "SeqLab":
        # ``with`` closes the config file; the original left the handle open.
        with open("config/transformer.config", "rb") as config_file:
            model_param = json.load(config_file)
        hps.__dict__.update(model_param)
        model = TransformerModel(hps, embed)
    elif args.sentence_encoder == "deeplstm" and args.sentence_decoder == "SeqLab":
        with open("config/deeplstm.config", "rb") as config_file:
            model_param = json.load(config_file)
        hps.__dict__.update(model_param)
        model = SummarizationModel(hps, embed)
    else:
        logger.error("[ERROR] Model To Be Continued!")
        sys.exit(1)

    if hps.cuda:
        model = model.cuda()
        logger.info("[INFO] Use cuda")

    logger.info(hps)

    if hps.mode == 'train':
        db.get_dataset("valid").set_target("text", "summary")
        setup_training(model, db.get_dataset("train"), db.get_dataset("valid"), hps)
    elif hps.mode == 'test':
        logger.info("[INFO] Decoding...")
        db.get_dataset("test").set_target("text", "summary")
        # The original passed ``limited=hps.limited``, but no ``--limited``
        # option is defined by this parser and the run_test visible in this
        # file takes only (model, loader, hps), so the call could not succeed.
        run_test(model, db.get_dataset("test"), hps)
    else:
        logger.error("The 'mode' flag must be one of train/eval/test")
        raise ValueError("The 'mode' flag must be one of train/eval/test")