def __init__(self, model_args, model_dir, vocab_file=None, label_file=None, use_cuda=False, wombat_path=None):
    """Restore a trained labeler from a saved-arguments file.

    Loads the pickled hyper-parameter object from ``model_dir + model_args``
    (note: plain string concatenation, matching how the file was saved),
    optionally overrides the vocab/label/model-dir paths recorded in it,
    builds the tagger, and loads the best saved weights (epoch=-1).

    :param model_args: filename of the saved argument object
    :param model_dir: directory (prefix) where the model artefacts live
    :param vocab_file: optional replacement vocabulary path
    :param label_file: optional replacement label path
    :param use_cuda: whether inference should run on GPU
    :param wombat_path: optional path to a Wombat embedding store
    """
    # Wombat is only instantiated when a path is supplied.
    self.wombat_object = Wombat(wombat_path) if wombat_path is not None else None

    saved_args = SaveloadHP.load(model_dir + model_args)
    saved_args.use_cuda = use_cuda

    # Allow the caller to relocate artefacts relative to where training saved them.
    if vocab_file and vocab_file != saved_args.vocab_file:
        saved_args.vocab_file = vocab_file
    if label_file and label_file != saved_args.label_file:
        saved_args.label_file = label_file
    if model_dir and model_dir != saved_args.model_dir:
        saved_args.model_dir = model_dir

    self.tagger = Labeler_model(saved_args)
    # epoch=-1 selects the "best" checkpoint saved in the main model dir.
    self.tagger.load_parameters(epoch=-1)
def save(tokenizer, tokenize_file):
    """Persist *tokenizer* to *tokenize_file* by delegating to SaveloadHP.save."""
    SaveloadHP.save(tokenizer, tokenize_file)
def load(tokenize_file):
    """Load and return a tokenizer previously saved with SaveloadHP.save."""
    return SaveloadHP.load(tokenize_file)
def build_data(args):
    """Prepare vocabulary, embeddings and output folders for a training run.

    Two modes:
      * fresh training (``args.tl`` falsy): create the (optionally timestamped)
        model directory, build or load the vocabulary, optionally load
        pre-trained word embeddings, save the arguments, and return them;
      * transfer learning (``args.tl`` truthy): reuse the model directory,
        vocabulary and pre-trained embeddings recorded in the previous run's
        argument file ``args.tlargs``.

    :param args: argparse-style namespace with all training options
    :return: the (mutated) *args* namespace
    :raises AssertionError: in transfer-learning mode when ``args.tlargs`` does not exist
    """
    if not args.tl:
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        # Optional run-specific sub-folder named after the current time.
        if args.timestamped_subdir:
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
            os.mkdir(os.path.join(args.model_dir, sub_folder))
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        args.log_file = os.path.join(args.model_dir, args.log_file)

        if args.tokenize_type != "bpe":
            # Word-level tokenizer: build source/target vocabularies from data.
            # NOTE(review): source and target share the same (wl_th, wcutoff)
            # settings here — confirm that is intentional.
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")
            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            # BPE tokenizer: load a pre-built vocabulary and extend it with
            # the project's special tokens; source and target share one vocab.
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()

        # Source-language pre-trained embeddings (optional).
        args.swd_pretrained = None
        args.twd_pretrained = None
        if len(args.swd_embfile) != 0:
            scale = np.sqrt(3.0 / args.swd_dim)
            emb_reader = Embeddings(args.swd_embfile)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                # Same file serves both sides; reuse the already-open reader.
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # Target-language pre-trained embeddings (optional, distinct file).
        if len(args.twd_embfile) != 0:
            scale = np.sqrt(3.0 / args.twd_dim)
            if args.twd_pretrained is None:
                # BUG FIX: this branch only runs when twd_embfile differs from
                # swd_embfile, so the target embeddings must be read from the
                # target file (the original read args.swd_embfile here).
                emb_reader = Embeddings(args.twd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # directly integrate transfer learning if no updating new words
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
    else:
        print("INFO: - Use transfer learning technique")
        # BUG FIX: the original used `assert cond, print(...)`, which makes the
        # assertion message always None (print returns None); use a string.
        assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
        # load pre-trained argument file from a previous training folder
        margs = SaveloadHP.load(args.tlargs)
        # TODO update new vocab and all other new arguments used for new training
        # 0. Read vocab  1. Update schema  2. Update vocab
        # 3. Use all model file directory of previous train
        args.model_dir = margs.model_dir
        args.seq2seq_file = margs.seq2seq_file
        # 4. Keep the remaining current arguments; add a constraint at loading
        # time that if any model fails to load, it is simply skipped.
        args.swd_pretrained = margs.swd_pretrained
        args.twd_pretrained = margs.twd_pretrained
        return args
def train(self):
    """Train the seq2seq model with early stopping on the dev set.

    Dev selection metric is BLEU (maximize) or loss (minimize) depending on
    ``self.args.metric``. In transfer-learning mode the pre-trained model is
    loaded first, evaluated, and saved as the initial "best" checkpoint in a
    new (optionally timestamped) model directory.

    :return: BLEU score of the best model on the test set
    """
    # training result is returned after training to inform calling code of the outcome of training
    # Values: Matching threshold reached (success): 0, Otherwise: 1
    # training_result = 1
    train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file, firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file, firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file, firstline=self.args.firstline, task=2)
    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    # Higher is better for BLEU, lower is better for loss.
    best_dev = -np.inf if self.args.metric == "bleu" else np.inf
    if self.args.tl:
        # 1. Load pre-trained model from previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update logfile dir
        self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
        # 4. save updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))
        # Baseline evaluation of the transferred model on the dev set.
        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines, self.args.pred_dev_file)
        best_dev = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        print("INFO: - Transfer learning performance")
        print("          - Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        # print("INFO: - Save transfer learning models")
        # self.save_parameters(epoch=0)
        # suppose the transfered model is the best one and save in the main dir
        self.save_parameters(epoch=-1)
    else:
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
    print('Dev metric:', self.args.metric)
    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # print("INFO: - Frequently save models to checkpoint folders")
        # self.save_parameters(epoch=epoch)
        # evaluate on developing data
        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines, self.args.pred_dev_file)
        dev_metric = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        cond = dev_metric > best_dev if self.args.metric == "bleu" else dev_loss < best_dev
        if cond:
            # New best dev score: reset patience and checkpoint as "best".
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
            self.appendfile("\t\t- Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
            # if dev_string_match >= self.args.matching_threshold:
            #     # TODO: automatically load models to gcp
            #     training_result = 0
            #     break
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            nepoch_no_imprv += 1
            # Decay learning_rate if no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Early stopping: reload the current best models and report test scores.
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines, self.args.pred_test_file)
                print("SUMMARY: - Early stopping after %d epochs without improvements" % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" % (test_loss, test_bleu[0], test_string_match, test_speed))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                # NOTE(review): unlike the end-of-training log below, this line
                # omits the test string-match score — confirm whether intended.
                self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test speed: %.2f(tokens/s)\n" % (test_loss, test_bleu[0], test_speed))
                return test_bleu[0]
        epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
        print("INFO: - Trained time for %d epochs: %s" % (epoch, epoch_finish))
        print("\t- Remained time for %d epochs (est): %s\n" % (max_epochs - epoch, epoch_remain))
    # Completed all epochs without early stopping: evaluate the best model.
    # print("INFO: - Save best models")
    # self.save_parameters(epoch=-1)
    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines, self.args.pred_test_file)
    print("SUMMARY: - Completed %d epoches" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" % (test_loss, test_bleu[0], test_string_match, test_speed))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)\n" % (test_loss, test_bleu[0], test_string_match, test_speed))
    return test_bleu[0]
def train(self):
    """Train the labeler model with early stopping on the dev set.

    Dev selection metric is loss (minimize) or F1 (``dev_metrics[2]``,
    maximize) depending on ``self.args.metric``. ``evaluate_batch`` returns
    (loss, metrics, speed) where metrics appears to be
    [P, R, F1, sep_acc, full_acc] — inferred from the log statements below.

    :return: the metrics list of the best model evaluated on the test set
    """
    train_data, train_numlines = Tokenizer.prepare_iter(
        self.args.train_file, firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(
        self.args.dev_file, firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(
        self.args.test_file, firstline=self.args.firstline, task=2)
    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    # best_dev = -np.inf if self.args.metric == "f1" else np.inf
    # Lower is better for loss, higher is better for every other metric.
    best_dev = np.inf if self.args.metric == "loss" else -np.inf
    with open(self.args.log_file, "w") as f:
        f.write("START TRAINING\n")
    if self.args.tl:
        # 1. Load pre-trained model from previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(
                os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(
                sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update logfile dir
        self.args.log_file = os.path.join(self.args.model_dir,
                                          self.args.log_file)
        # 4. save updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(
            self.args, os.path.join(self.args.model_dir,
                                    self.args.model_args))
        # Baseline evaluation of the transferred model on the dev set.
        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
            dev_data, dev_numlines)
        # best_dev = dev_metrics[2] if self.args.metric == "f1" else dev_loss
        best_dev = dev_loss if self.args.metric == "loss" else dev_metrics[
            2]
        print("INFO: - Transfer learning performance")
        print(
            "          - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
            "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
            (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
             dev_speed))
        print(
            "          - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
            % (dev_metrics[3], dev_metrics[4]))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile(
            "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
            "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
            (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
             dev_speed))
        self.appendfile(
            "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
            (dev_metrics[3], dev_metrics[4]))
        # print("INFO: - Save transfer learning models")
        # self.save_parameters(epoch=0)
        # suppose the transfered model is the best one and save in the main dir
        self.save_parameters(epoch=-1)
    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # print("INFO: - Frequently save models to checkpoint folders")
        # self.save_parameters(epoch=epoch)
        # evaluate on developing data
        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
            dev_data, dev_numlines)
        # dev_metric = dev_metrics[2] if self.args.metric == "f1" else dev_loss
        dev_metric = dev_loss if self.args.metric == "loss" else dev_metrics[
            2]
        # cond = dev_metric > best_dev if self.args.metric == "f1" else dev_loss < best_dev
        cond = dev_loss < best_dev if self.args.metric == "loss" else dev_metric > best_dev
        if cond:
            # New best dev score: reset patience and checkpoint as "best".
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print(
                "          - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            print(
                "          - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                % (dev_metrics[3], dev_metrics[4]))
            self.appendfile("\t- New improvement at epoch %d:\n" %
                            saved_epoch)
            self.appendfile(
                "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            self.appendfile(
                "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                % (dev_metrics[3], dev_metrics[4]))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print(
                "          - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            print(
                "          - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                % (dev_metrics[3], dev_metrics[4]))
            nepoch_no_imprv += 1
            # Decay learning_rate if no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Early stopping: reload the current best models and report test scores.
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_metrics, test_speed = self.evaluate_batch(
                    test_data, test_numlines)
                print(
                    "SUMMARY: - Early stopping after %d epochs without improvements"
                    % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" %
                      (self.args.metric, best_dev))
                print(
                    "         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                    "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
                    (test_loss, test_metrics[0], test_metrics[1],
                     test_metrics[2], test_speed))
                print(
                    "         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
                    % (test_metrics[3], test_metrics[4]))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" %
                                (epoch, max_epochs))
                self.appendfile(
                    "\t- Testing the best model at epoch %d:\n" %
                    saved_epoch)
                self.appendfile(
                    "\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                    "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                    (test_loss, test_metrics[0], test_metrics[1],
                     test_metrics[2], test_speed))
                self.appendfile(
                    "\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
                    % (test_metrics[3], test_metrics[4]))
                return test_metrics
        epoch_finish, epoch_remain = Timer.timeEst2(
            epoch_start, epoch / max_epochs)
        print("INFO: - Trained time for %d epochs: %s" %
              (epoch, epoch_finish))
        print("\t- Remained time for %d epochs (est): %s\n" %
              (max_epochs - epoch, epoch_remain))
    # Completed all epochs without early stopping: evaluate the best model.
    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_metrics, test_speed = self.evaluate_batch(
        test_data, test_numlines)
    print("SUMMARY: - Completed %d epoches" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
          "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
          (test_loss, test_metrics[0], test_metrics[1], test_metrics[2],
           test_speed))
    print(
        "         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
        % (test_metrics[3], test_metrics[4]))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" %
                    saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                    "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                    (test_loss, test_metrics[0], test_metrics[1],
                     test_metrics[2], test_speed))
    self.appendfile(
        "\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
        (test_metrics[3], test_metrics[4]))
    return test_metrics