def train_transfer(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["pretraining"].get("random_seed", 42))

    # load the data
    pre_train_data, pre_dev_data, pre_test_data, pre_src_vocab, \
        pre_trg_vocab = load_data(data_cfg=cfg["pretrained_data"])

    # build an encoder-decoder model
    pretrained_model = build_model(cfg["model"],
                                   src_vocab=pre_src_vocab,
                                   trg_vocab=pre_trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=pretrained_model, config=cfg,
                           training_key="pretraining", name_log="pre_train")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=pre_train_data,
                  valid_data=pre_dev_data,
                  test_data=pre_test_data,
                  src_vocab=pre_src_vocab,
                  trg_vocab=pre_trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(pretrained_model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=pre_train_data,
                               valid_data=pre_dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger,
         key_training="pretraining", key_data="pretrained_data")

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model on top of the pretrained one
    model = build_pretrained_model(cfg["model"],
                                   pretrained_model=pretrained_model,
                                   pretrained_src_vocab=pre_src_vocab,
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg, training_key="training")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger,
         key_training="training", key_data="data")

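# A minimal sketch (an assumption, not taken from the source) of the
# two-stage config that train_transfer above reads: "pretraining" and
# "pretrained_data" drive the first stage, "training" and "data" the
# second, and "model" is shared by both build calls. Only keys the
# function actually accesses are shown; all values are placeholders.
transfer_cfg_sketch = {
    "pretraining": {"model_dir": "models/pretrain", "random_seed": 42},
    "pretrained_data": {},  # passed to load_data for the first stage
    "training": {"model_dir": "models/finetune", "random_seed": 42},
    "data": {},             # passed to load_data for the second stage
    "model": {},            # architecture section passed to build_model
}
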
def train(cfg_file: str, skip_test: bool = False) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    :param skip_test: whether a test should be run or not after training
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if not skip_test:
        # predict with the best model on validation and test
        # (if test data is available)
        ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
        output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
        output_path = os.path.join(model_dir, output_name)
        datasets_to_test = {
            "dev": dev_data,
            "test": test_data,
            "src_vocab": src_vocab,
            "trg_vocab": trg_vocab
        }
        test(cfg_file,
             ckpt=ckpt,
             output_path=output_path,
             datasets=datasets_to_test)
    else:
        logger.info("Skipping test after training")

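# Hypothetical usage sketch for the entry point above; the config path is
# illustrative. With skip_test=True the run trains and saves vocabularies
# but leaves checkpoint evaluation to a separate, later test run.
if __name__ == "__main__":
    train(cfg_file="configs/transformer_small.yaml", skip_test=True)
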
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data: one train/dev pair per (src, trg) task
    train_tasks_list = []
    valid_tasks_list = []
    src_tasks = cfg["data"].get("src")
    trg_tasks = cfg["data"].get("trg")
    for src_lang, trg_lang in zip(src_tasks, trg_tasks):
        train_data, dev_data, _, _, _ = load_data(data_cfg=cfg["data"],
                                                  src_lang=src_lang,
                                                  trg_lang=trg_lang)
        train_tasks_list.append(train_data)
        valid_tasks_list.append(dev_data)

    # build vocabulary (from the first task's training data only)
    logger.info("Building vocabulary...")
    src_max_size = cfg["data"].get("src_voc_limit", sys.maxsize)
    src_min_freq = cfg["data"].get("src_voc_min_freq", 1)
    trg_max_size = cfg["data"].get("trg_voc_limit", sys.maxsize)
    trg_min_freq = cfg["data"].get("trg_voc_min_freq", 1)
    src_vocab_file = cfg["data"].get("src_vocab", None)
    trg_vocab_file = cfg["data"].get("trg_vocab", None)
    src_vocab = build_vocab(field="src", min_freq=src_min_freq,
                            max_size=src_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=src_vocab_file)
    trg_vocab = build_vocab(field="trg", min_freq=trg_min_freq,
                            max_size=trg_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=trg_vocab_file)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)
    # log_data_info(train_data=train_data,
    #               valid_data=dev_data,
    #               test_data=test_data,
    #               src_vocab=src_vocab,
    #               trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model, meta-learning over the tasks
    trainer.maml_train_and_validate(train_tasks=train_tasks_list,
                                    valid_tasks=valid_tasks_list)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)
    # assumed, mirroring the other variants: evaluate the best checkpoint
    test(cfg_file, ckpt=ckpt, output_path=output_path)

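# Sketch (assumed, not from the source) of the "data" section the
# multi-task variant above expects: "src" and "trg" are parallel lists,
# one language pair per meta-learning task. The vocabulary keys fall
# back to the defaults read in the function when absent.
multitask_data_cfg_sketch = {
    "src": ["de", "fr", "cs"],  # illustrative language codes
    "trg": ["en", "en", "en"],
    "src_voc_limit": 32000,     # defaults to sys.maxsize
    "src_voc_min_freq": 1,
    "trg_voc_limit": 32000,
    "trg_voc_min_freq": 1,
    "src_vocab": None,          # optional pre-built vocab files
    "trg_vocab": None,
}
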
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    kb_task = bool(cfg["data"].get("kb_task", False))

    # load the data (the knowledge-base variant also returns KB corpora,
    # lookups, lengths, true values, the true-value vocab and a canonizer)
    (train_data, dev_data, test_data,
     src_vocab, trg_vocab,
     train_kb, dev_kb, test_kb,
     train_kb_lookup, dev_kb_lookup, test_kb_lookup,
     train_kb_lengths, dev_kb_lengths, test_kb_lengths,
     train_kb_truvals, dev_kb_truvals, test_kb_truvals,
     trv_vocab, canonizer,
     dev_data_canon, test_data_canon) = load_data(data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        trv_vocab=trv_vocab,
                        canonizer=canonizer)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)
    if kb_task:
        trv_vocab_file = "{}/trv_vocab.txt".format(
            cfg["training"]["model_dir"])
        trv_vocab.to_file(trv_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data,
                               valid_data=dev_data,
                               kb_task=kb_task,
                               train_kb=train_kb,
                               train_kb_lkp=train_kb_lookup,
                               train_kb_lens=train_kb_lengths,
                               train_kb_truvals=train_kb_truvals,
                               valid_kb=dev_kb,
                               valid_kb_lkp=dev_kb_lookup,
                               valid_kb_lens=dev_kb_lengths,
                               valid_kb_truvals=dev_kb_truvals,
                               valid_data_canon=dev_data_canon)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)

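# Hypothetical readability sketch: the KB-aware load_data above returns
# twenty-one values, which makes the positional unpacking fragile. One
# option is grouping them in a NamedTuple (field names below mirror the
# variables in the function) without changing any behavior.
from typing import Any, NamedTuple

class KBCorpora(NamedTuple):
    train_data: Any
    dev_data: Any
    test_data: Any
    src_vocab: Any
    trg_vocab: Any
    train_kb: Any
    dev_kb: Any
    test_kb: Any
    train_kb_lookup: Any
    dev_kb_lookup: Any
    test_kb_lookup: Any
    train_kb_lengths: Any
    dev_kb_lengths: Any
    test_kb_lengths: Any
    train_kb_truvals: Any
    dev_kb_truvals: Any
    test_kb_truvals: Any
    trv_vocab: Any
    canonizer: Any
    dev_data_canon: Any
    test_data_canon: Any
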
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    # torch.backends.cudnn.deterministic = True
    seed = cfg["training"].get("random_seed", 42)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(cfg=cfg)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # print config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)
    model.log_parameters_list(logging_function=trainer.logger.info)
    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if test_data is not None:
        # load the best checkpoint for testing
        trainer.load_checkpoint("{}/{}.ckpt".format(
            trainer.model_dir, trainer.best_ckpt_iteration))

        # test model
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
        else:
            beam_size = 0
            beam_alpha = -1

        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, criterion=None,
                beam_size=beam_size, beam_alpha=beam_alpha)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}" \
                .format(beam_size, beam_alpha)
            trainer.logger.info("{:4s}: {} {} [{}]".format(
                "Test data result", score, trainer.eval_metric,
                decoding_description))
        else:
            trainer.logger.info(
                "No references given for {}.{} -> no evaluation.".format(
                    cfg["data"]["test"], cfg["data"]["src"]))

        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write(h + "\n")
        trainer.logger.info(
            "Test translations saved to: {}".format(output_path_set))

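# A minimal set_seed sketch consolidating the manual seeding done inline
# in the last variant into the helper the other variants call; an assumed
# implementation, mirroring exactly the three seeding calls above.
import random

import numpy as np
import torch

def set_seed(seed: int) -> None:
    """Seed Python's, NumPy's and PyTorch's RNGs for reproducibility."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)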