def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger; keep the returned directory so every later path agrees
    # with what make_model_dir actually created
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, os.path.join(model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs in model_dir (was re-read from cfg with string
    # concatenation, which could diverge from the directory created above)
    src_vocab.to_file(os.path.join(model_dir, "src_vocab.txt"))
    trg_vocab.to_file(os.path.join(model_dir, "trg_vocab.txt"))

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = os.path.join(
        model_dir, "{}.ckpt".format(trainer.stats.best_ckpt_iter))
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)
    datasets_to_test = {"dev": dev_data, "test": test_data,
                        "src_vocab": src_vocab, "trg_vocab": trg_vocab}
    test(cfg_file, ckpt=ckpt, output_path=output_path,
         datasets=datasets_to_test)
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]

    # seed RNGs for reproducibility
    set_seed(seed=train_cfg.get("random_seed", 42))

    # load datasets and vocabularies
    data = load_data(data_cfg)
    train_data, dev_data, test_data = (data["train_data"],
                                       data["dev_data"],
                                       data["test_data"])
    vocabs = data["vocabs"]

    # build an encoder-decoder model
    model = build_model(cfg["model"], vocabs=vocabs)

    # training management: early stopping, model selection, checkpoints
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the original training config in the model dir
    shutil.copy2(cfg_file, join(trainer.model_dir, "config.yaml"))

    # log the full configuration and a summary of the data
    log_cfg(cfg, trainer.logger)
    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        vocabs=vocabs,
        logging_function=trainer.logger.info)
    trainer.logger.info(str(model))

    # write each field's vocabulary next to the model
    model_dir = train_cfg["model_dir"]
    for field_name, vocab in vocabs.items():
        vocab.to_file(join(model_dir, field_name + "_vocab.txt"))

    # run training
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # evaluate the best checkpoint on validation (and test, if available)
    best_iter = trainer.best_ckpt_iteration
    ckpt = join(trainer.model_dir, "{}.ckpt".format(best_iter))
    output_path = join(trainer.model_dir, "{:08d}.hyps".format(best_iter))
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
def train_norm(model, cfg_file: str, skip_test: bool = False) -> None:
    """
    Main training function for a pre-built model.

    Unlike ``train``, the model is supplied by the caller instead of being
    built from the config, and no test run is performed afterwards.

    :param model: already-constructed encoder-decoder model to train
    :param cfg_file: path to configuration yaml file
    :param skip_test: whether a test should be run or not after training.
        NOTE(review): currently unused — this function never runs a test;
        the parameter is kept for interface compatibility.
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"],
        src_lang=cfg["data"].get("src"),
        trg_lang=cfg["data"].get("trg"))

    # NOTE: the model is passed in pre-built; build_model is not called here

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, os.path.join(model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs in model_dir (use the directory created above rather
    # than re-reading the config, so the paths cannot diverge)
    src_vocab.to_file(os.path.join(model_dir, "src_vocab.txt"))
    trg_vocab.to_file(os.path.join(model_dir, "trg_vocab.txt"))

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # seed RNGs for reproducibility
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load datasets and vocabularies
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model from the config
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # training management: early stopping, model selection, checkpoints
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the original training config in the model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log configuration, data statistics and the model architecture
    log_cfg(cfg, trainer.logger)
    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)
    trainer.logger.info(str(model))

    # write both vocabularies next to the model
    model_dir = cfg["training"]["model_dir"]
    for prefix, vocab in (("src", src_vocab), ("trg", trg_vocab)):
        vocab.to_file("{}/{}_vocab.txt".format(model_dir, prefix))

    # run training
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # evaluate the best checkpoint on validation and test
    # (if test data is available)
    best_iter = trainer.best_ckpt_iteration
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, best_iter)
    output_path = os.path.join(trainer.model_dir,
                               "{:08d}.hyps".format(best_iter))
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
def train(cfg_file):
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    :return: None
    """
    cfg = load_config(cfg_file)

    # set the random seed
    # torch.backends.cudnn.deterministic = True
    seed = cfg["training"].get("random_seed", 42)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(cfg=cfg)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # print config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data, valid_data=dev_data,
                  test_data=test_data, src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    model.log_parameters_list(logging_function=trainer.logger.info)

    # FIX: was `logging.info(model)`, which wrote to the root logger instead
    # of the trainer's logger used everywhere else in this function
    trainer.logger.info(model)

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if test_data is not None:
        # reload the best checkpoint before testing
        trainer.load_checkpoint("{}/{}.ckpt".format(
            trainer.model_dir, trainer.best_ckpt_iteration))

        # test model
        if "testing" in cfg:  # idiom: membership test, not .keys()
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
        else:
            beam_size = 0
            beam_alpha = -1

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, criterion=None,
                beam_size=beam_size, beam_alpha=beam_alpha)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                .format(beam_size, beam_alpha)
            trainer.logger.info("Test data result: %f %s [%s]",
                                score, trainer.eval_metric,
                                decoding_description)
        else:
            # no gold references available for the test set
            trainer.logger.info(
                "No references given for %s.%s -> no evaluation.",
                cfg["data"]["test"], cfg["data"]["src"])

        # always save the test translations
        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write(h + "\n")
        trainer.logger.info("Test translations saved to: %s", output_path_set)
def train(cfg_file: str) -> None:
    """
    Main training function with optional data sharding. After training,
    also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # ensure the shard directory exists (exist_ok avoids the check-then-act
    # race of the previous os.path.exists + makedirs pair)
    shards_dir = os.path.dirname(cfg["data"]["shard_path"])
    os.makedirs(shards_dir, exist_ok=True)

    if cfg["data"].get("shard_data", False):
        assert cfg["data"].get(
            "n_shards", 0) > 0, "n_shards needs to exist and be at least 1"
        shard_data(path=cfg["data"]["train"],
                   src_lang=cfg["data"]["src"],
                   tgt_lang=cfg["data"]["trg"],
                   n_shards=cfg["data"]["n_shards"],
                   shard_path=cfg["data"]["shard_path"])

    # load the data; only load the full training set when not sharding
    # (was `True if ... else False` — the comparison is already a bool)
    load_train_whole = cfg["data"].get("n_shards", 0) < 1
    train_data, dev_data, test_data, src_vocab, trg_vocab, src_field, \
        trg_field = load_data(
            data_cfg=cfg["data"], load_train=load_train_whole)

    if not load_train_whole:
        # iterate over shards per epoch instead of the whole training set
        sharded_iterator = ShardedEpochDatasetIterator(
            n_shards=cfg["data"]["n_shards"],
            percent_to_sample=cfg["data"].get(
                "percent_to_sample_from_shard", 1.0),
            data_path=cfg["data"]["train"],
            shard_path=cfg["data"]["shard_path"],
            extensions=(cfg["data"]["src"], cfg["data"]["trg"]),
            fields=(src_field, trg_field),
            n_epochs=cfg["training"]["epochs"],
            filter_pred=lambda x: len(vars(x)[
                'src']) <= cfg["data"]["max_sent_length"] and len(
                    vars(x)['trg']) <= cfg["data"]["max_sent_length"])
    else:
        sharded_iterator = None

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    if load_train_whole:
        # data statistics are only meaningful when the full set is loaded
        log_data_info(train_data=train_data,
                      valid_data=dev_data,
                      test_data=test_data,
                      src_vocab=src_vocab,
                      trg_vocab=trg_vocab,
                      logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data,
                               sharded_iterator=sharded_iterator)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data, valid_data=dev_data,
                  test_data=test_data, src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # test the model with the best checkpoint
    if test_data is not None:

        # load checkpoint
        if trainer.best_ckpt_iteration > 0:
            checkpoint_path = "{}/{}.ckpt".format(trainer.model_dir,
                                                  trainer.best_ckpt_iteration)
        else:
            # for save_checkpoint by save_freq (no "best" iteration recorded)
            checkpoint_path = get_latest_checkpoint(trainer.model_dir)
        try:
            trainer.init_from_checkpoint(checkpoint_path)
        except AssertionError:
            trainer.logger.warning("Checkpoint %s does not exist. "
                                   "Skipping testing.", checkpoint_path)
            if trainer.best_ckpt_iteration == 0 \
                    and trainer.best_ckpt_score in [np.inf, -np.inf]:
                trainer.logger.warning(
                    "It seems like no checkpoint was written, "
                    "since no improvement was obtained over the initial model.")
            return

        # generate hypotheses for test data
        if "testing" in cfg:  # idiom: membership test, not .keys()
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
            return_logp = cfg["testing"].get("return_logp", False)
        else:
            beam_size = 0
            beam_alpha = -1
            return_logp = False

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores, log_probs = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, loss_function=None,
                beam_size=beam_size, beam_alpha=beam_alpha,
                return_logp=return_logp)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                .format(beam_size, beam_alpha)
            trainer.logger.info("Test data result: %f %s [%s]",
                                score, trainer.eval_metric,
                                decoding_description)
        else:
            # no gold references available for the test set
            trainer.logger.info(
                "No references given for %s.%s -> no evaluation.",
                cfg["data"]["test"], cfg["data"]["src"])

        # always save the test translations
        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write("{}\n".format(h))
        trainer.logger.info("Test translations saved to: %s", output_path_set)

        if return_logp:
            # also save per-hypothesis log probabilities
            output_path_set_logp = output_path_set + ".logp"
            with open(output_path_set_logp, mode="w", encoding="utf-8") as f:
                for logp in log_probs:  # was `l`: ambiguous name (PEP 8 E741)
                    f.write("{}\n".format(logp))
            trainer.logger.info("Test log probs saved to: %s",
                                output_path_set_logp)