def train(cfg_file: str) -> None:
    """
    Run a full training job described by a configuration file.

    The model directory and logger are created first, the random seed is
    set, data and vocabularies are loaded, the model is built and trained
    with validation, and `test` is finally called with the best checkpoint
    and the already loaded dev/test data.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]

    # model directory + logger (make_logger returns a version string)
    # TODO: save version number in model checkpoints
    model_dir = make_model_dir(
        train_cfg["model_dir"],
        overwrite=train_cfg.get("overwrite", False),
    )
    _ = make_logger(model_dir, mode="train")

    # reproducibility
    set_seed(seed=train_cfg.get("random_seed", 42))

    # data sets and vocabularies
    (train_data, dev_data, test_data,
     src_vocab, trg_vocab) = load_data(data_cfg=cfg["data"])

    # encoder-decoder model and its training manager
    # (early stopping, model selection, ...)
    model = build_model(
        cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the configuration next to the checkpoints
    shutil.copy2(cfg_file, f"{model_dir}/config.yaml")

    # log configuration, data statistics and the model itself
    log_cfg(cfg)
    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
    )
    logger.info(str(model))

    # write both vocabularies into the configured model directory
    vocab_dir = train_cfg["model_dir"]
    for side, vocab in (("src", src_vocab), ("trg", trg_vocab)):
        vocab.to_file(f"{vocab_dir}/{side}_vocab.txt")

    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # decode dev (and test, if available) with the best checkpoint
    best_iter = trainer.stats.best_ckpt_iter
    test(
        cfg_file,
        ckpt=f"{model_dir}/{best_iter}.ckpt",
        output_path=os.path.join(model_dir, f"{best_iter:08d}.hyps"),
        datasets={
            "dev": dev_data,
            "test": test_data,
            "src_vocab": src_vocab,
            "trg_vocab": trg_vocab,
        },
    )
def train(cfg_file: str) -> None:
    """
    Run a full training job described by a configuration file.

    Data is returned by `load_data` as a dict holding the train/dev/test
    sets and a `vocabs` mapping (field name -> vocabulary). After training,
    `test` is called with the best checkpoint.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    training_cfg = cfg["training"]
    data_section = cfg["data"]

    set_seed(seed=training_cfg.get("random_seed", 42))

    # unpack the loaded data bundle
    bundle = load_data(data_section)
    train_data = bundle["train_data"]
    dev_data = bundle["dev_data"]
    test_data = bundle["test_data"]
    vocabs = bundle["vocabs"]

    # encoder-decoder model and its training manager
    # (early stopping, model selection, ...)
    model = build_model(cfg["model"], vocabs=vocabs)
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the configuration in the model directory
    shutil.copy2(cfg_file, join(trainer.model_dir, "config.yaml"))

    # log configuration, data statistics and the model
    log_cfg(cfg, trainer.logger)
    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        vocabs=vocabs,
        logging_function=trainer.logger.info,
    )
    trainer.logger.info(str(model))

    # one vocabulary file per field, stored in the configured model dir
    vocab_dir = training_cfg["model_dir"]
    for name, vocabulary in vocabs.items():
        vocabulary.to_file(join(vocab_dir, f"{name}_vocab.txt"))

    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # decode validation (and test, if available) with the best checkpoint
    best_ckpt = join(trainer.model_dir, f"{trainer.best_ckpt_iteration}.ckpt")
    hyps_path = join(
        trainer.model_dir, f"{trainer.best_ckpt_iteration:08d}.hyps")
    test(cfg_file, ckpt=best_ckpt, output_path=hyps_path,
         logger=trainer.logger)
def train(cfg_file: str) -> None:
    """
    Run a training job that only uses a target vocabulary.

    The test split is not loaded here (`get_test=False`); data statistics
    are not logged and the vocabulary is not written to disk. After
    training, `test` is called with the best checkpoint and its result is
    returned to the caller (the `None` annotation is kept from the original
    signature).

    :param cfg_file: path to configuration yaml file
    :return: whatever `test` returns
    """
    cfg = load_config(cfg_file)

    set_seed(seed=cfg["training"].get("random_seed", 42))

    print('Loading data...')
    # third element of the returned tuple is not used here
    train_data, dev_data, _, trg_vocab = load_data(
        data_cfg=cfg["data"], get_test=False)

    print('Building model...')
    model = build_model(cfg["model"], trg_vocab=trg_vocab)

    # training manager: early stopping, model selection, ...
    trainer = TrainManager(model=model, config=cfg)
    run_dir = trainer.model_dir

    # keep a copy of the configuration in the model directory
    shutil.copy2(cfg_file, f"{run_dir}/config.yaml")

    # log configuration and model
    log_cfg(cfg, trainer.logger)
    trainer.logger.info(str(model))

    print('Initiating Training...')
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # decode with the best checkpoint
    best_iter = trainer.best_ckpt_iteration
    ckpt = f"{trainer.model_dir}/{best_iter}.ckpt"
    output_path = os.path.join(
        trainer.model_dir, f"{trainer.best_ckpt_iteration:08d}.hyps")
    return test(
        cfg_file,
        ckpt=ckpt,
        output_path=output_path,
        logger=trainer.logger,
        trg_vocab=trg_vocab,
    )
def train_norm(model, cfg_file: str, skip_test: bool = False) -> None:
    """
    Train an already constructed model according to a configuration file.

    Unlike the plain training entry point, the model is passed in by the
    caller instead of being built from the config. Only training with
    validation is performed; no test decoding is run afterwards.

    :param model: model instance handed to the `TrainManager`
    :param cfg_file: path to configuration yaml file
    :param skip_test: accepted for interface compatibility; this function
        does not read it
    """
    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]

    # model directory + logger (make_logger returns a version string)
    # TODO: save version number in model checkpoints
    model_dir = make_model_dir(
        train_cfg["model_dir"],
        overwrite=train_cfg.get("overwrite", False),
    )
    _ = make_logger(model_dir, mode="train")

    set_seed(seed=train_cfg.get("random_seed", 42))

    # data sets and vocabularies for the configured language pair
    (train_data, dev_data, test_data,
     src_vocab, trg_vocab) = load_data(
        data_cfg=data_cfg,
        src_lang=data_cfg.get("src"),
        trg_lang=data_cfg.get("trg"),
    )

    # training manager: early stopping, model selection, ...
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the configuration in the model directory
    shutil.copy2(cfg_file, f"{model_dir}/config.yaml")

    # log configuration, data statistics and the model
    log_cfg(cfg)
    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
    )
    logger.info(str(model))

    # write both vocabularies into the configured model directory
    vocab_dir = train_cfg["model_dir"]
    for side, vocab in (("src", src_vocab), ("trg", trg_vocab)):
        vocab.to_file(f"{vocab_dir}/{side}_vocab.txt")

    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)
def train(cfg_file: str) -> None:
    """
    Main training function with optional sharding of the training data.

    If `data.shard_data` is set, the training corpus is first split into
    `data.n_shards` shards. If `data.n_shards` is at least 1, the training
    set is not loaded as a whole; a `ShardedEpochDatasetIterator` is passed
    to the trainer instead. After training, `test` is called with the best
    checkpoint.

    :param cfg_file: path to configuration yaml file
    :raises AssertionError: if sharding is requested without a positive
        `n_shards`
    """
    cfg = load_config(cfg_file)
    data_cfg = cfg["data"]

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # Make sure the directory that will hold the shards exists.
    # dirname() is "" when shard_path has no directory component, and
    # os.makedirs("") raises, so only create a directory when there is one.
    shards_dir = os.path.dirname(data_cfg["shard_path"])
    if shards_dir:
        os.makedirs(shards_dir, exist_ok=True)

    if data_cfg.get("shard_data", False):
        assert data_cfg.get("n_shards", 0) > 0, \
            "n_shards needs to exist and be at least 1"
        shard_data(path=data_cfg["train"],
                   src_lang=data_cfg["src"],
                   tgt_lang=data_cfg["trg"],
                   n_shards=data_cfg["n_shards"],
                   shard_path=data_cfg["shard_path"])

    # load the data; the full training set is only loaded without shards
    load_train_whole = data_cfg.get("n_shards", 0) < 1
    train_data, dev_data, test_data, src_vocab, trg_vocab, \
        src_field, trg_field = load_data(data_cfg=data_cfg,
                                         load_train=load_train_whole)

    if load_train_whole:
        sharded_iterator = None
    else:
        def _within_max_length(example) -> bool:
            """Keep examples whose src and trg fit `max_sent_length`."""
            # looked up per call, like the original filter predicate
            max_len = cfg["data"]["max_sent_length"]
            return (len(vars(example)['src']) <= max_len
                    and len(vars(example)['trg']) <= max_len)

        sharded_iterator = ShardedEpochDatasetIterator(
            n_shards=data_cfg["n_shards"],
            percent_to_sample=data_cfg.get(
                "percent_to_sample_from_shard", 1.0),
            data_path=data_cfg["train"],
            shard_path=data_cfg["shard_path"],
            extensions=(data_cfg["src"], data_cfg["trg"]),
            fields=(src_field, trg_field),
            n_epochs=cfg["training"]["epochs"],
            filter_pred=_within_max_length)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    # data statistics are only logged when the whole training set is loaded
    if load_train_whole:
        log_data_info(train_data=train_data, valid_data=dev_data,
                      test_data=test_data, src_vocab=src_vocab,
                      trg_vocab=trg_vocab,
                      logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data,
                               sharded_iterator=sharded_iterator)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
def Q_learning(cfg_file: str) -> None:
    """
    Train a translation model with a DQN-style Q-learning loop.

    Two identical models are built: `policy_net` (the Q network that is
    optimised) and `target_net` (the Q_hat network, periodically synced
    from the policy network). For every training batch the policy network
    greedily translates the sources, each translation is scored against its
    reference, and the resulting transitions are pushed into a replay
    memory. Once the memory is full, `inner_epochs` optimisation steps are
    run on mini-batches sampled from it. Every `validation_freq` steps the
    policy network is validated on the dev data and a checkpoint is written
    when the validation result is the best so far.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)  # config is a dict

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    print("loadding data here")
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])
    # The training data is filtered to include sentences up to
    # `max_sent_length` on source and target side.

    # training config
    train_config = cfg["training"]
    shuffle = train_config.get("shuffle", True)
    batch_size = train_config["batch_size"]
    mini_BATCH_SIZE = train_config["mini_batch_size"]
    batch_type = train_config.get("batch_type", "sentence")
    outer_epochs = train_config.get("outer_epochs", 10)
    inner_epochs = train_config.get("inner_epochs", 10)
    TARGET_UPDATE = train_config.get("target_update", 10)
    Gamma = train_config.get("Gamma", 0.999)
    use_cuda = train_config["use_cuda"] and torch.cuda.is_available()

    # validation config
    validation_freq = train_config.get("validation_freq", 1000)
    ckpt_queue = queue.Queue(maxsize=train_config.get("keep_last_ckpts", 5))
    eval_batch_size = train_config.get("eval_batch_size", batch_size)
    level = cfg["data"]["level"]
    eval_metric = train_config.get("eval_metric", "bleu")
    n_gpu = torch.cuda.device_count() if use_cuda else 0
    eval_batch_type = train_config.get("eval_batch_type", batch_type)

    # eval options
    test_config = cfg["testing"]
    bpe_type = test_config.get("bpe_type", "subword-nmt")
    sacrebleu = {"remove_whitespace": True, "tokenize": "13a"}
    max_output_length = train_config.get("max_output_length", None)
    minimize_metric = True

    # initialize training statistics
    stats = TrainStatistics(
        steps=0,
        stop=False,
        total_tokens=0,
        best_ckpt_iter=0,
        best_ckpt_score=np.inf if minimize_metric else -np.inf,
        minimize_metric=minimize_metric)

    early_stopping_metric = train_config.get("early_stopping_metric",
                                             "eval_metric")
    if early_stopping_metric in ["ppl", "loss"]:
        stats.minimize_metric = True
        stats.best_ckpt_score = np.inf
    elif early_stopping_metric == "eval_metric":
        if eval_metric in [
                "bleu", "chrf", "token_accuracy", "sequence_accuracy"
        ]:
            stats.minimize_metric = False
            stats.best_ckpt_score = -np.inf
        # eval metric that has to get minimized (not yet implemented)
        else:
            stats.minimize_metric = True

    # data loader (modified from the train_and_validate function)
    train_iter = make_data_iter(train_data,
                                batch_size=batch_size,
                                batch_type=batch_type,
                                train=True,
                                shuffle=shuffle)

    # initialize the replay memory D with capacity N
    memory = ReplayMemory(10000)
    steps_done = 0

    # initialize the two DQN networks
    policy_net = build_model(cfg["model"],
                             src_vocab=src_vocab,
                             trg_vocab=trg_vocab)  # Q_network
    target_net = build_model(cfg["model"],
                             src_vocab=src_vocab,
                             trg_vocab=trg_vocab)  # Q_hat_network
    if use_cuda:
        policy_net.cuda()
        target_net.cuda()

    # Q_hat starts with the same weights as the policy network and is kept
    # in eval mode; it is never handed to the optimizer
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # optimizer
    optimizer = build_optimizer(config=cfg["training"],
                                parameters=policy_net.parameters())

    # loss functions
    mse_loss = torch.nn.MSELoss()
    pad_index = policy_net.pad_index
    cross_entropy_loss = XentLoss(pad_index=pad_index)
    policy_net.loss_function = cross_entropy_loss

    # learning rate scheduling
    scheduler, scheduler_step_at = build_scheduler(
        config=train_config,
        scheduler_mode="min" if minimize_metric else "max",
        optimizer=optimizer,
        hidden_size=cfg["model"]["encoder"]["hidden_size"])

    # optionally warm-start from an existing checkpoint
    if "load_model" in train_config.keys():
        load_model_path = train_config["load_model"]
        reset_best_ckpt = train_config.get("reset_best_ckpt", False)
        reset_scheduler = train_config.get("reset_scheduler", False)
        reset_optimizer = train_config.get("reset_optimizer", False)
        reset_iter_state = train_config.get("reset_iter_state", False)
        print('settings', reset_best_ckpt, reset_iter_state,
              reset_optimizer, reset_scheduler)

        logger.info("Loading model from %s", load_model_path)
        model_checkpoint = load_checkpoint(path=load_model_path,
                                           use_cuda=use_cuda)

        # restore model and optimizer parameters
        policy_net.load_state_dict(model_checkpoint["model_state"])

        if not reset_optimizer:
            optimizer.load_state_dict(model_checkpoint["optimizer_state"])
        else:
            logger.info("Reset optimizer.")

        if not reset_scheduler:
            # checkpoints written by this function carry no
            # "scheduler_state" entry, so a missing key must not fail
            scheduler_state = model_checkpoint.get("scheduler_state")
            if scheduler_state is not None and scheduler is not None:
                scheduler.load_state_dict(scheduler_state)
        else:
            logger.info("Reset scheduler.")

        if not reset_best_ckpt:
            stats.best_ckpt_score = model_checkpoint["best_ckpt_score"]
            stats.best_ckpt_iter = model_checkpoint["best_ckpt_iteration"]
            print('stats.best_ckpt_score', stats.best_ckpt_score)
            print('stats.best_ckpt_iter', stats.best_ckpt_iter)
        else:
            logger.info("Reset tracking of the best checkpoint.")

        if (not reset_iter_state and model_checkpoint.get(
                'train_iter_state', None) is not None):
            # NOTE(review): read from the checkpoint but not applied to
            # train_iter anywhere in this function
            train_iter_state = model_checkpoint["train_iter_state"]

    # re-sync Q_hat (the policy weights may just have been replaced) and
    # move the parameters to cuda
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    if use_cuda:
        policy_net.cuda()
        target_net.cuda()

    for i_episode in range(outer_epochs):  # outer loop
        for batch in iter(train_iter):
            # create a Batch object from the torchtext batch
            # (sentences of one batch are padded to the same length)
            batch = Batch(batch, policy_net.pad_index, use_cuda=use_cuda)

            # sources are represented by word indices, not embeddings
            encoder_output_batch, _, _, _ = policy_net(
                return_type="encode",
                src=batch.src,
                src_length=batch.src_length,
                src_mask=batch.src_mask,
            )

            trans_output_batch, _ = transformer_greedy(
                src_mask=batch.src_mask,
                max_output_length=max_output_length,
                model=policy_net,
                encoder_output=encoder_output_batch,
                steps_done=steps_done,
                use_cuda=use_cuda)
            steps_done += 1

            print('batch.trg.shape is:', batch.trg.shape)
            print('trans_output_batch', trans_output_batch)

            # reward: score of every translated sentence in the batch
            reward_batch = []
            for sent_idx in range(int(batch.src.shape[0])):
                # first output position is dropped before scoring
                all_outputs = [(trans_output_batch[sent_idx])[1:]]
                all_ref = [batch.trg[sent_idx]]
                sentence_score = calculate_bleu(model=policy_net,
                                                level=level,
                                                raw_hypo=all_outputs,
                                                raw_ref=all_ref)
                reward_batch.append(sentence_score)
            print('reward batch is', reward_batch)
            reward_batch = torch.tensor(reward_batch, dtype=torch.float)

            # make prefixes and push the tuples into memory
            push_sample_to_memory(model=policy_net,
                                  level=level,
                                  eos_index=policy_net.eos_index,
                                  memory=memory,
                                  src_batch=batch.src,
                                  trg_batch=batch.trg,
                                  trans_output_batch=trans_output_batch,
                                  reward_batch=reward_batch,
                                  max_output_length=max_output_length)
            print(memory.capacity, len(memory.memory))

            # optimisation only starts once the replay memory is full
            if len(memory.memory) != memory.capacity:
                continue

            # inner loop
            for t in range(inner_epochs):
                # sample a mini-batch of transitions and regroup it
                # field-wise: one tuple per Transition field
                transitions = memory.sample(mini_BATCH_SIZE)
                mini_batch = Transition(*zip(*transitions))

                # concatenate the fields into tensors
                mini_next_word = torch.cat(
                    [word.unsqueeze(0) for word in mini_batch.next_word])
                mini_reward = torch.tensor(mini_batch.reward)
                mini_is_eos = torch.Tensor(mini_batch.finish)

                mini_src_length = torch.Tensor(
                    [len(item) for item in mini_batch.source_sentence])
                mini_src = pad_sequence(mini_batch.source_sentence,
                                        batch_first=True,
                                        padding_value=float(pad_index))

                mini_prefix_length = torch.Tensor(
                    [len(item) for item in mini_batch.prefix])
                mini_prefix = pad_sequence(
                    [torch.from_numpy(prefix_)
                     for prefix_ in mini_batch.prefix],
                    batch_first=True,
                    padding_value=pad_index)

                mini_src_mask = (mini_src != pad_index).unsqueeze(1)
                mini_trg_mask = (mini_prefix != pad_index).unsqueeze(1)

                if use_cuda:
                    mini_src = mini_src.cuda()
                    mini_prefix = mini_prefix.cuda()
                    mini_src_mask = mini_src_mask.cuda()
                    mini_src_length = mini_src_length.cuda()
                    mini_trg_mask = mini_trg_mask.cuda()
                    mini_next_word = mini_next_word.cuda()

                # Q(s, a): policy-net logits at the last prefix position,
                # picked at the index of the word that was actually taken
                logits_Q, _, _, _ = policy_net._encode_decode(
                    src=mini_src,
                    trg_input=mini_prefix,
                    src_mask=mini_src_mask,
                    src_length=mini_src_length,
                    trg_mask=mini_trg_mask)
                logits_Q = logits_Q[range(mini_BATCH_SIZE),
                                    mini_prefix_length.long() - 1, :]
                Q_value = logits_Q[range(mini_BATCH_SIZE), mini_next_word]

                # next state: the prefix extended by the taken word
                mini_prefix_add = torch.cat(
                    [mini_prefix, mini_next_word.unsqueeze(1)], dim=1)
                mini_trg_mask_add = (mini_prefix_add !=
                                     pad_index).unsqueeze(1)
                if use_cuda:
                    mini_prefix_add = mini_prefix_add.cuda()
                    mini_trg_mask_add = mini_trg_mask_add.cuda()

                # max_a' Q_hat(s', a') from the target network
                logits_Q_hat, _, _, _ = target_net._encode_decode(
                    src=mini_src,
                    trg_input=mini_prefix_add,
                    src_mask=mini_src_mask,
                    src_length=mini_src_length,
                    trg_mask=mini_trg_mask_add)
                logits_Q_hat = logits_Q_hat[range(mini_BATCH_SIZE),
                                            mini_prefix_length.long(), :]
                Q_hat_value, _ = torch.max(logits_Q_hat, dim=1)

                if use_cuda:
                    Q_hat_value = Q_hat_value.cuda()
                    mini_reward = mini_reward.cuda()
                    mini_is_eos = mini_is_eos.cuda()

                # regression target, detached from the graph
                yj = _td_targets(reward=mini_reward,
                                 next_q=Q_hat_value,
                                 is_terminal=mini_is_eos,
                                 gamma=Gamma)

                # optimize the model
                policy_net.zero_grad()
                loss = mse_loss(yj, Q_value)
                print('loss', loss)
                logger.info("step = {}, loss = {}".format(
                    stats.steps, loss.item()))
                loss.backward()
                optimizer.step()
                stats.steps += 1

                if stats.steps % TARGET_UPDATE == 0:
                    # refresh Q_hat with the current policy weights
                    target_net.load_state_dict(policy_net.state_dict())

                if stats.steps % validation_freq == 0:
                    print('Start validation')
                    valid_score, valid_loss, valid_ppl, _, _, _, _, _, _ = \
                        validate_on_data(
                            model=policy_net,
                            data=dev_data,
                            batch_size=eval_batch_size,
                            use_cuda=use_cuda,
                            level=level,
                            eval_metric=eval_metric,
                            n_gpu=n_gpu,
                            compute_loss=True,
                            beam_size=1,
                            beam_alpha=-1,
                            batch_type=eval_batch_type,
                            postprocess=True,
                            bpe_type=bpe_type,
                            sacrebleu=sacrebleu,
                            max_output_length=max_output_length
                        )
                    print(
                        'validation_loss: {}, validation_score: {}'.format(
                            valid_loss, valid_score))
                    logger.info(valid_loss)
                    print('average loss: total_loss/n_tokens:', valid_ppl)

                    if early_stopping_metric == "loss":
                        ckpt_score = valid_loss
                    elif early_stopping_metric in ["ppl", "perplexity"]:
                        ckpt_score = valid_ppl
                    else:
                        ckpt_score = valid_score

                    if stats.is_best(ckpt_score):
                        stats.best_ckpt_score = ckpt_score
                        stats.best_ckpt_iter = stats.steps
                        logger.info(
                            'Hooray! New best validation result [%s]!',
                            early_stopping_metric)
                        if ckpt_queue.maxsize > 0:
                            logger.info("Saving new checkpoint.")
                            _save_q_checkpoint(policy_net=policy_net,
                                               optimizer=optimizer,
                                               stats=stats,
                                               model_dir=model_dir,
                                               ckpt_queue=ckpt_queue)


def _td_targets(reward, next_q, is_terminal, gamma):
    """
    Compute the Q-learning regression target for a mini-batch.

    The target is `reward` for transitions flagged as terminal and
    `reward + gamma * next_q` otherwise. The result is detached so that no
    gradient flows through the target.

    :param reward: per-transition rewards
    :param next_q: per-transition maximum Q_hat value of the next state
    :param is_terminal: per-transition flags; non-zero marks a finished
        sequence
    :param gamma: discount factor
    :return: detached float tensor of targets
    """
    reward = reward.float()
    bootstrapped = reward + gamma * next_q
    # a boolean mask is required here: indexing with the 0/1 flags as
    # integers would only ever address elements 0 and 1
    return torch.where(is_terminal.bool(), reward, bootstrapped).detach()


def _save_q_checkpoint(policy_net, optimizer, stats, model_dir,
                       ckpt_queue) -> None:
    """
    Save the model's current parameters and the training state to a
    checkpoint.

    The training state contains the total number of training steps, the
    total number of training tokens, the best checkpoint score and
    iteration so far, and the optimizer state. When `ckpt_queue` is full,
    the oldest checkpoint file is removed. `best.ckpt` is updated to point
    at the new checkpoint.

    :param policy_net: model whose parameters are stored
    :param optimizer: optimizer whose state is stored
    :param stats: training statistics (steps, tokens, best score/iteration)
    :param model_dir: directory the checkpoint is written to
    :param ckpt_queue: queue holding the paths of the kept checkpoints
    """
    model_path = "{}/{}.ckpt".format(model_dir, stats.steps)
    model_state_dict = policy_net.module.state_dict() \
        if isinstance(policy_net, torch.nn.DataParallel) \
        else policy_net.state_dict()
    state = {
        "steps": stats.steps,
        "total_tokens": stats.total_tokens,
        "best_ckpt_score": stats.best_ckpt_score,
        "best_ckpt_iteration": stats.best_ckpt_iter,
        "model_state": model_state_dict,
        "optimizer_state": optimizer.state_dict(),
    }
    torch.save(state, model_path)

    if ckpt_queue.full():
        to_delete = ckpt_queue.get()  # delete oldest ckpt
        try:
            os.remove(to_delete)
        except FileNotFoundError:
            logger.warning(
                "Wanted to delete old checkpoint %s but "
                "file does not exist.", to_delete)
    ckpt_queue.put(model_path)

    best_path = "{}/best.ckpt".format(model_dir)
    try:
        # create/modify symbolic link for best checkpoint
        symlink_update("{}.ckpt".format(stats.steps), best_path)
    except OSError:
        # overwrite best.ckpt
        torch.save(state, best_path)
def train(cfg_file: str) -> None:
    """
    Train a model and, if test data is given, decode it afterwards.

    After training, the best checkpoint (or the latest one if no best
    checkpoint iteration was recorded) is loaded into the trainer. The test
    set is then decoded with the decoding options from the optional
    `testing` section of the config, the score is logged when references
    are available, and hypotheses (plus log probabilities if requested) are
    written into the model directory.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    set_seed(seed=cfg["training"].get("random_seed", 42))

    (train_data, dev_data, test_data,
     src_vocab, trg_vocab) = load_data(data_cfg=cfg["data"])

    model = build_model(
        cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # training manager: early stopping, model selection, ...
    trainer = TrainManager(model=model, config=cfg)

    # keep a copy of the configuration in the model directory
    shutil.copy2(cfg_file, f"{trainer.model_dir}/config.yaml")

    # log configuration and data statistics
    log_cfg(cfg, trainer.logger)
    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
        logging_function=trainer.logger.info,
    )

    # write both vocabularies into the configured model directory
    vocab_dir = cfg["training"]["model_dir"]
    for side, vocab in (("src", src_vocab), ("trg", trg_vocab)):
        vocab.to_file(f"{vocab_dir}/{side}_vocab.txt")

    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # nothing left to do without a test set
    if test_data is None:
        return

    # pick the checkpoint to test with
    if trainer.best_ckpt_iteration > 0:
        checkpoint_path = f"{trainer.model_dir}/{trainer.best_ckpt_iteration}.ckpt"
    else:
        # checkpoints may have been written by save_freq only
        checkpoint_path = get_latest_checkpoint(trainer.model_dir)

    try:
        trainer.init_from_checkpoint(checkpoint_path)
    except AssertionError:
        trainer.logger.warning(
            "Checkpoint %s does not exist. "
            "Skipping testing.", checkpoint_path)
        no_best_recorded = (
            trainer.best_ckpt_iteration == 0
            and trainer.best_ckpt_score in [np.inf, -np.inf]
        )
        if no_best_recorded:
            trainer.logger.warning(
                "It seems like no checkpoint was written, "
                "since no improvement was obtained over the initial model."
            )
        return

    # decoding options, with defaults when no "testing" section exists
    testing_cfg = cfg["testing"] if "testing" in cfg else {}
    beam_size = testing_cfg.get("beam_size", 0)
    beam_alpha = testing_cfg.get("alpha", -1)
    return_logp = testing_cfg.get("return_logp", False)

    # generate hypotheses for the test data; only score, hypotheses and
    # log probabilities of the ten returned values are used
    score, _, _, _, _, _, hypotheses, _, _, log_probs = validate_on_data(
        data=test_data,
        batch_size=trainer.batch_size,
        eval_metric=trainer.eval_metric,
        level=trainer.level,
        max_output_length=trainer.max_output_length,
        model=model,
        use_cuda=trainer.use_cuda,
        loss_function=None,
        beam_size=beam_size,
        beam_alpha=beam_alpha,
        return_logp=return_logp,
    )

    if "trg" not in test_data.fields:
        trainer.logger.info(
            "No references given for %s.%s -> no evaluation.",
            cfg["data"]["test"], cfg["data"]["src"])
    else:
        if beam_size == 0:
            decoding_description = "Greedy decoding"
        else:
            decoding_description = \
                "Beam search decoding with beam size = {} and alpha = {}"\
                .format(beam_size, beam_alpha)
        trainer.logger.info("Test data result: %f %s [%s]", score,
                            trainer.eval_metric, decoding_description)

    # one hypothesis per line
    output_path_set = "{}/{}.{}".format(
        trainer.model_dir, "test", cfg["data"]["trg"])
    with open(output_path_set, mode="w", encoding="utf-8") as out_file:
        out_file.writelines(f"{hyp}\n" for hyp in hypotheses)
    trainer.logger.info("Test translations saved to: %s", output_path_set)

    if return_logp:
        # one log-probability entry per line, next to the hypotheses
        output_path_set_logp = output_path_set + ".logp"
        with open(output_path_set_logp, mode="w",
                  encoding="utf-8") as logp_file:
            logp_file.writelines(f"{logp}\n" for logp in log_probs)
        trainer.logger.info("Test log probs saved to: %s",
                            output_path_set_logp)
def train_transfer(cfg_file: str) -> None:
    """
    Pre-train a model, then train a second model initialised from it.

    Phase one trains and tests a model on `pretrained_data` using the
    `pretraining` section of the config. Phase two loads `data`, builds a
    model from the pre-trained one via `build_pretrained_model`, and trains
    and tests it using the `training` section.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    def _run_phase(model, splits, src_vocab, trg_vocab,
                   training_key, data_key, manager_kwargs):
        """Train `model` on `splits` and test its best checkpoint."""
        train_split, dev_split, test_split = splits

        # training manager: early stopping, model selection, ...
        trainer = TrainManager(model=model, config=cfg, **manager_kwargs)

        # keep a copy of the configuration in the model directory
        shutil.copy2(cfg_file, f"{trainer.model_dir}/config.yaml")

        # log configuration, data statistics and the model
        log_cfg(cfg, trainer.logger)
        log_data_info(
            train_data=train_split,
            valid_data=dev_split,
            test_data=test_split,
            src_vocab=src_vocab,
            trg_vocab=trg_vocab,
            logging_function=trainer.logger.info,
        )
        trainer.logger.info(str(model))

        # write both vocabularies into this phase's model directory
        vocab_dir = cfg[training_key]["model_dir"]
        src_vocab.to_file(f"{vocab_dir}/src_vocab.txt")
        vocab_dir = cfg[training_key]["model_dir"]
        trg_vocab.to_file(f"{vocab_dir}/trg_vocab.txt")

        trainer.train_and_validate(train_data=train_split,
                                   valid_data=dev_split)

        # decode validation (and test, if available) with the best model
        ckpt = f"{trainer.model_dir}/{trainer.best_ckpt_iteration}.ckpt"
        hyps_name = f"{trainer.best_ckpt_iteration:08d}.hyps"
        test(
            cfg_file,
            ckpt=ckpt,
            output_path=os.path.join(trainer.model_dir, hyps_name),
            logger=trainer.logger,
            key_training=training_key,
            key_data=data_key,
        )

    # ---- phase 1: pre-training ----
    set_seed(seed=cfg["pretraining"].get("random_seed", 42))

    (pre_train_data, pre_dev_data, pre_test_data,
     pre_src_vocab, pre_trg_vocab) = load_data(
        data_cfg=cfg["pretrained_data"])

    pretrained_model = build_model(
        cfg["model"], src_vocab=pre_src_vocab, trg_vocab=pre_trg_vocab)

    _run_phase(
        model=pretrained_model,
        splits=(pre_train_data, pre_dev_data, pre_test_data),
        src_vocab=pre_src_vocab,
        trg_vocab=pre_trg_vocab,
        training_key="pretraining",
        data_key="pretrained_data",
        manager_kwargs={"training_key": "pretraining",
                        "name_log": "pre_train"},
    )

    # ---- phase 2: training initialised from the pre-trained model ----
    set_seed(seed=cfg["training"].get("random_seed", 42))

    (train_data, dev_data, test_data,
     src_vocab, trg_vocab) = load_data(data_cfg=cfg["data"])

    model = build_pretrained_model(
        cfg["model"],
        pretrained_model=pretrained_model,
        pretrained_src_vocab=pre_src_vocab,
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
    )

    _run_phase(
        model=model,
        splits=(train_data, dev_data, test_data),
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
        training_key="training",
        data_key="data",
        manager_kwargs={"training_key": "training"},
    )
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    If the data config sets `kb_task`, the knowledge-base related outputs
    of `load_data` are passed on to the trainer and the `trv` vocabulary is
    stored next to the source and target vocabularies.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    kb_task = bool(cfg["data"].get("kb_task", False))

    # load the data; the test-split knowledge-base values are not needed
    # in this function and are bound to throwaway names
    (train_data, dev_data, test_data,
     src_vocab, trg_vocab,
     train_kb, dev_kb, _test_kb,
     train_kb_lookup, dev_kb_lookup, _test_kb_lookup,
     train_kb_lengths, dev_kb_lengths, _test_kb_lengths,
     train_kb_truvals, dev_kb_truvals, _test_kb_truvals,
     trv_vocab, canonizer,
     dev_data_canon, _test_data_canon) = load_data(data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        trv_vocab=trv_vocab,
                        canonizer=canonizer)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, os.path.join(trainer.model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    vocab_dir = cfg["training"]["model_dir"]
    src_vocab.to_file(os.path.join(vocab_dir, "src_vocab.txt"))
    trg_vocab.to_file(os.path.join(vocab_dir, "trg_vocab.txt"))
    if kb_task:
        # the trv vocabulary is only written for knowledge-base tasks
        trv_vocab.to_file(os.path.join(vocab_dir, "trv_vocab.txt"))

    # train the model
    trainer.train_and_validate(
        train_data=train_data,
        valid_data=dev_data,
        kb_task=kb_task,
        train_kb=train_kb,
        train_kb_lkp=train_kb_lookup,
        train_kb_lens=train_kb_lengths,
        train_kb_truvals=train_kb_truvals,
        valid_kb=dev_kb,
        valid_kb_lkp=dev_kb_lookup,
        valid_kb_lens=dev_kb_lengths,
        valid_kb_truvals=dev_kb_truvals,
        valid_data_canon=dev_data_canon)

    # predict with the best model on validation and test
    # (if test data is available)
    best_iter = trainer.best_ckpt_iteration
    ckpt = os.path.join(trainer.model_dir, "{}.ckpt".format(best_iter))
    output_path = os.path.join(trainer.model_dir,
                               "{:08d}.hyps".format(best_iter))
    test(cfg_file, ckpt=ckpt, output_path=output_path,
         logger=trainer.logger)
def train(cfg_file: str) -> None:
    """
    Main training function for the multi-task setup.

    One (train, dev) dataset pair is loaded per entry of the parallel
    `src` / `trg` lists in the data config, and all pairs are handed to
    `trainer.maml_train_and_validate`. The vocabularies are built from the
    training data of the first task only (or read from the vocabulary files
    given in the data config).

    :param cfg_file: path to configuration yaml file
    :raises ValueError: if the data config has no `src` tasks, or fewer
        `trg` entries than `src` entries
    """
    cfg = load_config(cfg_file)
    data_cfg = cfg["data"]
    train_cfg = cfg["training"]

    # make logger
    model_dir = make_model_dir(train_cfg["model_dir"],
                               overwrite=train_cfg.get("overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=train_cfg.get("random_seed", 42))

    # load the data: one task per (src, trg) language pair
    src_tasks = data_cfg.get("src")
    trg_tasks = data_cfg.get("trg")
    if not src_tasks or trg_tasks is None or len(trg_tasks) < len(src_tasks):
        # fail early with a readable message instead of a TypeError or
        # IndexError further down
        raise ValueError(
            "data config needs non-empty 'src' and a 'trg' entry for "
            "every 'src' entry")

    train_tasks_list = []
    valid_tasks_list = []
    # zip stops after the last src entry; surplus trg entries are ignored
    for src_lang, trg_lang in zip(src_tasks, trg_tasks):
        train_data, dev_data, _, _, _ = load_data(data_cfg=data_cfg,
                                                  src_lang=src_lang,
                                                  trg_lang=trg_lang)
        train_tasks_list.append(train_data)
        valid_tasks_list.append(dev_data)

    # build vocabulary (from the first task's training data only)
    logger.info("Building vocabulary...")
    src_max_size = data_cfg.get("src_voc_limit", sys.maxsize)
    src_min_freq = data_cfg.get("src_voc_min_freq", 1)
    trg_max_size = data_cfg.get("trg_voc_limit", sys.maxsize)
    trg_min_freq = data_cfg.get("trg_voc_min_freq", 1)

    src_vocab = build_vocab(field="src",
                            min_freq=src_min_freq,
                            max_size=src_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=data_cfg.get("src_vocab", None))
    trg_vocab = build_vocab(field="trg",
                            min_freq=trg_min_freq,
                            max_size=trg_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=data_cfg.get("trg_vocab", None))

    # build an encoder-decoder model
    # The target side must use the target vocabulary that is built above
    # and saved below; previously `src_vocab` was passed for both sides.
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, os.path.join(model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg)
    # NOTE(review): log_data_info is not called here (the call was
    # commented out); presumably because no single train/dev/test split
    # exists in the multi-task setup -- confirm.

    logger.info(str(model))

    # store the vocabs
    vocab_dir = train_cfg["model_dir"]
    src_vocab.to_file(os.path.join(vocab_dir, "src_vocab.txt"))
    trg_vocab.to_file(os.path.join(vocab_dir, "trg_vocab.txt"))

    # train the model
    trainer.maml_train_and_validate(train_tasks=train_tasks_list,
                                    valid_tasks=valid_tasks_list)

    # locations of the best checkpoint and of its hypotheses file
    # NOTE(review): these values are not used in the visible code;
    # presumably a follow-up evaluation step consumes them -- confirm.
    best_iter = trainer.stats.best_ckpt_iter
    ckpt = os.path.join(model_dir, "{}.ckpt".format(best_iter))
    output_name = "{:08d}.hyps".format(best_iter)
    output_path = os.path.join(model_dir, output_name)