def main(opt_): if opt_.pretrained: net, dictionary = load_model(opt_.pretrained, opt_) net.opt.dataset_name = opt_.dataset_name net.opt.reddit_folder = opt_.reddit_folder net.opt.reactonly = opt_.reactonly net.opt.max_hist_len = opt_.max_hist_len env = TrainEnvironment(net.opt, dictionary) if opt_.cuda: net = torch.nn.DataParallel(net.cuda()) valid_data = env.build_valid_dataloader(False) test_data = env.build_valid_dataloader(False, test=True) with torch.no_grad(): logging.info("Validating on the valid set -unshuffled") validate( 0, net, valid_data, is_test=False, nb_candidates=opt_.hits_at_nb_cands, is_shuffled=False, ) logging.info("Validating on the hidden test set -unshuffled") validate( 0, net, test_data, is_test=True, nb_candidates=opt_.hits_at_nb_cands, is_shuffled=False, ) valid_data = env.build_valid_dataloader(True) test_data = env.build_valid_dataloader(True, test=True) with torch.no_grad(): logging.info("Validating on the valid set -shuffle") validate( 0, net, valid_data, is_test=False, nb_candidates=opt_.hits_at_nb_cands, is_shuffled=True, ) logging.info("Validating on the hidden test set -shuffle") validate( 0, net, test_data, is_test=True, nb_candidates=opt_.hits_at_nb_cands, is_shuffled=True, ) else: train_model(opt_)
def train_model(opt_): env = TrainEnvironment(opt_) dictionary = env.dict if opt_.load_checkpoint: net, dictionary = load_model(opt_.load_checkpoint, opt_) env = TrainEnvironment(opt_, dictionary) env.dict = dictionary else: net = create_model(opt_, dictionary["words"]) if opt_.embeddings and opt_.embeddings != "None": load_embeddings(opt_, dictionary["words"], net) paramnum = 0 trainable = 0 for name, parameter in net.named_parameters(): if parameter.requires_grad: trainable += parameter.numel() paramnum += parameter.numel() print("TRAINABLE", paramnum, trainable) if opt_.cuda: net = torch.nn.DataParallel(net) net = net.cuda() if opt_.optimizer == "adamax": lr = opt_.learning_rate or 0.002 named_params_to_optimize = filter(lambda p: p[1].requires_grad, net.named_parameters()) params_to_optimize = (p[1] for p in named_params_to_optimize) optimizer = optim.Adamax(params_to_optimize, lr=lr) if opt_.epoch_start != 0: saved_params = torch.load( opt_.load_checkpoint, map_location=lambda storage, loc: storage) optimizer.load_state_dict(saved_params["optim_dict"]) else: lr = opt_.learning_rate or 0.01 optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=lr) start_time = time.time() best_loss = float("+inf") test_data_shuffled = env.build_valid_dataloader(True) test_data_not_shuffled = env.build_valid_dataloader(False) with torch.no_grad(): validate( 0, net, test_data_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="shuffled", ) train_data = None for epoch in range(opt_.epoch_start, opt_.num_epochs): if train_data is None or opt_.dataset_name == "reddit": train_data = env.build_train_dataloader(epoch) train(epoch, start_time, net, optimizer, opt_, train_data) with torch.no_grad(): # We compute the loss both for shuffled and not shuffled case. # however, the loss that determines if the model is better is the # same as the one used for training. loss_shuffled = validate( epoch, net, test_data_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="shuffled", ) loss_not_shuffled = validate( epoch, net, test_data_not_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="not-shuffled", ) if opt_.no_shuffle: loss = loss_not_shuffled else: loss = loss_shuffled if loss < best_loss: best_loss = loss best_loss_epoch = epoch logging.info( f"New best loss, saving model to {opt_.model_file}") save_model(opt_.model_file, net, dictionary, optimizer) # Stop if it's been too many epochs since the loss has decreased if opt_.stop_crit_num_epochs != -1: if epoch - best_loss_epoch >= opt_.stop_crit_num_epochs: break return net, dictionary
no_shuffle=False, normalize_emb=False, normalize_sent_emb=False, num_epochs=10000, optimizer='adamax', pretrained=None, random_seed=92179, reactonly=False, reddit_folder='reddit', rm_long_contexts=False, rm_long_sent=False, stop_crit_num_epochs=-1, transformer_dim=512, transformer_dropout=0, transformer_n_heads=8) env = TrainEnvironment(opt) # Making dictionary dictionary = env.dict print("Length of dictionary = " + str(len(dictionary["words"]))) print("Embedding Size = " + str(opt.embeddings_size)) # # env.temp_dict is passed to EmpDataset as dictionary. opt.transformer_dim = 300 opt.transformer_n_heads = 6 opt.model = "transformer" net = create_model(opt, dictionary["words"]) # Initializes TransformerAdapter print(net) # net contains embeddings (dim = len(dictionary),embeddings_size=300) # ctx_transformer # cand_transformer