def generate(args):
    """
    Use the trained model for decoding
    Args
        args (argparse.Namespace): parsed command-line arguments
    """
    if args.cuda and torch.cuda.is_available():
        device = 0
        use_cuda = True
    elif args.cuda and not torch.cuda.is_available():
        print("You do not have CUDA, turning cuda off")
        device = -1
        use_cuda = False
    else:
        device = -1
        use_cuda = False

    # Load the vocab
    vocab = du.load_vocab(args.vocab)
    eos_id = vocab.stoi[EOS_TOK]
    pad_id = vocab.stoi[PAD_TOK]

    if args.ranking:
        # Default is the HARD one, the 'Inverse Narrative Cloze' in the paper
        dataset = du.NarrativeClozeDataset(args.valid_data,
                                           vocab,
                                           src_seq_length=MAX_EVAL_SEQ_LEN,
                                           min_seq_length=MIN_EVAL_SEQ_LEN,
                                           LM=False)
        # Batch size during decoding is set to 1
        batches = BatchIter(dataset,
                            1,
                            sort_key=lambda x: len(x.actual),
                            train=False,
                            device=-1)
    else:
        dataset = du.SentenceDataset(args.valid_data,
                                     vocab,
                                     src_seq_length=MAX_EVAL_SEQ_LEN,
                                     min_seq_length=MIN_EVAL_SEQ_LEN,
                                     add_eos=False)  # put in filter pred later
        # Batch size during decoding is set to 1
        batches = BatchIter(dataset,
                            args.batch_size,
                            sort_key=lambda x: len(x.text),
                            train=False,
                            device=-1)

    data_len = len(dataset)

    # Create the model
    with open(args.load, 'rb') as fi:
        if not use_cuda:
            model = torch.load(fi, map_location=lambda storage, loc: storage)
        else:
            model = torch.load(fi, map_location=torch.device('cuda'))

    if not hasattr(model.latent_root, 'nohier'):
        model.latent_root.set_nohier(args.nohier)  # for backwards compatibility

    model.decoder.eval()
    model.set_use_cuda(use_cuda)

    # For reconstruction
    if args.perplexity:
        loss = calc_perplexity(args, model, batches, vocab, data_len)
        print("Loss = {}".format(loss))
    elif args.schema:
        generate_from_seed(args, model, batches, vocab, data_len)
    elif args.ranking:
        do_ranking(args, model, batches, vocab, data_len, use_cuda)
    else:
        # sample_outputs(model, vocab)
        reconstruct(args, model, batches, vocab)
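# The CPU/GPU branch above relies on torch.load's map_location argument. Below
# is a minimal standalone sketch of that pattern (hypothetical helper, not
# called by the functions in this file; assumes torch is imported at module
# level, as the surrounding code does).
def _load_checkpoint_sketch(path, use_cuda):
    """Load a pickled model, remapping CUDA tensors to CPU when CUDA is unavailable."""
    with open(path, 'rb') as fi:
        if use_cuda:
            return torch.load(fi, map_location=torch.device('cuda'))
        # Remap any CUDA storages in the checkpoint onto CPU
        return torch.load(fi, map_location=lambda storage, loc: storage)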
def classic_train(args):
    """
    Train the model in the ol' fashioned way, just like grandma used to
    Args
        args (argparse.Namespace): parsed command-line arguments
    """
    if args.cuda and torch.cuda.is_available():
        print("Using cuda")
        use_cuda = True
    elif args.cuda and not torch.cuda.is_available():
        print("You do not have CUDA, turning cuda off")
        use_cuda = False
    else:
        use_cuda = False

    # Load the data
    print("\nLoading Vocab")
    vocab = du.load_vocab(args.vocab)
    print("Vocab Loaded, Size {}".format(len(vocab.stoi.keys())))

    if args.use_pretrained:
        pretrained = GloVe(name='6B',
                           dim=args.emb_size,
                           unk_init=torch.Tensor.normal_)
        vocab.load_vectors(pretrained)
        print("Vectors Loaded")

    print("Loading Dataset")
    dataset = du.SentenceDataset(args.train_data,
                                 vocab,
                                 args.src_seq_length,
                                 add_eos=False)  # put in filter pred later
    print("Finished Loading Dataset {} examples".format(len(dataset)))
    batches = BatchIter(dataset,
                        args.batch_size,
                        sort_key=lambda x: len(x.text),
                        train=True,
                        sort_within_batch=True,
                        device=-1)
    data_len = len(dataset)

    if args.load_model:
        print("Loading the Model")
        model = torch.load(args.load_model)
    else:
        print("Creating the Model")
        bidir_mod = 2 if args.bidir else 1
        latents = example_tree(args.num_latent_values,
                               (bidir_mod * args.enc_hid_size, args.latent_dim),
                               use_cuda=use_cuda)  # assume bidirectional
        hidsize = (args.enc_hid_size, args.dec_hid_size)
        model = DAVAE(args.emb_size,
                      hidsize,
                      vocab,
                      latents,
                      layers=args.nlayers,
                      use_cuda=use_cuda,
                      pretrained=args.use_pretrained,
                      dropout=args.dropout)

    # Create the optimizer
    if args.load_opt:
        print("Loading the optimizer state")
        optimizer = torch.load(args.load_opt)
    else:
        print("Creating the optimizer anew")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    start_time = time.time()  # start of epoch 1
    curr_epoch = 1
    valid_loss = [0.0]

    # This will continue on forever (shuffling every epoch) till epochs finished
    for iteration, bl in enumerate(batches):
        batch, batch_lens = bl.text
        target, target_lens = bl.target

        if use_cuda:
            batch = Variable(batch.cuda())
        else:
            batch = Variable(batch)

        model.zero_grad()
        latent_values, latent_root, diff, dec_outputs = model(batch, batch_lens)

        # train set to True so returns total loss
        loss, _ = monolithic_compute_loss(iteration,
                                          model,
                                          target,
                                          target_lens,
                                          latent_values,
                                          latent_root,
                                          diff,
                                          dec_outputs,
                                          use_cuda,
                                          args=args)

        # Backward propagation
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # Optimize
        optimizer.step()

        # End of an epoch (or every validate_after iterations) - run validation
        if ((args.batch_size * iteration) % data_len == 0
                or iteration % args.validate_after == 0) and iteration != 0:
            print("\nFinished Training Epoch/iteration {}/{}".format(
                curr_epoch, iteration))

            # Do validation
            print("Loading Validation Dataset.")
            val_dataset = du.SentenceDataset(args.valid_data,
                                             vocab,
                                             args.src_seq_length,
                                             add_eos=False)  # put in filter pred later
            print("Finished Loading Validation Dataset {} examples.".format(
                len(val_dataset)))
            val_batches = BatchIter(val_dataset,
                                    args.batch_size,
                                    sort_key=lambda x: len(x.text),
                                    train=False,
                                    sort_within_batch=True,
                                    device=-1)
            valid_loss = 0.0
            for v_iteration, bl in enumerate(val_batches):
                batch, batch_lens = bl.text
                target, target_lens = bl.target
                batch_lens = batch_lens.cpu()
                if use_cuda:
                    batch = Variable(batch.cuda(), volatile=True)
                else:
                    batch = Variable(batch, volatile=True)

                latent_values, latent_root, diff, dec_outputs = model(batch, batch_lens)
                # train set to False so returns only CE loss
                loss, ce_loss = monolithic_compute_loss(iteration,
                                                        model,
                                                        target,
                                                        target_lens,
                                                        latent_values,
                                                        latent_root,
                                                        diff,
                                                        dec_outputs,
                                                        use_cuda,
                                                        args=args,
                                                        train=False)
                valid_loss = valid_loss + ce_loss.data.clone()

            valid_loss = valid_loss / (v_iteration + 1)
            print("**Validation loss {:.2f}.**\n".format(valid_loss[0]))

            # Check max epochs and break
            if (args.batch_size * iteration) % data_len == 0:
                curr_epoch += 1
            if curr_epoch > args.epochs:
                print("Max epoch {}-{} reached. Exiting.\n".format(
                    curr_epoch, args.epochs))
                break

        # Save the checkpoint
        if iteration % args.save_after == 0 and iteration != 0:
            print("Saving checkpoint for epoch {} at {}.\n".format(
                curr_epoch, args.save_model))
            # curr_epoch and validation stats appended to the model name
            torch.save(
                model,
                "{}_{}_{}_.epoch_{}.loss_{:.2f}.pt".format(args.save_model,
                                                           args.commit_c,
                                                           args.commit2_c,
                                                           curr_epoch,
                                                           float(valid_loss[0])))
            torch.save(
                optimizer,
                "{}.{}.epoch_{}.loss_{:.2f}.pt".format(args.save_model,
                                                       "optimizer",
                                                       curr_epoch,
                                                       float(valid_loss[0])))
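# The update step above uses the pre-0.4 spelling torch.nn.utils.clip_grad_norm;
# current PyTorch exposes the in-place torch.nn.utils.clip_grad_norm_. A minimal
# sketch of the same backprop / clip / step sequence with the current name
# (hypothetical helper, not called by the training loops in this file):
def _clip_and_step_sketch(model, optimizer, loss, max_norm):
    """Backpropagate, clip gradients in place to max_norm, and apply one optimizer step."""
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()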
def classic_train(args, args_dict, args_info):
    """
    Train the model in the ol' fashioned way, just like grandma used to
    Args
        args (argparse.Namespace): parsed command-line arguments
    """
    if args.cuda and torch.cuda.is_available():
        print("Using cuda")
        use_cuda = True
    elif args.cuda and not torch.cuda.is_available():
        print("You do not have CUDA, turning cuda off")
        use_cuda = False
    else:
        use_cuda = False

    # Load the data
    print("\nLoading Vocab")
    print('args.vocab: ', args.vocab)
    vocab, verb_max_idx = du.load_vocab(args.vocab)
    print("Vocab Loaded, Size {}".format(len(vocab.stoi.keys())))
    print(vocab.itos[:40])
    args_dict["vocab"] = len(vocab.stoi.keys())

    vocab2 = du.load_vocab(args.frame_vocab_address, is_Frame=True)
    print(vocab2.itos[:40])
    print("Frames-Vocab Loaded, Size {}".format(len(vocab2.stoi.keys())))
    total_frames = len(vocab2.stoi.keys())
    args.total_frames = total_frames
    args.num_latent_values = args.total_frames
    print('total frames: ', args.total_frames)

    experiment_name = 'SSDVAE_wotemp_{}_eps_{}_num_{}_seed_{}'.format(
        'chain_event', str(args_dict['obsv_prob']), str(args_dict['exp_num']),
        str(args_dict['seed']))
    experiment_name = '{}_eps_{}_num_{}_seed_{}'.format(
        'chain_event', str(args_dict['obsv_prob']), str(args_dict['exp_num']),
        str(args_dict['seed']))

    if args.use_pretrained:
        pretrained = GloVe(name='6B',
                           dim=args.emb_size,
                           unk_init=torch.Tensor.normal_)
        vocab.load_vectors(pretrained)
        print("Vectors Loaded")

    print("Loading Dataset")
    dataset = du.SentenceDataset(path=args.train_data,
                                 path2=args.train_frames,
                                 vocab=vocab,
                                 vocab2=vocab2,
                                 num_clauses=args.num_clauses,
                                 add_eos=False,
                                 is_ref=True,
                                 obsv_prob=args.obsv_prob)
    print("Finished Loading Dataset {} examples".format(len(dataset)))
    batches = BatchIter(dataset,
                        args.batch_size,
                        sort_key=lambda x: len(x.text),
                        train=True,
                        sort_within_batch=True,
                        device=-1)
    data_len = len(dataset)

    if args.load_model:
        print("Loading the Model")
        model = torch.load(args.load_model)
    else:
        print("Creating the Model")
        bidir_mod = 2 if args.bidir else 1
        latents = example_tree(args.num_latent_values,
                               (bidir_mod * args.enc_hid_size, args.latent_dim),
                               frame_max=args.total_frames,
                               padding_idx=vocab2.stoi['<pad>'],
                               use_cuda=use_cuda,
                               nohier_mode=args.nohier)  # assume bidirectional
        hidsize = (args.enc_hid_size, args.dec_hid_size)
        model = SSDVAE(args.emb_size,
                       hidsize,
                       vocab,
                       latents,
                       layers=args.nlayers,
                       use_cuda=use_cuda,
                       pretrained=args.use_pretrained,
                       dropout=args.dropout,
                       frame_max=args.total_frames,
                       latent_dim=args.latent_dim,
                       verb_max_idx=verb_max_idx)

    # Create the optimizer
    if args.load_opt:
        print("Loading the optimizer state")
        optimizer = torch.load(args.load_opt)
    else:
        print("Creating the optimizer anew")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    start_time = time.time()  # start of epoch 1
    curr_epoch = 1
    valid_loss = [0.0]
    min_ppl = 1e10

    print("Loading Validation Dataset.")
    val_dataset = du.SentenceDataset(path=args.valid_data,
                                     path2=args.valid_frames,
                                     vocab=vocab,
                                     vocab2=vocab2,
                                     num_clauses=args.num_clauses,
                                     add_eos=False,
                                     is_ref=True,
                                     obsv_prob=0.0,
                                     print_valid=True)
    print("Finished Loading Validation Dataset {} examples.".format(
        len(val_dataset)))
    val_batches = BatchIter(val_dataset,
                            args.batch_size,
                            sort_key=lambda x: len(x.text),
                            train=False,
                            sort_within_batch=True,
                            device=-1)

    # Peek at the first validation batch to sanity-check token/frame alignment
    for idx, item in enumerate(val_batches):
        if idx == 0:
            break
    token_rev = [vocab.itos[int(v.numpy())] for v in item.target[0][-1]]
    frame_rev = [vocab2.itos[int(v.numpy())] for v in item.frame[0][-1]]
    ref_frame = [vocab2.itos[int(v.numpy())] for v in item.ref[0][-1]]
    print('token_rev:', token_rev, len(token_rev), "lengths: ", item.target[1][-1])
    print('frame_rev:', frame_rev, len(frame_rev), "lengths: ", item.frame[1][-1])
    print('ref_frame:', ref_frame, len(ref_frame), "lengths: ", item.ref[1][-1])
    print('-' * 50)
    print('Model_named_params:{}'.format(model.named_parameters()))

    # This will continue on forever (shuffling every epoch) till epochs finished
    for iteration, bl in enumerate(batches):
        batch, batch_lens = bl.text
        f_vals, f_vals_lens = bl.frame
        target, target_lens = bl.target
        f_ref, _ = bl.ref

        if use_cuda:
            batch = Variable(batch.cuda())
            f_vals = Variable(f_vals.cuda())
        else:
            batch = Variable(batch)
            f_vals = Variable(f_vals)

        model.zero_grad()
        latent_values, latent_root, diff, dec_outputs = model(batch,
                                                              batch_lens,
                                                              f_vals=f_vals)
        topics_dict, real_sentence, next_frames_dict, word_to_frame = show_inference(
            model, batch, vocab, vocab2, f_vals, f_ref, args)

        loss, _ = monolithic_compute_loss(iteration,
                                          model,
                                          target,
                                          target_lens,
                                          latent_values,
                                          latent_root,
                                          diff,
                                          dec_outputs,
                                          use_cuda,
                                          args=args,
                                          topics_dict=topics_dict,
                                          real_sentence=real_sentence,
                                          next_frames_dict=next_frames_dict,
                                          word_to_frame=word_to_frame,
                                          train=True,
                                          show=True)

        # Backward propagation
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # Optimize
        optimizer.step()

        # Run validation every 10 iterations
        if iteration % 10 == 0:
            print("\nFinished Training Epoch/iteration {}/{}".format(
                curr_epoch, iteration))

            # Do validation
            valid_logprobs = 0.0
            valid_lengths = 0.0
            valid_loss = 0.0
            with torch.no_grad():
                for v_iteration, bl in enumerate(val_batches):
                    batch, batch_lens = bl.text
                    f_vals, f_vals_lens = bl.frame
                    target, target_lens = bl.target
                    f_ref, _ = bl.ref
                    batch_lens = batch_lens.cpu()
                    if use_cuda:
                        batch = Variable(batch.cuda())
                        f_vals = Variable(f_vals.cuda())
                    else:
                        batch = Variable(batch)
                        f_vals = Variable(f_vals)

                    latent_values, latent_root, diff, dec_outputs = model(
                        batch, batch_lens, f_vals=f_vals)
                    topics_dict, real_sentence, next_frames_dict, word_to_frame = show_inference(
                        model, batch, vocab, vocab2, f_vals, f_ref, args)

                    # train set to False so returns only CE loss
                    loss, ce_loss = monolithic_compute_loss(
                        iteration,
                        model,
                        target,
                        target_lens,
                        latent_values,
                        latent_root,
                        diff,
                        dec_outputs,
                        use_cuda,
                        args=args,
                        topics_dict=topics_dict,
                        real_sentence=real_sentence,
                        next_frames_dict=next_frames_dict,
                        word_to_frame=word_to_frame,
                        train=False,
                        show=False)

                    valid_loss = valid_loss + ce_loss.data.clone()
                    valid_logprobs += ce_loss.data.clone().cpu().numpy() * target_lens.sum().cpu().data.numpy()
                    valid_lengths += target_lens.sum().cpu().data.numpy()
                    # print("valid_lengths: ", valid_lengths[0])

            nll = valid_logprobs / valid_lengths
            ppl = np.exp(nll)
            valid_loss = valid_loss / (v_iteration + 1)
            print("**Validation loss {:.2f}.**\n".format(valid_loss[0]))
            print("**Validation NLL {:.2f}.**\n".format(nll))
            print("**Validation PPL {:.2f}.**\n".format(ppl))
            args_dict_wandb = {
                "val_nll": nll,
                "val_ppl": ppl,
                "valid_loss": valid_loss
            }

            # Keep the checkpoint with the lowest validation perplexity so far
            if ppl < min_ppl:
                min_ppl = ppl
                args_dict["min_ppl"] = min_ppl
                dir_path = os.path.dirname(os.path.realpath(__file__))
                save_file = "".join([
                    "_" + str(key) + "_" + str(value)
                    for key, value in args_dict.items() if key != "min_ppl"
                ])
                args_to_md(model="chain", args_dict=args_dict)
                model_path = os.path.join(dir_path + "/saved_models/chain_" +
                                          save_file + ".pt")
                torch.save(model, model_path)
                config_path = os.path.join(dir_path + "/saved_configs/chain_" +
                                           save_file + ".pkl")
                with open(config_path, "wb") as f:
                    pickle.dump((args_dict, args_info), f)
            print('\t==> min_ppl {:4.4f} '.format(min_ppl))
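# The validation block above length-weights the per-batch cross-entropy before
# exponentiating, i.e. PPL = exp(sum_b ce_b * n_b / sum_b n_b). A minimal
# standalone sketch of that arithmetic (hypothetical helper, not called by the
# training loop; plain Python floats instead of tensors):
def _corpus_perplexity_sketch(batch_ce, batch_tokens):
    """batch_ce: mean cross-entropy per token for each batch;
    batch_tokens: number of target tokens in each batch."""
    import math
    total_logprob = sum(ce * n for ce, n in zip(batch_ce, batch_tokens))
    nll = total_logprob / sum(batch_tokens)
    return math.exp(nll)
# e.g. _corpus_perplexity_sketch([2.0, 3.0], [10, 10]) == math.exp(2.5)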
def generate(args):
    """
    Use the trained model for decoding
    Args
        args (argparse.Namespace): parsed command-line arguments
    """
    if args.cuda and torch.cuda.is_available():
        device = 0
        use_cuda = True
    elif args.cuda and not torch.cuda.is_available():
        print("You do not have CUDA, turning cuda off")
        device = -1
        use_cuda = False
    else:
        device = -1
        use_cuda = False

    # Load the vocab
    # vocab = du.load_vocab(args.vocab)
    vocab, _ = du.load_vocab(args.vocab)
    vocab2 = du.load_vocab(args.frame_vocab_address, is_Frame=True)
    eos_id = vocab.stoi[EOS_TOK]
    pad_id = vocab.stoi[PAD_TOK]

    if args.ranking:
        # Default is the HARD one, the 'Inverse Narrative Cloze' in the paper
        dataset = du.NarrativeClozeDataset(args.valid_narr,
                                           vocab,
                                           src_seq_length=MAX_EVAL_SEQ_LEN,
                                           min_seq_length=MIN_EVAL_SEQ_LEN,
                                           LM=False)
        print('ranking_dataset: ', len(dataset))
        # Batch size during decoding is set to 1
        batches = BatchIter(dataset,
                            1,
                            sort_key=lambda x: len(x.actual),
                            train=False,
                            device=-1)
    else:
        # dataset = du.SentenceDataset(args.valid_data, vocab, src_seq_length=MAX_EVAL_SEQ_LEN, min_seq_length=MIN_EVAL_SEQ_LEN, add_eos=False) #put in filter pred later
        dataset = du.SentenceDataset(path=args.valid_data,
                                     path2=args.valid_frames,
                                     vocab=vocab,
                                     vocab2=vocab2,
                                     num_clauses=args.num_clauses,
                                     add_eos=False,
                                     is_ref=True,
                                     obsv_prob=0.0,
                                     print_valid=True)
        # Batch size during decoding is set to 1
        batches = BatchIter(dataset,
                            args.batch_size,
                            sort_key=lambda x: len(x.text),
                            train=False,
                            device=-1)

    data_len = len(dataset)

    # Create the model
    with open(args.load, 'rb') as fi:
        if not use_cuda:
            model = torch.load(fi, map_location=lambda storage, loc: storage)
        else:
            model = torch.load(fi, map_location=torch.device('cuda'))

    if not hasattr(model.latent_root, 'nohier'):
        model.latent_root.set_nohier(args.nohier)  # for backwards compatibility

    model.decoder.eval()
    model.set_use_cuda(use_cuda)

    # For reconstruction
    if args.perplexity:
        print('calculating perplexity')
        loss = calc_perplexity(args, model, batches, vocab, data_len)
        NLL = loss
        PPL = np.exp(loss)
        print("Chain-NLL = {}".format(NLL))
        print("Chain-PPL = {}".format(PPL))
        return PPL
    elif args.schema:
        generate_from_seed(args, model, batches, vocab, data_len)
    elif args.ranking:
        ranked_acc = do_ranking(args, model, batches, vocab, data_len, use_cuda)
        return ranked_acc
    else:
        # sample_outputs(model, vocab)
        reconstruct(args, model, batches, vocab)
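# Hypothetical command-line wiring for the decoding path, sketched from the
# args.* attributes read by generate() above. The flag names mirror those
# attributes, but the defaults and the repo's actual entry point are not shown
# in this section, so treat this as an assumption rather than the original CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Decode with a trained model")
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--vocab', type=str)
    parser.add_argument('--frame_vocab_address', type=str)
    parser.add_argument('--load', type=str, help="path to a saved model checkpoint")
    parser.add_argument('--valid_data', type=str)
    parser.add_argument('--valid_frames', type=str)
    parser.add_argument('--valid_narr', type=str)
    parser.add_argument('--num_clauses', type=int)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--nohier', action='store_true')
    parser.add_argument('--perplexity', action='store_true')
    parser.add_argument('--schema', action='store_true')
    parser.add_argument('--ranking', action='store_true')
    # Downstream helpers (calc_perplexity, do_ranking, generate_from_seed, ...)
    # may read further flags not listed here.
    generate(parser.parse_args())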