# Shared imports assumed by every train-script variant in this section;
# PTB, SentenceVAE, to_var, idx2word and expierment_name come from the
# repo's local modules (tensorboardX assumed for SummaryWriter).
import os
import json
import time
from collections import OrderedDict, defaultdict
from multiprocessing import cpu_count

import numpy as np
import torch
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter


# Variant: Sentence-VAE on PTB, with a per-word NLL average and a fixed
# ELBO tracker (the scalar loss is wrapped in a 1-D tensor before torch.cat).
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid']  # + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ
        )

    model = SentenceVAE(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    if torch.cuda.is_available():
        model = model.cuda()

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join('./', args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join('./', args.save_model_path, 'VAE', ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood (summed over the batch) and its per-word average
        NLL_loss = NLL(logp, target)
        NLL_w_avg = NLL_loss / torch.sum(length).float()

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight, NLL_w_avg

    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(
                dataset=datasets[split],
                batch_size=args.batch_size,
                shuffle=split == 'train',
                num_workers=cpu_count(),
                pin_memory=torch.cuda.is_available()
            )

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight, NLL_w_avg = loss_fn(
                    logp, batch['target'], batch['length'], mean, logv,
                    args.anneal_function, step, args.k, args.x0)
                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping: wrap the scalar loss in a 1-D tensor on the
                # tracker's device, so torch.cat does not fail
                loss_data = tensor([loss.item()])
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss_data))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, "
                          "KL-Weight %6.3f, NLL-word-Loss %9.4f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size,
                             KL_weight, NLL_w_avg))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('./dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('./dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint every 10 epochs
            if split == 'train' and epoch % 10 == 0:
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
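
# --- Illustration (not part of the training scripts): how the two KL-weight
# annealing schedules above behave. k=0.0025 and x0=2500 are assumed example
# values; in the scripts they come from args.k and args.x0.
def _show_kl_anneal_schedules():
    import numpy as np

    def kl_anneal(anneal_function, step, k=0.0025, x0=2500):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    for step in (0, 1250, 2500, 3750, 5000):
        print(step,
              round(kl_anneal('logistic', step), 3),  # 0.002, 0.042, 0.5, 0.958, 0.998
              round(kl_anneal('linear', step), 3))    # 0.0,   0.5,   1,   1,     1
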
# Variant: Sentence-VAE on the Hugging Face yelp_polarity dataset with a
# pretrained Transformer encoder. Extra imports assumed for this variant:
# warnings, `from datasets import load_dataset`, and
# `from transformers import AutoTokenizer, AutoModel`.
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    # carve a 1k validation and 1k test set out of the yelp_polarity train split
    RANDOM_SEED = 42
    dataset = load_dataset("yelp_polarity", split="train")
    TRAIN_SIZE = len(dataset) - 2_000
    VALID_SIZE = 1_000
    TEST_SIZE = 1_000
    train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE, seed=RANDOM_SEED)
    train_dataset = train_test_split["train"]
    test_val_dataset = train_test_split["test"].train_test_split(
        train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
    val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset["test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    datasets = OrderedDict()
    datasets['train'] = TextDataset(train_dataset, tokenizer, args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    datasets['valid'] = TextDataset(val_dataset, tokenizer, args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    if args.test:
        datasets['test'] = TextDataset(test_dataset, tokenizer, args.max_sequence_length,
                                       not args.disable_sent_tokenize)

    print(f"Loading {args.model_name} model. Setting {args.trainable_layers} trainable layers.")
    encoder = AutoModel.from_pretrained(args.model_name, return_dict=True)
    if not args.train_embeddings:
        for p in encoder.embeddings.parameters():
            p.requires_grad = False
    encoder_layers = encoder.encoder.layer
    if args.trainable_layers > len(encoder_layers):
        warnings.warn(f"You are asking to train {args.trainable_layers} layers, "
                      f"but this model has only {len(encoder_layers)}")
    # freeze everything but the last `args.trainable_layers` encoder layers
    for layer in range(len(encoder_layers) - args.trainable_layers):
        for p in encoder_layers[layer].parameters():
            p.requires_grad = False

    params = dict(vocab_size=datasets['train'].vocab_size,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional,
                  max_sequence_length=args.max_sequence_length)
    model = SentenceVAE(encoder=encoder, tokenizer=tokenizer, **params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)
    with open(os.path.join(save_model_path, 'train_args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        # hold the weight at its initial value for the first x0 steps
        if step <= x0:
            return args.initial_kl_weight
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0 - 2500))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx, reduction='sum')

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    # two parameter groups: a separate learning rate for the pretrained
    # encoder, the default one for the freshly initialised VAE modules
    params = [{
        'params': model.encoder.parameters(),
        'lr': args.encoder_learning_rate
    }, {
        'params': [
            *model.decoder_rnn.parameters(),
            *model.hidden2mean.parameters(),
            *model.hidden2logv.parameters(),
            *model.latent2hidden.parameters(),
            *model.outputs2vocab.parameters()
        ]
    }]
    optimizer = torch.optim.Adam(params, lr=args.learning_rate, weight_decay=args.weight_decay)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=(split == 'train'),
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available(),
                                     collate_fn=DataCollator(tokenizer))

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['attention_mask'],
                                            batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean, logv,
                                                       args.anneal_function, step,
                                                       args.k, args.x0)
                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].tolist(),
                                                        tokenizer=tokenizer)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences, the encoded latent space and generated sequences
            if split == 'valid':
                samples, _ = model.inference(z=tracker['z'])
                generated_sents = idx2word(samples.tolist(), tokenizer)
                sents = [{'original': target, 'generated': generated}
                         for target, generated in zip(tracker['target_sents'], generated_sents)]
                dump = {'sentences': sents, 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file, indent=3)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
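
# --- Illustration (not part of the training scripts): the freeze-all-but-
# the-last-N-layers pattern used above, shown on a standalone BERT encoder.
# "bert-base-uncased" and trainable=2 are assumed example values.
def _show_layer_freezing():
    from transformers import AutoModel

    encoder = AutoModel.from_pretrained("bert-base-uncased", return_dict=True)
    trainable = 2
    layers = encoder.encoder.layer
    for layer in layers[:len(layers) - trainable]:
        for p in layer.parameters():
            p.requires_grad = False

    n_total = sum(p.numel() for p in encoder.parameters())
    n_train = sum(p.numel() for p in encoder.parameters() if p.requires_grad)
    print("trainable: %d/%d parameters" % (n_train, n_total))
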
# Variant: Sentence-VAE on PTB with extra annealing schedules ("softplus",
# "no"), the KL weight pinned to 1 for evaluation, patience-based early
# stopping on validation ELBO, and plain-text result/graph logs.
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    log_file = open("res.txt", "a")
    log_file.write(expierment_name(args, ts))
    log_file.write("\n")
    graph_file = open("elbo-graph.txt", "a")
    graph_file.write(expierment_name(args, ts))
    graph_file.write("\n")

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)
        elif anneal_function == "softplus":
            return min(1, np.log(1 + np.exp(k * step)))
        elif anneal_function == "no":
            return 1

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0

    # early-stopping state: stop after 3 epochs without a new validation minimum
    val_lowest_elbo = 5000
    val_accu_epoch = 0
    val_min_epoch = 0
    split_elbo = {"train": [], "valid": []}
    if args.test:
        split_elbo["test"] = []
    split_loss = {"train": [], "valid": []}
    if args.test:
        split_loss["test"] = []

    def dump_results(include_test=False):
        # summary of the ELBO at the best validation epoch
        exp_str = ""
        exp_str += "train_ELBO={}\n".format(split_elbo["train"][val_min_epoch])
        exp_str += "valid_ELBO={}\n".format(split_elbo["valid"][val_min_epoch])
        if include_test:
            exp_str += "test_ELBO={}\n".format(split_elbo["test"][val_min_epoch])
        exp_str += "==========\n"
        log_file.write(exp_str)
        log_file.close()
        print(exp_str)

    def dump_graphs(include_epoch_elbo=False):
        # loss curves as comma-separated rows, one row per split
        for name, idx in (("ELBO", 0), ("NLL", 1), ("KL", 2)):
            graph_file.write(name + "\n")
            line = ""
            for s in splits:
                for i in split_loss[s]:
                    line += "{},".format(i[idx])
                line += "\n"
            if name == "ELBO" and include_epoch_elbo:
                for s in splits:
                    for i in split_elbo[s]:
                        line += "{},".format(i[0])
                    line += "\n"
            graph_file.write(line)
        graph_file.close()

    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean, logv,
                                                       args.anneal_function, step,
                                                       args.k, args.x0)
                # evaluate with the full (unannealed) KL term
                if split != 'train':
                    KL_weight = 1.0
                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size, KL_weight))
                    split_loss[split].append([loss.item(),
                                              NLL_loss.item() / batch_size,
                                              KL_loss.item() / batch_size])

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))
            split_elbo[split].append([torch.mean(tracker["ELBO"]).item()])

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

            # early stopping: track the best validation epoch, stop after
            # three epochs without improvement
            if split == 'valid':
                if torch.mean(tracker['ELBO']).item() < val_lowest_elbo:
                    val_lowest_elbo = torch.mean(tracker['ELBO']).item()
                    val_accu_epoch = 0
                    val_min_epoch = epoch
                else:
                    val_accu_epoch += 1
                if val_accu_epoch >= 3:
                    if not args.test:
                        dump_results()
                        dump_graphs()
                        exit()
            elif split == 'test' and val_accu_epoch >= 3:
                dump_results(include_test=True)
                dump_graphs(include_epoch_elbo=True)
                exit()

        if epoch == args.epochs - 1:
            dump_results(include_test=args.test)
            dump_graphs()
            exit()
# Variant: joint multimodal VAE (JMVAE) on the E2E data, pairing a sentence
# decoder (NLL) with a label decoder (BCE); two extra KL terms tie the joint
# posterior to each unimodal posterior.
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid']

    if args.tensorboard_logging:
        print('Tensorboard logging on')

    w_datasets, y_datasets = load_e2e(args.create_data, args.max_sequence_length, args.min_occ)
    print('datasets loaded')

    label_sequence_len = y_datasets[splits[0]].shape[1]
    print('label sequence length:', label_sequence_len)
    print('label shape:', y_datasets['train'].shape)

    model = SentenceJMVAE(vocab_size=w_datasets['train'].vocab_size,
                          sos_idx=w_datasets['train'].sos_idx,
                          eos_idx=w_datasets['train'].eos_idx,
                          pad_idx=w_datasets['train'].pad_idx,
                          max_sequence_length=args.max_sequence_length,
                          embedding_size=args.embedding_size,
                          rnn_type=args.rnn_type,
                          hidden_size=args.hidden_size,
                          word_dropout=args.word_dropout,
                          latent_size=args.latent_size,
                          num_layers=args.num_layers,
                          label_sequence_len=label_sequence_len,
                          bidirectional=args.bidirectional)
    print('model created')

    if torch.cuda.is_available():
        model = model.cuda()

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join('./', args.logdir, 'JMVAE', expierment_name(args, ts)))
        writer.add_text("model_jmvae", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join('./', args.save_model_path, 'JMVAE', ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=w_datasets['train'].pad_idx)
    BCE = torch.nn.BCELoss(reduction='sum')

    def loss_fn_plus(logp, logp2, target, target2, length, mean, logv,
                     mean_w, logv_w, mean_y, logv_y, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood for the sentence, plus a per-word average
        NLL_loss = NLL(logp, target)
        NLL_w_avg = NLL_loss / torch.sum(length).float()

        # Binary cross-entropy for the labels
        BCE_loss = BCE(logp2, target2)

        # KL of the joint posterior against the prior
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())

        # KL( joint posterior || sentence-only posterior ), closed form for
        # diagonal Gaussians, computed per example and summed
        KL_loss_w = [
            0.5 * ((sigma0.exp() / sigma1.exp()).sum()
                   + torch.sum(((mu1 - mu0) ** 2) * (1 / torch.exp(sigma1)))
                   - mu0.size(0) + sigma1.sum() - sigma0.sum())
            for mu0, sigma0, mu1, sigma1 in zip(mean, logv, mean_w, logv_w)
        ]
        KL_loss_w = sum(KL_loss_w)  # / len(KL_loss_w)

        # KL( joint posterior || label-only posterior )
        KL_loss_y = [
            0.5 * ((sigma0.exp() / sigma1.exp()).sum()
                   + torch.sum(((mu1 - mu0) ** 2) * (1 / torch.exp(sigma1)))
                   - mu0.size(0) + sigma1.sum() - sigma0.sum())
            for mu0, sigma0, mu1, sigma1 in zip(mean, logv, mean_y, logv_y)
        ]
        KL_loss_y = sum(KL_loss_y)  # / len(KL_loss_y)

        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, BCE_loss, KL_loss, KL_loss_w, KL_loss_y, KL_weight, NLL_w_avg

    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    print('starting training')
    for epoch in range(args.epochs):

        for split in splits:

            print('split: ', split, '\tepoch: ', epoch)

            # labels come along with each batch, so only w_datasets is loaded here
            data_loader = DataLoader(dataset=w_datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)
                batch['labels'] = batch['labels'].float()
                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, logp2, mean, logv, z, mean_w, logv_w, mean_y, logv_y = model(
                    batch['input'], batch['labels'], batch['length'])

                # loss calculation
                NLL_loss, BCE_loss, KL_loss, KL_loss_w, KL_loss_y, KL_weight, NLL_w_avg = loss_fn_plus(
                    logp, logp2, batch['target'], batch['labels'], batch['length'],
                    mean, logv, mean_w, logv_w, mean_y, logv_y,
                    args.anneal_function, step, args.k, args.x0)

                # NOTE: KL_loss_w and KL_loss_y could be weighted by their dimensionality
                loss = (NLL_loss + args.bce_weight * BCE_loss
                        + KL_weight * (KL_loss + args.alpha * (KL_loss_w + KL_loss_y))) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping: wrap the scalar loss in a 1-D tensor on the
                # right device, so torch.cat does not fail
                loss_data = (torch.cuda.FloatTensor([loss.item()])
                             if torch.cuda.is_available() else torch.tensor([loss.item()]))
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss_data))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/BCE Loss" % split.upper(), BCE_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss-w" % split.upper(), KL_loss_w.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss-y" % split.upper(), KL_loss_y.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, BCE-Loss %9.4f, "
                          "KL-Loss-joint %9.4f, KL-Loss-w %9.4f, KL-Loss-y %9.4f, "
                          "KL-Weight %6.3f, NLL-word-Loss %9.4f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, BCE_loss.item() / batch_size,
                             KL_loss.item() / batch_size, KL_loss_w.item() / batch_size,
                             KL_loss_y.item() / batch_size, KL_weight, NLL_w_avg.item()))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=w_datasets['train'].get_i2w(),
                                                        pad_idx=w_datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('./dumps', ts)):
                    os.makedirs('./dumps/' + ts)
                with open(os.path.join('./dumps/' + ts + '/valid_E%i.json' % epoch), 'w+') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint every 10 epochs
            if split == 'train' and epoch % 10 == 0:
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
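
# --- Illustration (not part of the training scripts): the list comprehension
# in loss_fn_plus is the closed-form KL( N(mu0, e^logv0) || N(mu1, e^logv1) )
# for diagonal Gaussians. A quick check against torch.distributions:
def _check_gaussian_kl():
    import torch
    from torch.distributions import Normal, kl_divergence

    torch.manual_seed(0)
    d = 16
    mu0, logv0 = torch.randn(d), torch.randn(d)
    mu1, logv1 = torch.randn(d), torch.randn(d)

    closed_form = 0.5 * ((logv0.exp() / logv1.exp()).sum()
                         + (((mu1 - mu0) ** 2) / logv1.exp()).sum()
                         - d + logv1.sum() - logv0.sum())
    # Normal takes a standard deviation, i.e. exp(0.5 * logvar)
    reference = kl_divergence(Normal(mu0, (0.5 * logv0).exp()),
                              Normal(mu1, (0.5 * logv1).exp())).sum()
    print(torch.allclose(closed_form, reference))  # True
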
# Variant: a commented walk-through of the baseline PTB training script,
# with model params saved to JSON and a checkpoint after every epoch.
def main(args):

    # create dir name
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    ts = ts.replace(':', '-')

    # prepare dataset splits
    splits = ['train', 'valid'] + (['test'] if args.test else [])

    # create dataset objects; this also preprocesses the data
    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    # get training params
    params = dict(vocab_size=datasets['train'].vocab_size,
                  sos_idx=datasets['train'].sos_idx,
                  eos_idx=datasets['train'].eos_idx,
                  pad_idx=datasets['train'].pad_idx,
                  unk_idx=datasets['train'].unk_idx,
                  max_sequence_length=args.max_sequence_length,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional)

    # init model object
    model = SentenceVAE(**params)

    if torch.cuda.is_available():
        model = model.cuda()

    # logging
    print(model)
    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    # make dir
    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    # write params to json and save
    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)

    # returns the disentangling weight applied to the KL loss at each step
    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    # NLL loss measures the accuracy of the decoding
    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx, reduction='sum')

    # this function computes the two loss terms and the KL loss weight
    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        # do train and then test
        for split in splits:

            # create dataloader
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            # tracker used to track the loss
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            # start batch-wise training/testing
            for iteration, batch in enumerate(data_loader):

                # get batch size
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean, logv,
                                                       args.anneal_function, step,
                                                       args.k, args.x0)

                # final loss calculation
                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()  # flush gradients
                    loss.backward()        # run backprop
                    optimizer.step()       # run a gradient-descent step
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                # logging of losses
                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            # more logging
            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
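
# --- Illustration (not part of the training scripts): loss_fn above on dummy
# tensors. The sizes (batch 2, 4 timesteps, vocab 10, latent 4) and pad_idx=0
# are assumed. In the script, logp already has max(length) timesteps; here
# both logp and target are truncated to keep the shapes aligned.
def _show_loss_fn():
    import torch

    torch.manual_seed(0)
    batch, seq, vocab, latent = 2, 4, 10, 4
    logp = torch.log_softmax(torch.randn(batch, seq, vocab), dim=-1)
    target = torch.randint(1, vocab, (batch, seq))
    target[1, 2:] = 0                      # pad out positions past the length
    length = torch.tensor([3, 2])
    mean, logv = torch.randn(batch, latent), torch.randn(batch, latent)

    NLL = torch.nn.NLLLoss(ignore_index=0, reduction='sum')
    max_len = torch.max(length).item()
    t = target[:, :max_len].contiguous().view(-1)
    lp = logp[:, :max_len].contiguous().view(-1, vocab)

    NLL_loss = NLL(lp, t)
    KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
    print((NLL_loss + 1.0 * KL_loss) / batch)  # the loss at KL weight 1
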
# Variant: multitask/disentanglement VAE (style vs. content) over SNLI, Yelp,
# or both, with optional orthogonality, self-attention, hidden-space
# classifier, and diversity losses. Extra imports assumed for this variant:
# `import torch.nn as nn` and `import torch.nn.functional as F`.
def main(args):

    ################ config your params here ########################
    # ortho = False
    # attention = False
    # hspace_classifier = False
    # diversity = False  # do not try this yet, need to fix bugs

    # create dir name
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    ts = ts.replace(':', '-')
    ts = ts + '-' + args.dataset
    if args.ortho:
        ts = ts + '-ortho'
    if args.hspace:
        ts = ts + '-hspace'
    if args.attention:
        ts = ts + '-self-attn'

    if args.dataset == "multitask":
        print("Running multitask dataset!")
        vae_model = SentenceVaeMultiTask
        dataset = SnliYelp
    if args.dataset == "snli":
        print("Running SNLI!")
        vae_model = SentenceVaeSnli
        dataset = SNLI
    if args.dataset == "yelp":
        print("Running Yelp!")
        vae_model = SentenceVaeYelp
        dataset = Yelpd

    # prepare dataset splits
    splits = ['train', 'test']

    # create dataset objects; this also preprocesses the data
    datasets = OrderedDict()
    for split in splits:
        print("creating dataset for: {}".format(split))
        datasets[split] = dataset(split=split,
                                  create_data=args.create_data,
                                  min_occ=args.min_occ)
    i2w = datasets['train'].get_i2w()
    w2i = datasets['train'].get_w2i()

    # get training params
    params = dict(vocab_size=datasets['train'].vocab_size,
                  sos_idx=datasets['train'].sos_idx,
                  eos_idx=datasets['train'].eos_idx,
                  pad_idx=datasets['train'].pad_idx,
                  unk_idx=datasets['train'].unk_idx,
                  max_sequence_length=datasets['train'].max_sequence_length,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional,
                  ortho=args.ortho,
                  attention=args.attention,
                  hspace_classifier=args.hspace,
                  diversity=args.diversity)

    # init model object
    model = vae_model(**params)

    if torch.cuda.is_available():
        model = model.cuda()

    # logging
    print(model)
    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    # make dir
    save_model_path = os.path.join(datasets["train"].save_model_path, ts)
    os.makedirs(save_model_path)

    # write params to json and save
    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)

    # returns the disentangling weight applied to the KL loss at each step
    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    # NLL loss measures the accuracy of the decoding
    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx, reduction='sum')
    loss_fn_2 = F.cross_entropy  # kept from the original; unused below

    # this function computes the two ELBO terms and the KL loss weight
    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :datasets["train"].max_sequence_length].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0

    overall_losses = defaultdict(dict)
    loss_at_epoch = {
        'nll_loss': 0.0,
        'kl_loss': 0.0,
        'style_loss': 0.0,
        'content_loss': 0.0,
        'diversity_loss': 0.0,
        'hspace_loss': 0.0,
        'nll_loss_test': 0.0,
        'kl_loss_test': 0.0,
        'style_loss_test': 0.0,
        'content_loss_test': 0.0,
        'diversity_loss_test': 0.0,
        'hspace_loss_test': 0.0
    }

    for epoch in range(args.epochs):

        # do train and then test
        for split in splits:

            # create dataloader
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            # tracker used to track the loss
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            # start batch-wise training/testing
            for iteration, batch in enumerate(data_loader):

                # get batch size
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # sample inspection, kept from the original:
                # print(idx2word(batch['target'][0:1], i2w=i2w, pad_idx=w2i['<pad>']))
                # print(batch['label'][0])
                # print("neg: {}, pos: {}".format(style_preds[0:1, 0], style_preds[0:1, 1]))

                # Forward pass
                (logp, final_mean, final_logv, final_z, style_preds,
                 content_preds, hspace_preds, diversity_loss) = model(
                    batch['input'], batch['length'], batch['label'], batch['bow'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], final_mean,
                                                       final_logv, args.anneal_function,
                                                       step, args.k, args.x0)
                # style and content classification losses
                style_loss = nn.MSELoss()(style_preds,
                                          batch['label'].type(torch.FloatTensor).cuda())
                content_loss = nn.MSELoss()(content_preds,
                                            batch['bow'].type(torch.FloatTensor).cuda())
                if hspace_preds is None:
                    hspace_classifier_loss = 0
                else:
                    hspace_classifier_loss = nn.MSELoss()(
                        hspace_preds, batch['label'].type(torch.FloatTensor).cuda())

                # final loss calculation
                loss = (NLL_loss + KL_weight * KL_loss) / batch_size \
                    + 1000 * style_loss + 1000 * content_loss
                # loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()  # flush gradients
                    if args.diversity:
                        loss.backward(retain_graph=True)  # keep graph for the second backward
                        diversity_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()  # run a gradient-descent step
                    step += 1

                overall_losses[len(overall_losses)] = loss_at_epoch

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                # logging of losses
                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, "
                          "KL-Weight %6.3f, Style-Loss %9.4f, Content-Loss %9.4f, "
                          "Hspace-Loss %9.4f, Diversity-Loss %9.4f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size, KL_weight,
                             style_loss, content_loss, hspace_classifier_loss, diversity_loss))

                # note: splits holds only 'train' and 'test', so this branch is inert here
                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], final_z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            # more logging
            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

            # update losses log
            if split == "train":
                loss_at_epoch['nll_loss'] = float(NLL_loss / args.batch_size)
                loss_at_epoch['kl_loss'] = float(KL_loss)
                loss_at_epoch['style_loss'] = float(style_loss)
                loss_at_epoch['content_loss'] = float(content_loss)
                loss_at_epoch['diversity_loss'] = float(diversity_loss)
                loss_at_epoch['hspace_loss'] = float(hspace_classifier_loss)
            else:
                loss_at_epoch['nll_loss_test'] = float(NLL_loss / args.batch_size)
                loss_at_epoch['kl_loss_test'] = float(KL_loss)
                loss_at_epoch['style_loss_test'] = float(style_loss)
                loss_at_epoch['content_loss_test'] = float(content_loss)
                loss_at_epoch['diversity_loss_test'] = float(diversity_loss)
                loss_at_epoch['hspace_loss_test'] = float(hspace_classifier_loss)

            # write losses to json
            with open(os.path.join(save_model_path, 'losses.json'), 'w') as f:
                json.dump(overall_losses, f, indent=4)
# Variant: Sentence-VAE on PTB with an auxiliary perplexity loss. After epoch
# 10, random pairs of latent codes from each batch are interpolated, decoded,
# and scored by an external language model (Semantic_Loss); the average
# perplexity joins the loss with its own annealed weight. Extra imports
# assumed for this variant: itertools and random.
def main(args):

    # Load the vocab
    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)
    w2i, i2w = vocab['w2i'], vocab['i2w']

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    # Initialize semantic loss
    sl = Semantic_Loss()

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ
        )

    params = dict(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        unk_idx=datasets['train'].unk_idx,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )
    model = SentenceVAE(**params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    def perplexity_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx, reduction='sum')

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0,
                batch_perplexity, perplexity_anneal_function):

        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        # Perplexity
        perp_loss = batch_perplexity
        perp_weight = perplexity_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        # Keep track of epoch loss
        epoch_loss = []

        for split in splits:

            data_loader = DataLoader(
                dataset=datasets[split],
                batch_size=args.batch_size,
                shuffle=split == 'train',
                num_workers=cpu_count(),
                pin_memory=torch.cuda.is_available()
            )

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            batch_t_start = None
            for iteration, batch in enumerate(data_loader):

                if batch_t_start:
                    batch_run_time = time.time() - batch_t_start
                    # print("Batch run time: " + str(batch_run_time))
                batch_t_start = time.time()

                batch_size = batch['input_sequence'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Get the original sentences in this batch, stripping the <sos> tag
                batch_sentences = idx2word(batch['input_sequence'], i2w=i2w, pad_idx=w2i['<pad>'])
                batch_sentences = [x.replace("<sos>", "") for x in batch_sentences]

                # Forward pass
                (logp, mean, logv, z), states = model(**batch)

                # Choose some random pairs of samples within the batch
                # to get latent representations for
                batch_index_pairs = list(itertools.combinations(np.arange(batch_size), 2))
                random.shuffle(batch_index_pairs)
                batch_index_pairs = batch_index_pairs[:args.perplexity_samples_per_batch]

                batch_perplexity = []

                # Only add the perplexity term after epoch 10
                start_perplexity = epoch > 10

                if start_perplexity and args.perplexity_loss:

                    # For each pair, get the intermediate representations in the latent space
                    for index_pair in batch_index_pairs:

                        with torch.no_grad():
                            z1_hidden = states['z'][index_pair[0]].cpu()
                            z2_hidden = states['z'][index_pair[1]].cpu()
                            z_hidden = to_var(torch.from_numpy(
                                interpolate(start=z1_hidden, end=z2_hidden, steps=1)).float())

                        if args.rnn_type == "lstm":
                            with torch.no_grad():
                                z1_cell_state = states['z_cell_state'].cpu().squeeze()[index_pair[0]]
                                z2_cell_state = states['z_cell_state'].cpu().squeeze()[index_pair[1]]
                                z_cell_states = to_var(torch.from_numpy(
                                    interpolate(start=z1_cell_state, end=z2_cell_state,
                                                steps=1)).float())
                            samples, _ = model.inference(z=z_hidden, z_cell_state=z_cell_states)
                        else:
                            samples, _ = model.inference(z=z_hidden, z_cell_state=None)

                        # Check interpolated sentences
                        interpolated_sentences = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>'])

                        # For each sentence, get the perplexity
                        perplexities = []
                        for sentence in interpolated_sentences:
                            perplexities.append(sl.get_perplexity(sentence))
                        avg_sample_perplexity = sum(perplexities) / len(perplexities)
                        batch_perplexity.append(avg_sample_perplexity)

                    # Calculate batch perplexity
                    avg_batch_perplexity = sum(batch_perplexity) / len(batch_perplexity)

                    # loss calculation
                    NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight = loss_fn(
                        logp, batch['target'], batch['length'], mean, logv,
                        args.anneal_function, step, args.k, args.x0,
                        avg_batch_perplexity, perplexity_anneal_function)
                    loss = ((NLL_loss + KL_weight * KL_loss) / batch_size) + (perp_loss * perp_weight)
                else:
                    # Early epochs: train without the perplexity term
                    NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight = loss_fn(
                        logp, batch['target'], batch['length'], mean, logv,
                        args.anneal_function, step, args.k, args.x0,
                        0, perplexity_anneal_function)
                    loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # Restore the mode, since inference switched the model to eval
                if split == 'train':
                    model.train()
                else:
                    model.eval()

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # Add loss
                epoch_loss.append(loss.item())

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, "
                          "KL-Weight %6.3f, Perp-loss %9.4f, Perp-weight %6.3f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             NLL_loss.item() / batch_size, KL_loss.item() / batch_size,
                             KL_weight, perp_loss, perp_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
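
# --- Illustration (not part of the training scripts): latent interpolation
# as used above. This `interpolate` is a stand-in for the repo's utility,
# assumed to return `steps` evenly spaced points between `start` and `end`.
def _show_interpolation():
    import numpy as np

    def interpolate(start, end, steps):
        ts = np.linspace(0, 1, steps + 2)[1:-1]  # interior points only
        return np.stack([(1 - t) * np.asarray(start) + t * np.asarray(end) for t in ts])

    z1, z2 = np.zeros(4), np.ones(4)
    print(interpolate(start=z1, end=z2, steps=3))  # rows at t = 0.25, 0.5, 0.75
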
# Variant: definition modeling with two decoders, one NLL term for the
# definition and one for the headword, each with its own annealed weight.
# (The use_bert flag was garbled in the source; it is disabled here.)
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.localtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ,
                              use_bert=False)

    model = SentenceVAE(alphabet_size=datasets['train'].alphabet_size,
                        vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)
    print("Saving model to directory: " + save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    def word_weight_function(step, k, x0):
        return float(1 / (1 + np.exp(-k * (step - x0))))

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets['train'].pad_idx)

    def loss_fn(def_logp, word_logp, def_target, def_length, word_target, word_length,
                mean, logv):

        # cut off unnecessary padding from the target definition, and flatten
        def_target = def_target[:, :torch.max(def_length).item()].contiguous().view(-1)
        def_logp = def_logp.view(-1, def_logp.size(2))

        # Negative Log Likelihood for the definition
        def_NLL_loss = NLL(def_logp, def_target)

        # cut off padding for words
        word_target = word_target[:, :torch.max(word_length).item()].contiguous().view(-1)
        word_logp = word_logp.view(-1, word_logp.size(2))

        # Word NLL
        word_NLL_loss = NLL(word_logp, word_target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())

        return def_NLL_loss, word_NLL_loss, KL_loss

    def get_weights(anneal_function, step, k, x0):
        # for the logistic function, k is the growth rate
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)
        word_weight = word_weight_function(step, k, x0)
        return {'def': 1, 'word': word_weight, 'kl': KL_weight}

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model = model.train()
            else:
                model = model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                [def_logp, word_logp], mean, logv, z = model(batch['input'],
                                                             batch['length'],
                                                             batch['word_length'])

                # loss calculation
                def_NLL_loss, word_NLL_loss, KL_loss = loss_fn(
                    def_logp, word_logp, batch['target'], batch['length'],
                    batch['word'], batch['word_length'], mean, logv)
                weights = get_weights(args.anneal_function, step, args.k, args.x0)
                loss = (weights['def'] * def_NLL_loss
                        + weights['word'] * word_NLL_loss
                        + weights['kl'] * KL_loss) / batch_size
                mean_logv = torch.mean(logv)

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.detach().unsqueeze(0)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/Def NLL Loss" % split.upper(),
                                      def_NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/Word NLL Loss" % split.upper(),
                                      word_NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), weights['kl'],
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/Word Weight" % split.upper(), weights['word'],
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, Def NLL-Loss %9.4f, "
                          "Word NLL-Loss %9.4f Word-Weight %6.3f, KL-Loss %9.4f, "
                          "KL-Weight %6.3f KL-VAL %9.4f"
                          % (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                             def_NLL_loss.item() / batch_size,
                             word_NLL_loss.item() / batch_size, weights['word'],
                             KL_loss.item() / batch_size, weights['kl'], mean_logv))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'],
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"
                  % (split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
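
# --- Illustration (not part of the training scripts): why the NLL losses
# above are built with ignore_index=pad_idx. Padded target positions add
# nothing to the summed loss. Dummy values; pad_idx=0 assumed.
def _show_ignore_index():
    import torch

    nll = torch.nn.NLLLoss(ignore_index=0, reduction='sum')
    logp = torch.log_softmax(torch.randn(4, 5), dim=-1)  # 4 timesteps, vocab 5
    target = torch.tensor([3, 1, 0, 0])                  # last two are padding
    print(torch.allclose(nll(logp, target), nll(logp[:2], target[:2])))  # True
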
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        # datasets[split] = BGoogle(
        #     data_dir=args.data_dir,
        #     split=split,
        #     create_data=args.create_data,
        #     batch_size=args.batch_size,
        #     max_sequence_length=args.max_sequence_length,
        #     min_occ=args.min_occ
        # )
        datasets[split] = Amazon(data_dir=args.data_dir,
                                 split=split,
                                 create_data=args.create_data,
                                 batch_size=args.batch_size,
                                 max_sequence_length=args.max_sequence_length,
                                 min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    tokenizer = TweetTokenizer(preserve_case=False)
    vocab_file = "amazon.vocab.json"
    with open(os.path.join(args.data_dir, vocab_file), 'r') as file:
        vocab = json.load(file)
    w2i, i2w = vocab['w2i'], vocab['i2w']

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    # save_model_path = os.path.join(args.save_model_path, ts)
    save_model_path = args.save_model_path
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum',
                           ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    save_mode = True
    last_ELBO = 1e32
    for epoch in range(args.epochs):
        print("+" * 20)
        # f_test_example(model, tokenizer, w2i, i2w)
        for split in splits:
            # data_loader = DataLoader(
            #     dataset=datasets[split],
            #     batch_size=args.batch_size,
            #     shuffle=split=='train',
            #     num_workers=cpu_count(),
            #     pin_memory=torch.cuda.is_available()
            # )
            batch_size = args.batch_size
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            # for iteration, batch in enumerate(data_loader):
            iteration = 0
            iteration_total = datasets[split].batch_num
            print("batch_num", iteration_total)
            for input_batch_tensor, target_batch_tensor, length_batch_tensor \
                    in datasets[split]:
                if torch.is_tensor(input_batch_tensor):
                    input_batch_tensor = to_var(input_batch_tensor)
                if torch.is_tensor(target_batch_tensor):
                    target_batch_tensor = to_var(target_batch_tensor)
                if torch.is_tensor(length_batch_tensor):
                    length_batch_tensor = to_var(length_batch_tensor)

                # batch_size = batch['input'].size(0)
                # for k, v in batch.items():
                #     if torch.is_tensor(v):
                #         batch[k] = to_var(v)

                # Forward pass
                # logp, mean, logv, z = model(batch['input'], batch['length'])
                logp, mean, logv, z = model(input_batch_tensor,
                                            length_batch_tensor)

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(
                    logp, target_batch_tensor, length_batch_tensor, mean,
                    logv, args.anneal_function, step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1
                iteration += 1

                # bookkeeping
                # print("elbo", tracker['ELBO'])
                # print("loss", loss)
                if iteration == 0:
                    tracker['ELBO'] = loss.data
                    tracker['ELBO'] = tracker['ELBO'].view(1)
                else:
                    tracker['ELBO'] = torch.cat(
                        (tracker['ELBO'], loss.view(1)))

                if args.tensorboard_logging:
                    # print(loss.data)
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * iteration_total + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * iteration_total + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * iteration_total + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * iteration_total + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == iteration_total:
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, "
                          "KL-Loss %9.4f, KL-Weight %6.3f" %
                          (split.upper(), iteration, iteration_total - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                # if split == 'valid':
                #     if 'target_sents' not in tracker:
                #         tracker['target_sents'] = list()
                #     tracker['target_sents'] += idx2word(batch['target'].data, i2w=datasets['train'].get_i2w(), pad_idx=datasets['train'].pad_idx)
                #     # print("z", tracker['z'], z)
                #     tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)
                # break

            cur_ELBO = torch.mean(tracker['ELBO'])
            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs, cur_ELBO))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), cur_ELBO,
                                  epoch)

            # keep saving only while validation ELBO keeps improving; note
            # the flag takes effect on the *next* epoch's train split
            if split == "valid":
                if cur_ELBO < last_ELBO:
                    save_mode = True
                else:
                    save_mode = False
                last_ELBO = cur_ELBO

            # save a dump of all sentences and the encoded latent space
            # if split == 'valid':
            #     dump = {'target_sents':tracker['target_sents'], 'z':tracker['z'].tolist()}
            #     if not os.path.exists(os.path.join('dumps', ts)):
            #         os.makedirs('dumps/'+ts)
            #     with open(os.path.join('dumps/'+ts+'/valid_E%i.json'%epoch), 'w') as dump_file:
            #         json.dump(dump,dump_file)

            # save checkpoint
            if split == 'train':
                # checkpoint_path = os.path.join(save_model_path, "E%i.pytorch"%(epoch))
                checkpoint_path = os.path.join(save_model_path,
                                               "best.pytorch")
                if save_mode:
                    torch.save(model.state_dict(), checkpoint_path)
                    print("Model saved at %s" % checkpoint_path)
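# -----------------------------------------------------------------------
# NOTE: to_var() comes from the repo's utils module and is not shown in
# this excerpt. A minimal sketch consistent with how it is called above
# (an assumption): it just moves tensors to the GPU when one is available.
# -----------------------------------------------------------------------
import torch

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return x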
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    ptb = PTB(vocab_file=args.vocab_file,
              train_file=args.train_file,
              train_with_vocab=False,
              create_data=args.create_data,
              max_sequence_length=args.max_sequence_length,
              min_occ=args.min_occ)
    datasets = PTBDataset(ptb)
    print('done preprocessing data')

    model = SentenceVAE(vocab_size=datasets.vocab_size,
                        sos_idx=datasets.sos_idx,
                        eos_idx=datasets.eos_idx,
                        pad_idx=datasets.pad_idx,
                        unk_idx=datasets.unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)
    model.ptb = ptb

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets.pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        # target = target[:, :torch.max(length).data[0]].contiguous().view(-1)
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    model.train()
    split = 'train'  # this variant trains only; there is no validation pass
    for epoch in range(args.epochs):
        data_loader = DataLoader(dataset=datasets,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=cpu_count(),
                                 pin_memory=torch.cuda.is_available())

        tracker = defaultdict(list)

        for iteration, batch in enumerate(data_loader):
            batch_size = batch['input'].size(0)

            for k, v in batch.items():
                if torch.is_tensor(v):
                    batch[k] = to_var(v)

            # Forward pass
            # logp, mean, logv, z = model(batch['input'], batch['length'])
            logp, mean, logv, z, encoder_last = model(batch['input'],
                                                      batch['length'])

            # loss calculation
            NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                   batch['length'], mean,
                                                   logv,
                                                   args.anneal_function,
                                                   step, args.k, args.x0)

            loss = (NLL_loss + KL_weight * KL_loss) / batch_size

            # backward + optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step += 1

            # bookkeeping
            tracker['ELBO'].append(loss.data.cpu().numpy().tolist())

            if args.tensorboard_logging:
                writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                  epoch * len(data_loader) + iteration)
                writer.add_scalar("%s/NLL Loss" % split.upper(),
                                  NLL_loss.item() / batch_size,
                                  epoch * len(data_loader) + iteration)
                writer.add_scalar("%s/KL Loss" % split.upper(),
                                  KL_loss.item() / batch_size,
                                  epoch * len(data_loader) + iteration)
                writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                  epoch * len(data_loader) + iteration)

            if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, "
                      "KL-Loss %9.4f, KL-Weight %6.3f" %
                      (split.upper(), iteration, len(data_loader) - 1,
                       loss.item(), NLL_loss.item() / batch_size,
                       KL_loss.item() / batch_size, KL_weight))

            if split == 'valid':  # unreachable: split is fixed to 'train'
                if 'target_sents' not in tracker:
                    tracker['target_sents'] = list()
                tracker['target_sents'] += idx2word(batch['target'].data,
                                                    i2w=datasets.get_i2w(),
                                                    pad_idx=datasets.pad_idx)
                tracker['z'].append(z.data)

        print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
              (split.upper(), epoch, args.epochs, np.mean(tracker['ELBO'])))

        if args.tensorboard_logging:
            writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                              np.mean(tracker['ELBO']), epoch)

        '''
        # save a dump of all sentences and the encoded latent space
        if split == 'valid':
            dump = {'target_sents':tracker['target_sents'], 'z':tracker['z']}
            if not os.path.exists(os.path.join('dumps', ts)):
                os.makedirs('dumps/'+ts)
            with open(os.path.join('dumps/'+ts+'/valid_E%i.json'%epoch), 'w') as dump_file:
                json.dump(dump,dump_file)
        '''

        # save checkpoint
        if split == 'train':
            checkpoint_path = os.path.join(save_model_path,
                                           "E%i.pytorch" % epoch)
            torch.save(model.state_dict(), checkpoint_path)
            # NOTE: the original joblib.dump used checkpoint_path itself,
            # clobbering the state_dict written just above; give the whole-
            # module pickle its own file instead
            joblib.dump(model.cpu(), checkpoint_path + '.joblib')
            print("Model saved at %s" % checkpoint_path)
            if torch.cuda.is_available():
                model.cuda()
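# -----------------------------------------------------------------------
# Usage sketch (not part of the original script): restoring a state_dict
# written by the loop above. Standard PyTorch API; the model must be
# constructed with the same hyperparameters used at training time, and
# map_location='cpu' makes the load work on CPU-only machines.
# -----------------------------------------------------------------------
def load_vae_checkpoint(model, checkpoint_path):
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state)
    model.eval()
    return model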
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.max_sequence_length,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab,
                               seq_len=args.max_sequence_length,
                               on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    splits = ['train', 'test']
    data_loaders = {'train': train_data_loader, 'test': test_data_loader}

    model = SentenceVAE(
        vocab_size=len(vocab),
        sos_idx=vocab.sos_index,
        eos_idx=vocab.eos_index,
        pad_idx=vocab.pad_index,
        unk_idx=vocab.unk_index,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path)
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=vocab.pad_index)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        # (not applied in this variant)

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    for epoch in range(args.epochs):
        for split in splits:
            data_loader = data_loaders[split]
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            correct = 0
            close = 0
            total = 0
            for iteration, batch in enumerate(data_loader):
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'],
                                            batch['raw_length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(
                    logp, batch['target'], batch['raw_length'], mean, logv,
                    args.anneal_function, step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                correct += logp.argmax(dim=1).eq(batch['target']).sum().item()
                # "close" counts predictions within +/-10 of the target index
                close += torch.mul(
                    logp.argmax(dim=1).ge(batch['target'] - 10),
                    logp.argmax(dim=1).le(batch['target'] + 10)).sum().item()
                total += batch['target'].nelement()

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.view(1)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, "
                          "KL-Loss %9.4f, KL-Weight %6.3f" %
                          (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                # unreachable: splits are ['train', 'test'] in this variant,
                # and `datasets` is not defined here; kept from the template
                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['raw'].data,
                        i2w=datasets['train'].get_i2w(),
                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f, acc %f, clo %f" %
                  (split.upper(), epoch, args.epochs,
                   torch.mean(tracker['ELBO']), correct / total,
                   close / total))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            # (unreachable for the same reason as above)
            if split == 'valid':
                dump = {
                    'target_sents': tracker['target_sents'],
                    'z': tracker['z'].tolist()
                }
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch),
                          'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
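# -----------------------------------------------------------------------
# Sanity-check sketch (not part of the original script) for the two KL
# annealing schedules implemented by kl_anneal_function() above: the
# logistic schedule ramps smoothly around step x0, while the linear one
# reaches 1.0 exactly at x0. k=0.0025 and x0=2500 are common defaults for
# these scripts, used here only for illustration.
# -----------------------------------------------------------------------
import numpy as np

def print_anneal_schedule(k=0.0025, x0=2500):
    for s in (0, 1000, 2500, 5000):
        logistic = float(1 / (1 + np.exp(-k * (s - x0))))
        linear = min(1, s / x0)
        print("step %5d  logistic %.3f  linear %.3f" % (s, logistic, linear))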
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid']

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PoetryDataset(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional,
                        condition_size=7)

    if torch.cuda.is_available():
        model = model.cuda()

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(reduction='sum',
                           ignore_index=datasets['train'].pad_idx)

    def calculate_bleu_scores(original, decoded):
        reference = original.split(' ')
        hypothesis = decoded.split(' ')
        return nltk.translate.bleu_score.sentence_bleu([reference],
                                                       hypothesis)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length)].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    for epoch in range(args.epochs):
        total_BLEU_score = 0
        total_BLEU_count = 0  # count every scored poem, not just the last batch
        for split in splits:
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=0,
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(
                    batch['input'],
                    batch['length'],
                    condition=batch['category'].float())
                # logp, mean, logv, z = model(batch['input'], batch['length'], condition=None)

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean,
                                                       logv,
                                                       args.anneal_function,
                                                       step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat(
                    (tracker['ELBO'], loss.data.unsqueeze(0)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, "
                          "KL-Loss %9.4f, KL-Weight %6.3f" %
                          (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].data,
                        i2w=datasets['train'].get_i2w(),
                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

                    # Calculate BLEU score
                    decoded = torch.argmax(logp, dim=-1)
                    for i in range(decoded.shape[0]):
                        decoded_poem = idx2word(
                            [decoded[i]],
                            i2w=datasets['train'].get_i2w(),
                            pad_idx=datasets['train'].pad_idx)[0]
                        original_poem = idx2word(
                            [batch['target'].data[i]],
                            i2w=datasets['train'].get_i2w(),
                            pad_idx=datasets['train'].pad_idx)[0]
                        total_BLEU_score += calculate_bleu_scores(
                            original_poem, decoded_poem)
                        total_BLEU_count += 1

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs,
                   torch.mean(tracker['ELBO'])))

            if split == 'valid':
                # the original divided by the last batch size only; average
                # over all scored poems instead
                print("Average BLEU {}".format(
                    total_BLEU_score / max(total_BLEU_count, 1)))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {
                    'target_sents': tracker['target_sents'],
                    'z': tracker['z'].tolist()
                }
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch),
                          'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
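# -----------------------------------------------------------------------
# Toy usage sketch (not part of the original script) for
# calculate_bleu_scores() above: nltk's sentence_bleu with a single
# reference and default 4-gram weights. Short sentences are heavily
# penalized by the higher-order n-gram terms, so low scores on toy pairs
# like this one are expected.
# -----------------------------------------------------------------------
import nltk

def bleu_demo():
    original = "the cat sat on the mat"
    decoded = "the cat sat on a mat"
    reference = original.split(' ')
    hypothesis = decoded.split(' ')
    score = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
    print("BLEU: %.4f" % score)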
def main(args):
    ts = time.strftime('%Y-%b-%d-%H-%M-%S', time.gmtime())

    # Load dataset
    splits = ['train', 'valid']
    datasets = OrderedDict()
    for split in splits:
        datasets[split] = Data(split, args.num_region, args.batch_size,
                               args.site, args.subject, args.seq_len,
                               args.embedding_size, args.cut_start,
                               args.lines)

    # load model
    model = LSTM_VAE(
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,                    # gru
        hidden_size=args.hidden_size,              # 256
        word_dropout=args.word_dropout,            # 0
        embedding_dropout=args.embedding_dropout,  # 0.5
        latent_size=args.latent_size,              # 8
        num_layers=args.num_layers,                # 1
        bidirectional=args.bidirectional           # false
    )

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)
    """
    SentenceVAE(
        (embedding_dropout): Dropout(p=0.5)
        (encoder_rnn): GRU(32, 256, batch_first=True)
        (decoder_rnn): GRU(32, 256, batch_first=True)
        (hidden2mean): Linear(in_features=256, out_features=8, bias=True)
        (hidden2logv): Linear(in_features=256, out_features=8, bias=True)
        (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
    )
    """

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    # NLL = torch.nn.NLLLoss(size_average=False)
    mse_loss = torch.nn.MSELoss()
    cos_loss = torch.nn.CosineSimilarity(dim=-1)

    def loss_fn(output, target, mean, logvar):
        mse = mse_loss(output, target)
        cos = torch.mean(1 - cos_loss(output, target))
        KL_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
        return cos, mse, KL_loss

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    learning_rate = args.learning_rate
    for epoch in range(1, args.epochs + 1):
        if epoch > args.decay_epoch:
            learning_rate = learning_rate * args.learning_rate_decay
        # NOTE: recreating the optimizer every epoch resets Adam's moment
        # estimates; kept as in the original
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        for split in splits:
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):
                batch_size = args.batch_size
                batch = batch.type(torch.float32)
                length = [args.seq_len for _ in range(args.batch_size)]

                if torch.is_tensor(batch):
                    batch = to_var(batch)
                target = batch.clone()

                # Forward pass
                output, mean, logvar, z = model(batch, length)

                # loss calculation
                cos, mse, KL_loss = loss_fn(output, target, mean, logvar)
                # print(cos.item(), mse.item(), KL_loss.item())
                loss = (cos + mse + KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat(
                    (tracker['ELBO'], loss.detach().reshape(1)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/Cos Loss" % split.upper(),
                                      cos.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/MSE Loss" % split.upper(),
                                      mse.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, Cos-Loss %9.4f, "
                          "MSE-Loss %9.4f, KL-Loss %9.4f" %
                          (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), cos.item() / batch_size,
                           mse.item() / batch_size,
                           KL_loss.item() / batch_size))

                # if split == 'valid':
                #     if 'target_sents' not in tracker:
                #         tracker['target_sents'] = list()
                #     tracker['target_sents'] += idx2word(batch['target'].detach(), i2w=datasets['train'].get_i2w(),
                #                                         pad_idx=datasets['train'].pad_idx)
                #     tracker['z'] = torch.cat((tracker['z'], z.detach()), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs,
                   torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # # save a dump of all sentences and the encoded latent space
            # if split == 'valid':
            #     dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
            #     if not os.path.exists(os.path.join('dumps', ts)):
            #         os.makedirs('dumps/' + ts)
            #     with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
            #         json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

            # save target & output for last validation batch
            if (epoch == args.epochs) and (split == "valid"):
                save = {
                    'target': target.cpu().detach().numpy().tolist(),
                    'output': output.cpu().detach().numpy().tolist()
                }
                with io.open('./{}_save.json'.format(args.site),
                             'wb') as data_file:
                    data = json.dumps(save, ensure_ascii=False)
                    data_file.write(data.encode('utf8', 'replace'))

    # save latent space
    latent = []
    for split in splits:
        data_loader = DataLoader(dataset=datasets[split],
                                 batch_size=args.batch_size,
                                 num_workers=cpu_count(),
                                 pin_memory=torch.cuda.is_available())
        model.eval()
        for iteration, batch in enumerate(data_loader):
            batch = batch.type(torch.float32)
            length = [args.seq_len for _ in range(args.batch_size)]
            if torch.is_tensor(batch):
                batch = to_var(batch)

            # Forward pass
            output, mean, logv, z = model(batch, length)
            # save latent space for both training and validation batch
            latent.append(z.cpu().detach().numpy().tolist())

    latent = np.array(latent).reshape(args.subject, args.num_region,
                                      args.seq_len, args.latent_size)
    print(np.shape(latent))
    with io.open('./{}_latent.json'.format(args.site), 'wb') as data_file:
        data = json.dumps(latent.tolist(), ensure_ascii=False)
        data_file.write(data.encode('utf8', 'replace'))
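# -----------------------------------------------------------------------
# The closed-form KL term used by every loss_fn in this file, factored out
# for reference (not part of the original scripts): for a diagonal Gaussian
# posterior q(z|x) = N(mean, exp(logv)) against a standard normal prior,
#   KL(q || p) = -0.5 * sum(1 + logv - mean^2 - exp(logv))
# -----------------------------------------------------------------------
import torch

def gaussian_kl(mean, logv):
    return -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())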
def main(args):
    # create dir name
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    ts = ts.replace(':', '-')
    ts = ts + '-' + args.dataset
    if args.attention:
        ts = ts + '-self-attn'
    ts = ts + "-" + str(args.epochs)

    if args.dataset == "yelp":
        print("Running Yelp!")
        dataset = Yelp

    # prepare dataset
    splits = ['train', 'test']

    # create dataset object
    datasets = OrderedDict()

    # create test and train split in data, also preprocess
    for split in splits:
        print("creating dataset for: {}".format(split))
        datasets[split] = dataset(split=split,
                                  create_data=args.create_data,
                                  min_occ=args.min_occ)

    i2w = datasets['train'].get_i2w()
    w2i = datasets['train'].get_w2i()
    # print(type(int(datasets['train'].yelp_max_sequence_length)))
    max_sequence_length = datasets['train'].max_sequence_length

    # get training params
    params = dict(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        unk_idx=datasets['train'].unk_idx,
        max_sequence_length=max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        attention=args.attention,
        dataset=args.dataset,
    )

    # init model object
    model = BinaryClassifier(**params)

    if torch.cuda.is_available():
        model = model.cuda()

    # logging
    print(model)
    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    # make dir
    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    # write params to json and save
    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.Tensor

    step = 0
    overall_losses = defaultdict(dict)
    loss_at_epoch = {'loss': 0.0, 'acc': 0.0}

    for epoch in range(args.epochs):
        # do train and then test
        for split in splits:
            # create dataloader
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            # tracker used to track the loss
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            # start batch wise training/testing
            for iteration, batch in enumerate(data_loader):
                # get batch size
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                preds = model(batch['input'], batch['length'])

                # ae loss calculation; move labels to the predictions'
                # device (the original unconditionally called .cuda())
                loss = nn.BCELoss()(preds,
                                    batch['label'].float().to(preds.device))

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()  # flush grads
                    loss.backward()
                    optimizer.step()
                    step += 1

                # calculate accuracies
                preds = torch.argmax(preds, dim=1)
                ground_truth = torch.argmax(batch['label'], dim=1)
                acc = (preds == ground_truth).sum().item() / batch_size

                # try sample to verify style classifier is working
                # print(idx2word(batch['target'][0:1], i2w=i2w, pad_idx=w2i['<pad>']))
                # print(batch['label'][0])
                # print("neg: {}, pos: {}".format(style_preds[0:1,0], style_preds[0:1,1]))

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("-----------------------------------------------------------------------")
                    print("%s Batch %04d/%i, Loss %9.4f, Acc %9.4f" %
                          (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), acc))

            # save checkpoint
            if split == 'train':
                loss_at_epoch['loss'] = float(loss)
                loss_at_epoch['acc'] = float(acc)
                # store a copy: sharing one dict object would make every
                # epoch's entry point at the same (final) values
                overall_losses[len(overall_losses)] = dict(loss_at_epoch)

                # save next to model_params.json (the original wrote
                # checkpoints to the parent dir instead)
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

                # write losses to json
                with open(os.path.join(save_model_path, 'losses.json'),
                          'w') as f:
                    json.dump(overall_losses, f, indent=4)
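# -----------------------------------------------------------------------
# Usage sketch (assumed workflow, not part of the original script):
# rebuild the classifier for inference from the model_params.json and
# E%i.pytorch files written above. Directory and epoch are illustrative.
# -----------------------------------------------------------------------
def load_classifier(model_dir, epoch):
    with open(os.path.join(model_dir, 'model_params.json')) as f:
        params = json.load(f)
    model = BinaryClassifier(**params)
    state = torch.load(os.path.join(model_dir, "E%i.pytorch" % epoch),
                       map_location='cpu')
    model.load_state_dict(state)
    model.eval()
    return model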