class Tester(object):
    def __init__(self, config):
        self.content_images = glob.glob(config.exp_content_dir + '/*/*.jpg')
        self.encoder = Encoder().cuda()
        self.decoder = Decoder()
        self.keyencoder = KeyEncoder().cuda()
        self.decoder.load_state_dict(torch.load('./decoder.pth'))
        self.decoder = self.decoder.cuda()
        self.keyencoder.load_state_dict(torch.load('./key.pth'))
        self.keyencoder = self.keyencoder.cuda()
        if config.attention == 'soft':
            self.AsyAtt = AsyAtt()
        else:
            self.AsyAtt = AsyAttHard()
        S_path = os.path.join(config.style_dir, str(config.S))
        style_images = glob.glob(S_path + '/*.jpg')
        s = Image.open(style_images[0])
        s = trans(s).cuda()
        self.style_image = s.unsqueeze(0)
        self.style_target = torch.stack([s for _ in range(config.batch_size)], 0)

    def test(self):
        self.encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            style_val = self.encoder(self.style_image)
            style_key = self.keyencoder(style_val)
            for filename in self.content_images:
                name = str(filename).split("test_images")[-1][1:].replace("\\", "-")
                name = name.replace("/", "-")
                c = Image.open(filename)
                c_tensor = trans(c).unsqueeze(0).cuda()
                val = self.encoder(c_tensor)
                key = self.keyencoder(val)
                content_feature = self.AsyAtt(style_key[0], style_val[0], key, val)
                out = self.decoder(content_feature)
                out = denorm(out).to('cpu')[0]
                c_tensor = denorm(c_tensor).to('cpu')[0]
                # Zero-pad the shorter image along the height axis so the two
                # tensors can be concatenated side by side for comparison.
                if out.shape[1] > c_tensor.shape[1]:
                    c_tensor = torch.cat([
                        c_tensor,
                        torch.zeros([c_tensor.shape[0],
                                     out.shape[1] - c_tensor.shape[1],
                                     c_tensor.shape[2]])
                    ], 1)
                elif out.shape[1] < c_tensor.shape[1]:
                    out = torch.cat([
                        out,
                        torch.zeros([out.shape[0],
                                     c_tensor.shape[1] - out.shape[1],
                                     out.shape[2]])
                    ], 1)
                save_image(torch.cat([out, c_tensor], 2),
                           os.path.join('./logs/test', name))
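# The Tester above relies on `trans` and `denorm` helpers that are not defined
# in this file. A minimal sketch of what they typically look like in
# ImageNet-normalized style-transfer pipelines; the exact mean/std values are
# an assumption, not taken from the original repo:
from torchvision import transforms

trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def denorm(x):
    # Invert the Normalize above so images can be saved for viewing.
    mean = torch.tensor([0.485, 0.456, 0.406], device=x.device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=x.device).view(1, 3, 1, 1)
    return (x * std + mean).clamp(0, 1)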
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999  # peak-normalize
        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1  # scale to roughly [0, 1]
        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
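# `preemphasis` is used above but defined elsewhere. The common definition is a
# first-order high-pass filter; sketched here under that assumption:
from scipy.signal import lfilter

def preemphasis(wav, coef=0.97):
    # y[t] = x[t] - coef * x[t-1]
    return lfilter([1, -coef], [1], wav)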
def main():
    args = check_argv()

    # Code indices
    code_indices_fn = Path(args.code_indices_fn)
    print("Reading: {}".format(code_indices_fn))
    code_indices = np.loadtxt(code_indices_fn, dtype=int)  # np.int was removed in recent NumPy

    # Speakers
    with open(Path("datasets/2019/english/speakers.json")) as f:
        speakers = sorted(json.load(f))

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(in_channels=80, channels=768, n_embeddings=512,
                      embedding_dim=64, jitter=0.5)
    decoder = Decoder(
        in_channels=64,
        conditioning_channels=128,
        n_speakers=102,
        speaker_embedding_dim=64,
        mu_embedding_dim=256,
        rnn_channels=896,
        fc_channels=256,
        bits=8,
        hop_length=160,
    )
    decoder.to(device)
    print("Reading: {}".format(args.checkpoint))
    checkpoint_path = args.checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()

    # Codes: look the stored indices up in the learned VQ codebook
    embedding = encoder.codebook.embedding.cpu().numpy()
    codes = np.array([embedding[code_indices]])

    # Synthesize
    z = torch.FloatTensor(codes).to(device)
    speaker = torch.LongTensor([speakers.index(args.speaker)]).to(device)
    with torch.no_grad():
        output = decoder.generate(z, speaker)

    wav_fn = Path(code_indices_fn.stem).with_suffix(".wav")
    print("Writing: {}".format(wav_fn))
    librosa.output.write_wav(wav_fn, output.astype(np.float32), sr=16000)
def sample(load_dir: str, save_dir: str, use_gpu: bool) -> None:
    '''
    Sample the FantasyMapGAN for new maps.
    Saves the generated images to `save_dir`.

    Parameters
    ----------
    load_dir: str
        folder to load network weights from
    save_dir: str
        folder to save generated images to
    use_gpu: bool
        Set to true to run sampling on GPU, otherwise run on CPU
    '''
    # Network
    model = Decoder()
    model = model.eval()
    if use_gpu:
        model = model.cuda()
    if load_dir:
        # Load the most recently written decoder checkpoint.
        fs = glob(os.path.join(load_dir, '*_dec.pth'))
        fs.sort(key=os.path.getmtime)
        model.load_state_dict(torch.load(fs[-1]))

    # Generate
    z = torch.randn((1, model.latent_dim))
    if use_gpu:
        z = z.cuda()  # keep the latent on the same device as the model
    x = model(z)

    # Save
    save_path = os.path.join(save_dir, str(uuid.uuid1()) + '.png')
    save_image(x.squeeze(), save_path)
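# A minimal driver for `sample`; the directory names here are illustrative,
# not taken from the original project:
if __name__ == '__main__':
    os.makedirs('./samples', exist_ok=True)
    sample(load_dir='./checkpoints', save_dir='./samples',
           use_gpu=torch.cuda.is_available())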
def initialize_for_test(params):
    data_loader = get_loader(params, mode='test')
    encoder_file = os.path.join(params.encoder_save, 'epoch-%d.pkl' % params.num_epochs)
    decoder_file = os.path.join(params.decoder_save, 'epoch-%d.pkl' % params.num_epochs)
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = Encoder(params)
    decoder = Decoder(params, vocab_size)
    encoder.eval()
    decoder.eval()

    # Load the trained weights (map_location lets CPU-only machines load GPU checkpoints).
    encoder.load_state_dict(torch.load(encoder_file, map_location=params.device))
    decoder.load_state_dict(torch.load(decoder_file, map_location=params.device))
    encoder.to(params.device)
    decoder.to(params.device)
    return data_loader, encoder, decoder
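# A sketch of calling initialize_for_test. The attribute names follow the code
# above; the concrete values are assumptions, and the remaining attributes
# depend on what get_loader/Encoder/Decoder expect:
from argparse import Namespace

params = Namespace(
    encoder_save='./models/encoder',
    decoder_save='./models/decoder',
    num_epochs=10,
    device=torch.device('cpu'),
)
# data_loader, encoder, decoder = initialize_for_test(params)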
def main(test_img_path):
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)
    for checkpoint_path in options.checkpoint:
        checkpoint_name, _ = os.path.splitext(os.path.basename(checkpoint_path))
        checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                      if checkpoint_path else default_checkpoint)
        encoder_checkpoint = checkpoint["model"].get("encoder")
        decoder_checkpoint = checkpoint["model"].get("decoder")

        test_img = Image.open(test_img_path)
        test_img = test_img.convert("RGB")

        enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device)
        dec = Decoder(
            1,
            low_res_shape,
            high_res_shape,
            checkpoint=decoder_checkpoint,
            device=device,
        ).to(device)
        enc.eval()
        dec.eval()

        result = evaluate(
            enc,
            dec,
            test_img=test_img,
            device=device,
            checkpoint=checkpoint,
            beam_width=options.beam_width,
            prefix=options.prefix,
        )
        print(result)
def main():
    data_set = F2EDataSet(max_length=max_seq_len)
    loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)

    encoder = Encoder(data_set.in_lang.token_n, embed_size=embed_size,
                      hidden_size=hidden_size, num_layers=num_layers,
                      drop_prob=drop_prob).to(device)
    decoder = Decoder(vocab_size=data_set.out_lang.token_n, embed_size=embed_size,
                      hidden_size=hidden_size, num_layers=num_layers,
                      attention_size=attention_size, drop_prob=drop_prob).to(device)
    enc_optimizer = optim.Adam(encoder.parameters(), lr=lr)
    dec_optimizer = optim.Adam(decoder.parameters(), lr=lr)
    criteon = nn.CrossEntropyLoss(reduction='none').to(device)

    random_sample_sentences = data_set.random_sample(k=random_sample_k)
    sample_in_indices = []
    for in_sentence, out_sentence in random_sample_sentences:
        sample_in_indices.append(
            data_set.convert_token_to_index(data_set.in_lang, in_sentence))
    # sample_in_indices: shape [random_sample_k, max_len], dtype: int64
    sample_in_indices = torch.LongTensor(sample_in_indices).to(device)
    # sample_in_indices: [random_sample_k, 1, max_len]
    sample_in_indices = torch.unsqueeze(sample_in_indices, dim=1)

    for epoch in range(num_epochs):
        total_loss = 0
        encoder.train()
        decoder.train()
        for batch_idx, (in_seq, out_seq) in enumerate(loader):
            this_batch_size = in_seq.shape[0]
            # in_seq, out_seq shape: [batch_size, max_len], dtype = int64
            in_seq, out_seq = in_seq.to(device), out_seq.to(device)
            # enc_outputs of shape (seq_len, batch, num_directions * hidden_size)
            # enc_hidden of shape (num_layers * num_directions, batch, hidden_size)
            enc_outputs, enc_hidden = encoder(
                in_seq, encoder.init_hidden(this_batch_size, device=device))
            # The decoder's input at the first time step is BOS.
            # dec_input: [batch_size, 1]
            dec_input = decoder.init_input(this_batch_size, device=device)
            # Initialize the decoder hidden state from the encoder.
            # dec_hidden: [num_layers, batch_size, hidden_size]
            dec_hidden = decoder.init_hidden(enc_hidden)
            # mask: [batch_size]
            mask = torch.ones(this_batch_size, device=device)
            eos = torch.LongTensor([2] * this_batch_size).to(device)
            pad = torch.zeros(this_batch_size).to(device)
            num_not_pad_tokens = 0
            loss = 0
            for y in torch.transpose(out_seq, 0, 1):
                dec_output, dec_hidden = decoder(dec_input, dec_hidden, enc_outputs)
                loss += torch.sum(criteon(dec_output, y) * mask, dim=0)
                # y: [batch_size] => [batch_size, 1] (teacher forcing)
                dec_input = torch.unsqueeze(y, dim=1)
                num_not_pad_tokens += torch.sum(mask, dim=0)
                # Once EOS is seen, every later token in that sequence is PAD,
                # so zero out the mask at those positions.
                mask = torch.where(y != eos, mask, pad)
            loss /= num_not_pad_tokens
            total_loss += loss
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()
            loss.backward()
            enc_optimizer.step()
            dec_optimizer.step()
        decoder.eval()
        encoder.eval()
        print(f"epoch {epoch + 1}, loss = {total_loss / len(data_set)}")
        if epoch % 10 == 0:
            translate(data_set, random_sample_sentences, sample_in_indices,
                      encoder, decoder, device)
    translate(data_set, random_sample_sentences, sample_in_indices,
              encoder, decoder, device)
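# The masking rule in the loop above, shown in isolation: the mask stays 1 up
# to and including the EOS step (so EOS itself is scored), and is zeroed
# afterwards. Toy example with EOS index 2, matching the code above:
import torch

mask = torch.ones(2)
eos = torch.full((2,), 2, dtype=torch.long)
pad = torch.zeros(2)
for y in torch.tensor([[5, 5], [2, 7], [0, 2], [0, 0]]):  # rows = time steps
    print(y.tolist(), mask.tolist())  # mask applied at this step
    mask = torch.where(y != eos, mask, pad)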
def main(args):
    # create a writer
    writer = SummaryWriter('loss_plot_' + args.mode, comment='test')

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    val_length = len(os.listdir(args.image_dir_val))

    # Build data loaders
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    data_loader_val = get_loader(args.image_dir_val, args.caption_path_val, vocab,
                                 transform, args.batch_size, shuffle=True,
                                 num_workers=args.num_workers)

    # Build the model
    # if no-attention model is chosen:
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)
        criterion = nn.CrossEntropyLoss()
    # if attention model is chosen:
    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)
    # if transformer model is chosen:
    elif args.model_type == 'transformer':
        model = Transformer(len(vocab), args.embed_size, args.transformer_layers,
                            8, args.dropout_ratio).to(device)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, model.encoder.parameters()),
            lr=args.learning_rate_enc)
        decoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, model.decoder.parameters()),
            lr=args.learning_rate_dec)
        criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])
    else:
        print('Select model_type attention or no_attention')

    # If the model is not a transformer, an additional encoder step is needed:
    # freeze the lower layers of the resnet if args.fine_tune == True.
    if args.model_type != 'transformer':
        decoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, decoder.parameters()),
            lr=args.learning_rate_dec)
        encoder.fine_tune(args.fine_tune)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=args.learning_rate_enc)

    # initialize lists to store results:
    loss_train = []
    loss_val = []
    loss_val_epoch = []
    loss_train_epoch = []
    bleu_res_list = []
    cider_res_list = []
    rouge_res_list = []
    results = {}

    # calculate total steps for train and validation
    total_step = len(data_loader)
    total_step_val = len(data_loader_val)

    # For each epoch
    for epoch in tqdm(range(args.num_epochs)):
        loss_val_iter = []
        loss_train_iter = []

        # set model to train mode
        if args.model_type != 'transformer':
            encoder.train()
            decoder.train()
        else:
            model.train()

        # for each entry in data_loader
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader)):
            # load images and captions to device
            images = images.to(device)
            captions = captions.to(device)

            # Forward, backward and optimize.
            # The forward and backward paths differ per model type:
            if args.model_type == 'no_attention':
                # get features from encoder
                features = encoder(images)
                # pack targets to the padded lengths
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                # get output from decoder
                outputs = decoder(features, captions, lengths)
                # calculate loss
                loss = criterion(outputs, targets)
                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'attention':
                # get features from encoder
                features = encoder(images)
                # get targets - starting from the 2nd word in captions
                # (the model is not sequential, so targets are predicted in
                # parallel - no need to predict the first word in captions)
                targets = captions[:, 1:]
                # decode length = length - 1 for each caption
                decode_lengths = [length - 1 for length in lengths]
                # flatten targets
                targets = targets.reshape(targets.shape[0] * targets.shape[1])
                # get scores and alphas from decoder
                scores, alphas = decoder(features, captions, decode_lengths)
                scores = scores.view(-1, scores.shape[-1])
                # predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)
                # calculate loss
                loss = decoder.loss(scores, targets, alphas)
                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'transformer':
                # input is captions without the last word
                trg_input = captions[:, :-1]
                # create mask
                trg_mask = create_masks(trg_input)
                # get scores from model
                scores = model(images, trg_input, trg_mask)
                scores = scores.view(-1, scores.shape[-1])
                # get targets - starting from the 2nd word in captions
                targets = captions[:, 1:]
                # predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)
                # calculate loss
                loss = criterion(scores,
                                 targets.reshape(targets.shape[0] * targets.shape[1]))
                # forward and backward path
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()
            else:
                print('Select model_type attention or no_attention')

            # append results to loss lists and writer
            loss_train_iter.append(loss.item())
            loss_train.append(loss.item())
            writer.add_scalar('Loss/train/iterations', loss.item(), i + 1)

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(),
                              np.exp(loss.item())))

        # append mean of last 10 batches as approximate epoch loss
        loss_train_epoch.append(np.mean(loss_train_iter[-10:]))
        writer.add_scalar('Loss/train/epoch', np.mean(loss_train_iter[-10:]), epoch + 1)

        # save model
        if args.model_type != 'transformer':
            torch.save(decoder.state_dict(),
                       os.path.join(args.model_path,
                                    'decoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
            torch.save(encoder.state_dict(),
                       os.path.join(args.model_path,
                                    'encoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        else:
            torch.save(model.state_dict(),
                       os.path.join(args.model_path,
                                    'model_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        np.save(os.path.join(args.predict_json,
                             'loss_train_temp_' + args.mode + '.npy'),
                loss_train)

        # validate model:
        # set model to eval mode:
        if args.model_type != 'transformer':
            encoder.eval()
            decoder.eval()
        else:
            model.eval()

        # set no_grad mode:
        with torch.no_grad():
            # for each entry in data_loader_val
            for i, (images, captions, lengths) in tqdm(enumerate(data_loader_val)):
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                images = images.to(device)
                captions = captions.to(device)

                if args.model_type == 'no_attention':
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)
                    loss = criterion(outputs, targets)

                elif args.model_type == 'attention':
                    features = encoder(images)
                    targets = captions[:, 1:]
                    decode_lengths = [length - 1 for length in lengths]
                    targets = targets.reshape(targets.shape[0] * targets.shape[1])
                    scores, alphas = decoder(features, captions, decode_lengths)
                    _, predicted = torch.max(scores, dim=1)
                    scores = scores.view(-1, scores.shape[-1])
                    loss = decoder.loss(scores, targets, alphas)

                elif args.model_type == 'transformer':
                    trg_input = captions[:, :-1]
                    trg_mask = create_masks(trg_input)
                    scores = model(images, trg_input, trg_mask)
                    scores = scores.view(-1, scores.shape[-1])
                    targets = captions[:, 1:]
                    _, predicted = torch.max(scores, dim=1)
                    loss = criterion(scores,
                                     targets.reshape(targets.shape[0] * targets.shape[1]))

                # display results
                if i % args.log_step == 0:
                    print('Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, '
                          'Validation Perplexity: {:5.4f}'
                          .format(epoch, args.num_epochs, i, total_step_val,
                                  loss.item(), np.exp(loss.item())))

                # append results to loss lists and writer
                loss_val.append(loss.item())
                loss_val_iter.append(loss.item())
                writer.add_scalar('Loss/validation/iterations', loss.item(), i + 1)

        np.save(os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
                loss_val)

        # results: epoch validation loss
        loss_val_epoch.append(np.mean(loss_val_iter))
        writer.add_scalar('Loss/validation/epoch', loss_val_epoch[-1], epoch + 1)

        # predict captions:
        filenames = os.listdir(args.image_dir_val)
        predicted = {}
        for file in tqdm(filenames):
            if file == '.DS_Store':
                continue
            # Prepare an image
            image = load_image(os.path.join(args.image_dir_val, file), transform)
            image_tensor = image.to(device)

            # Generate caption starting with the <start> word;
            # the procedure differs per model type.
            if args.model_type == 'attention':
                features = encoder(image_tensor)
                sampled_ids, _ = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                # start sampled_caption with <start>
                sampled_caption = ['<start>']

            elif args.model_type == 'no_attention':
                features = encoder(image_tensor)
                sampled_ids = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                sampled_caption = ['<start>']

            elif args.model_type == 'transformer':
                e_outputs = model.encoder(image_tensor)
                max_seq_length = 20
                sampled_ids = torch.zeros(max_seq_length, dtype=torch.long,
                                          device=device)
                sampled_ids[0] = vocab.word2idx['<start>']
                for i in range(1, max_seq_length):
                    trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
                    trg_mask = Variable(torch.from_numpy(trg_mask) == 0).to(device)
                    out = model.decoder(sampled_ids[:i].unsqueeze(0), e_outputs,
                                        trg_mask)
                    out = model.out(out)
                    out = F.softmax(out, dim=-1)
                    val, ix = out[:, -1].data.topk(1)
                    sampled_ids[i] = ix[0][0]
                sampled_ids = sampled_ids.cpu().numpy()
                sampled_caption = []

            # Convert word_ids to words
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                # break at <end> of the sentence
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            predicted[file] = sentence

        # save predictions to json file:
        json.dump(predicted,
                  open(os.path.join(args.predict_json,
                                    'predicted_' + args.mode + '_' + str(epoch) + '.json'),
                       'w'))

        # score the predictions against the reference captions
        with open(args.caption_path_val, 'r') as file:
            captions = json.load(file)

        res = {}
        for r in predicted:
            res[r] = [predicted[r].strip('<start> ').strip(' <end>')]

        images = captions['images']
        caps = captions['annotations']
        gts = {}
        for image in images:
            image_id = image['id']
            file_name = image['file_name']
            list_cap = []
            for cap in caps:
                if cap['image_id'] == image_id:
                    list_cap.append(cap['caption'])
            gts[file_name] = list_cap

        # calculate BLEU, CIDEr and ROUGE metrics from the reference and
        # generated captions
        bleu_res = bleu(gts, res)
        cider_res = cider(gts, res)
        rouge_res = rouge(gts, res)

        # append results to result lists
        bleu_res_list.append(bleu_res)
        cider_res_list.append(cider_res)
        rouge_res_list.append(rouge_res)

        # write results to writer
        writer.add_scalar('BLEU1/validation/epoch', bleu_res[0], epoch + 1)
        writer.add_scalar('BLEU2/validation/epoch', bleu_res[1], epoch + 1)
        writer.add_scalar('BLEU3/validation/epoch', bleu_res[2], epoch + 1)
        writer.add_scalar('BLEU4/validation/epoch', bleu_res[3], epoch + 1)
        writer.add_scalar('CIDEr/validation/epoch', cider_res, epoch + 1)
        writer.add_scalar('ROUGE/validation/epoch', rouge_res, epoch + 1)

        results['bleu'] = bleu_res_list
        results['cider'] = cider_res_list
        results['rouge'] = rouge_res_list
        json.dump(results,
                  open(os.path.join(args.predict_json,
                                    'results_' + args.mode + '.json'), 'w'))

    np.save(os.path.join(args.predict_json, 'loss_train_' + args.mode + '.npy'),
            loss_train)
    np.save(os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)
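# `create_masks` used in main() above builds the transformer decoder's
# self-attention mask. A minimal sketch, assuming pad index 0 and the usual
# "no peeking ahead" rule (the original project may combine the masks
# differently):
def create_masks(trg_input, pad_idx=0):
    # (batch, 1, len) pad mask combined with a (1, len, len) lower-triangular
    # mask that blocks attention to future positions.
    pad_mask = (trg_input != pad_idx).unsqueeze(1)
    seq_len = trg_input.size(1)
    nopeak = torch.tril(torch.ones(1, seq_len, seq_len, dtype=torch.bool,
                                   device=trg_input.device))
    return pad_mask & nopeak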
def train_dynamics(env, args, writer=None):
    """
    Trains the Dynamics module. Supervised.

    Arguments:
        env: the initialized environment (rllab/gym)
        args: input arguments
        writer: initialized summary writer for tensorboard
    """
    args.action_space = env.action_space

    # Initialize models
    enc = Encoder(env.observation_space.shape[0], args.dim, use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0], args.dim, use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    # NOTE: the mazebase branches below also reference `d_init`, which this
    # function assumes has been constructed elsewhere (it is saved alongside
    # the other modules in results_dict).

    if args.from_checkpoint is not None:
        results_dict = torch.load(args.from_checkpoint)
        enc.load_state_dict(results_dict['enc'])
        dec.load_state_dict(results_dict['dec'])
        d_module.load_state_dict(results_dict['d_module'])

    all_params = chain(enc.parameters(), dec.parameters(), d_module.parameters())
    if args.transfer:
        # Freeze the autoencoder and only train the dynamics module.
        for p in enc.parameters():
            p.requires_grad = False
        for p in dec.parameters():
            p.requires_grad = False
        all_params = d_module.parameters()

    optimizer = torch.optim.Adam(all_params, lr=args.lr,
                                 weight_decay=args.weight_decay)

    if args.gpu:
        enc = enc.cuda()
        dec = dec.cuda()
        d_module = d_module.cuda()

    # Initialize datasets
    val_loader = None
    train_dataset = DynamicsDataset(args.train_set, args.train_size,
                                    batch=args.train_batch, rollout=args.rollout)
    val_dataset = DynamicsDataset(args.test_set, 5000, batch=args.test_batch,
                                  rollout=args.rollout)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    results_dict = {
        'dec_losses': [],
        'forward_losses': [],
        'inverse_losses': [],
        'total_losses': [],
        'enc': None,
        'dec': None,
        'd_module': None,
        'd_init': None,
        'args': args
    }

    total_action_taken = 0
    correct_predicted_a_hat = 0

    # Create the mask here for re-weighting the decoder loss.
    dec_mask = None
    if args.dec_mask is not None:
        dec_mask = torch.ones(9)
        game_vocab = dict([
            (b, a) for a, b in enumerate(sorted(env.game.all_possible_features()))
        ])
        dec_mask[game_vocab['Agent']] = args.dec_mask
        dec_mask[game_vocab['Goal']] = args.dec_mask
        dec_mask = dec_mask.expand(args.batch_size, args.maze_length,
                                   args.maze_length, 9).contiguous().view(-1)
        dec_mask = Variable(dec_mask, requires_grad=False)
        if args.gpu:
            dec_mask = dec_mask.cuda()

    for epoch in range(1, args.num_epochs + 1):
        enc.train()
        dec.train()
        d_module.train()
        if args.framework == "mazebase":
            d_init.train()

        # for measuring the accuracy
        train_acc = 0
        current_epoch_actions = 0
        current_epoch_predicted_a_hat = 0
        start = time.time()
        for i, (states, target_actions) in enumerate(train_loader):
            optimizer.zero_grad()
            if args.framework != "mazebase":
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, _, _ = \
                    forward_planning(i, states, target_actions, enc, dec,
                                     d_module, args)
            else:
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, \
                    current_epoch_predicted_a_hat, current_epoch_actions = \
                    multiple_forward(i, states, target_actions, enc, dec,
                                     d_module, args, d_init, dec_mask)
            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss

            if i % args.log_interval == 0:
                log(
                    'Epoch [{}/{}]\tIter [{}/{}]\t'.format(
                        epoch, args.num_epochs, i + 1,
                        len(train_dataset) // args.batch_size) + \
                    'Time: {:.2f}\t'.format(time.time() - start) + \
                    'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0]) + \
                    'Forward Loss: {:.2f}\t'.format(forward_loss.data[0]) + \
                    'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0]) + \
                    'Loss: {:.2f}\t'.format(loss.data[0]))

                results_dict['dec_losses'].append(dec_loss.data[0])
                results_dict['forward_losses'].append(forward_loss.data[0])
                results_dict['inverse_losses'].append(inv_loss.data[0])
                results_dict['total_losses'].append(loss.data[0])

                # write the summaries here
                if writer:
                    writer.add_scalar('dynamics/total_loss', loss.data[0], epoch)
                    writer.add_scalar('dynamics/decoder', dec_loss.data[0], epoch)
                    writer.add_scalar('dynamics/reconstruction_loss',
                                      recon_loss.data[0], epoch)
                    writer.add_scalar('dynamics/next_state_prediction_loss',
                                      model_loss.data[0], epoch)
                    writer.add_scalar('dynamics/inv_loss', inv_loss.data[0], epoch)
                    writer.add_scalar('dynamics/forward_loss',
                                      forward_loss.data[0], epoch)
                    writer.add_scalars(
                        'dynamics/all_losses', {
                            "total_loss": loss.data[0],
                            "reconstruction_loss": recon_loss.data[0],
                            "next_state_prediction_loss": model_loss.data[0],
                            "decoder_loss": dec_loss.data[0],
                            "inv_loss": inv_loss.data[0],
                            "forward_loss": forward_loss.data[0],
                        }, epoch)

            loss.backward()
            correct_predicted_a_hat += current_epoch_predicted_a_hat
            total_action_taken += current_epoch_actions
            # does it not work at all without grad clipping?
            torch.nn.utils.clip_grad_norm(all_params, args.max_grad_norm)
            optimizer.step()

            # maybe add the generated image to the logs
            # writer.add_image()

        # Run validation
        if val_loader is not None:
            enc.eval()
            dec.eval()
            d_module.eval()
            forward_loss, inv_loss, dec_loss = 0, 0, 0
            for i, (states, target_actions) in enumerate(val_loader):
                f_loss, i_loss, d_loss, _, _, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
                forward_loss += f_loss
                inv_loss += i_loss
                dec_loss += d_loss
            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss
            if writer:
                writer.add_scalar('val/forward_loss', forward_loss.data[0] / i, epoch)
                writer.add_scalar('val/inverse_loss', inv_loss.data[0] / i, epoch)
                writer.add_scalar('val/decoder_loss', dec_loss.data[0] / i, epoch)
            log(
                '[Validation]\t' + \
                'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0] / i) + \
                'Forward Loss: {:.2f}\t'.format(forward_loss.data[0] / i) + \
                'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0] / i) + \
                'Loss: {:.2f}\t'.format(loss.data[0] / i))

        if epoch % args.checkpoint == 0:
            results_dict['enc'] = enc.state_dict()
            results_dict['dec'] = dec.state_dict()
            results_dict['d_module'] = d_module.state_dict()
            if args.framework == "mazebase":
                results_dict['d_init'] = d_init.state_dict()
            torch.save(
                results_dict,
                os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
            log('Saved model %s' % epoch)

    results_dict['enc'] = enc.state_dict()
    results_dict['dec'] = dec.state_dict()
    results_dict['d_module'] = d_module.state_dict()
    torch.save(results_dict,
               os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
    print(os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
def train(config):
    train_config = config['train']
    global device
    device = train_config['device']
    if not torch.cuda.is_available():
        device = 'cpu'
    tqdm.write('Training on {}'.format(device))
    writer = SummaryWriter('log')

    train_dataset, test_dataset = create_datasets(**config['dataset'])
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_config['batch_size'],
                                  shuffle=True,
                                  collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=train_config['batch_size'],
                                 shuffle=False,
                                 collate_fn=collate_fn)

    encoder = Encoder(vocab_size=len(train_dataset.lang1), **config['encoder'],
                      device=device).to(device)
    decoder = Decoder(vocab_size=len(train_dataset.lang2),
                      **config['decoder']).to(device)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=train_config['lr'])
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=train_config['lr'])
    criterion = nn.NLLLoss()

    tqdm.write('[-] Start training!')
    epoch_bar = tqdm(range(train_config['n_epochs']), desc='[Total progress]',
                     leave=True, position=0, dynamic_ncols=True)
    for epoch in epoch_bar:
        batch_bar = tqdm(range(len(train_dataloader)),
                         desc='[Train epoch {:2}]'.format(epoch),
                         leave=True, position=0, dynamic_ncols=True)
        encoder.train()
        decoder.train()
        train_loss = 0
        # Create the iterator once per epoch; calling iter() inside the loop
        # would restart the dataloader at every step and never advance.
        train_iter = iter(train_dataloader)
        for batch in batch_bar:
            source, target_bos, target_eos = next(train_iter)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            source, target_bos, target_eos = (source.to(device),
                                              target_bos.to(device),
                                              target_eos.to(device))
            encoder_output, encoder_hidden = encoder(source)
            decoder_output = decoder(target_bos, encoder_hidden)
            loss = criterion(decoder_output.view(-1, decoder_output.size(-1)),
                             target_eos.view(-1))
            train_loss += loss.item()
            n_hit, n_total = hitRate(decoder_output, target_eos)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()
            batch_bar.set_description(
                '[Train epoch {:2} | Loss: {:.2f} | Hit: {}/{}]'.format(
                    epoch, loss, n_hit, n_total))
        train_loss /= len(train_dataloader)

        batch_bar = tqdm(range(len(test_dataloader)),
                         desc='[Test epoch {:2}]'.format(epoch),
                         leave=True, position=0, dynamic_ncols=True)
        encoder.eval()
        decoder.eval()
        test_loss = 0
        test_iter = iter(test_dataloader)
        for batch in batch_bar:
            source, target_bos, target_eos = next(test_iter)
            source, target_bos, target_eos = (source.to(device),
                                              target_bos.to(device),
                                              target_eos.to(device))
            with torch.no_grad():
                encoder_output, encoder_hidden = encoder(source)
                decoder_output = decoder(target_bos, encoder_hidden)
                loss = criterion(decoder_output.view(-1, decoder_output.size(-1)),
                                 target_eos.view(-1))
            test_loss += loss.item()
            n_hit, n_total = hitRate(decoder_output, target_eos)
            batch_bar.set_description(
                '[Test epoch {:2} | Loss: {:.2f} | Hit: {}/{}]'.format(
                    epoch, loss, n_hit, n_total))
        test_loss /= len(test_dataloader)

        writer.add_scalars('Loss', {'train': train_loss, 'test': test_loss}, epoch)
        sample(test_dataset, encoder, decoder)
    tqdm.write('[-] Done!')
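# `hitRate` is not defined in this file. A plausible definition, counting
# exact token matches over non-padded positions (pad index 0 is an assumption):
def hitRate(decoder_output, target, pad_idx=0):
    pred = decoder_output.argmax(dim=-1)
    mask = target != pad_idx
    n_hit = ((pred == target) & mask).sum().item()
    n_total = mask.sum().item()
    return n_hit, n_total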
class Solver(object):
    def __init__(self, hps, data_loader, log_dir='./log/'):
        self.hps = hps
        self.data_loader = data_loader
        self.model_kept = []
        self.max_keep = 20
        self.build_model()
        self.logger = Logger(log_dir)

    def build_model(self):
        hps = self.hps
        ns = self.hps.ns
        emb_size = self.hps.emb_size
        self.Encoder = Encoder(ns=ns, dp=hps.enc_dp)
        self.Decoder = Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size)
        self.Generator = Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size)
        self.LatentDiscriminator = LatentDiscriminator(ns=ns, dp=hps.dis_dp)
        self.PatchDiscriminator = PatchDiscriminator(ns=ns, n_class=hps.n_speakers)
        if torch.cuda.is_available():
            self.Encoder.cuda()
            self.Decoder.cuda()
            self.Generator.cuda()
            self.LatentDiscriminator.cuda()
            self.PatchDiscriminator.cuda()
        betas = (0.5, 0.9)
        params = list(self.Encoder.parameters()) + list(self.Decoder.parameters())
        self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
        self.gen_opt = optim.Adam(self.Generator.parameters(), lr=self.hps.lr,
                                  betas=betas)
        self.lat_opt = optim.Adam(self.LatentDiscriminator.parameters(),
                                  lr=self.hps.lr, betas=betas)
        self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                    lr=self.hps.lr, betas=betas)
        # train() below steps self.decoder_opt for the decoder-only patch
        # update; the original snippet referenced it without defining it, so a
        # decoder-only optimizer is assumed here.
        self.decoder_opt = optim.Adam(self.Decoder.parameters(), lr=self.hps.lr,
                                      betas=betas)

    def save_model(self, model_path, iteration, enc_only=True):
        if not enc_only:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
                'latent_discriminator': self.LatentDiscriminator.state_dict(),
                'patch_discriminator': self.PatchDiscriminator.state_dict(),
            }
        else:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
            }
        new_model_path = '{}-{}'.format(model_path, iteration)
        with open(new_model_path, 'wb') as f_out:
            torch.save(all_model, f_out)
        self.model_kept.append(new_model_path)
        if len(self.model_kept) >= self.max_keep:
            os.remove(self.model_kept[0])
            self.model_kept.pop(0)

    def load_model(self, model_path, enc_only=True):
        print('load model from {}'.format(model_path))
        with open(model_path, 'rb') as f_in:
            all_model = torch.load(f_in)
        self.Encoder.load_state_dict(all_model['encoder'])
        self.Decoder.load_state_dict(all_model['decoder'])
        # self.Generator.load_state_dict(all_model['generator'])
        if not enc_only:
            self.LatentDiscriminator.load_state_dict(
                all_model['latent_discriminator'])
            self.PatchDiscriminator.load_state_dict(
                all_model['patch_discriminator'])

    def set_eval(self):
        self.Encoder.eval()
        self.Decoder.eval()
        self.Generator.eval()
        # self.LatentDiscriminator.eval()

    def test_step(self, x, c):
        self.set_eval()
        x = to_var(x).permute(0, 2, 1)
        enc = self.Encoder(x)
        x_tilde = self.Decoder(enc, c)
        return x_tilde.data.cpu().numpy()

    def permute_data(self, data):
        C = [to_var(c, requires_grad=False) for c in data[:2]]
        X = [to_var(x).permute(0, 2, 1) for x in data[2:]]
        return C, X

    def sample_c(self, size):
        c_sample = Variable(torch.multinomial(torch.ones(8), num_samples=size,
                                              replacement=True),
                            requires_grad=False)
        c_sample = c_sample.cuda() if torch.cuda.is_available() else c_sample
        return c_sample

    def cal_acc(self, logits, y_true):
        _, ind = torch.max(logits, dim=1)
        acc = torch.sum((ind == y_true).type(torch.FloatTensor)) / y_true.size(0)
        return acc

    def encode_step(self, *args):
        enc_list = []
        for x in args:
            enc = self.Encoder(x)
            enc_list.append(enc)
        return tuple(enc_list)

    def decode_step(self, enc, c):
        x_tilde = self.Decoder(enc, c)
        return x_tilde

    def latent_discriminate_step(self, enc_i_t, enc_i_tk, enc_i_prime, enc_j,
                                 is_dis=True):
        same_pair = torch.cat([enc_i_t, enc_i_tk], dim=1)
        diff_pair = torch.cat([enc_i_prime, enc_j], dim=1)
        if is_dis:
            same_val = self.LatentDiscriminator(same_pair)
            diff_val = self.LatentDiscriminator(diff_pair)
            w_dis = torch.mean(same_val - diff_val)
            gp = calculate_gradients_penalty(self.LatentDiscriminator,
                                             same_pair, diff_pair)
            return w_dis, gp
        else:
            diff_val = self.LatentDiscriminator(diff_pair)
            loss_adv = -torch.mean(diff_val)
            return loss_adv

    def patch_discriminate_step(self, x, x_tilde, cal_gp=True):
        # w-distance
        D_real, real_logits = self.PatchDiscriminator(x, classify=True)
        D_fake, fake_logits = self.PatchDiscriminator(x_tilde, classify=True)
        w_dis = torch.mean(D_real - D_fake)
        if cal_gp:
            gp = calculate_gradients_penalty(self.PatchDiscriminator, x, x_tilde)
            return w_dis, real_logits, fake_logits, gp
        else:
            return w_dis, real_logits, fake_logits

    def classify_step(self, real_logits, fake_logits, c, c_sample):
        # Auxiliary classification loss and accuracies. Restored from the
        # commented-out "backup" block in the original snippet, since train()
        # uses c_loss/real_acc/fake_acc without computing them.
        criterion = nn.NLLLoss()
        c_loss = criterion(real_logits, c) + criterion(fake_logits, c_sample)
        real_acc = self.cal_acc(real_logits, c)
        fake_acc = self.cal_acc(fake_logits, c_sample)
        return c_loss, real_acc, fake_acc

    def train(self, model_path, flag='train'):
        # load hyperparams
        hps = self.hps
        for iteration in range(hps.iters):
            # calculate current alpha
            if iteration + 1 < hps.lat_sched_iters and iteration >= hps.enc_pretrain_iters:
                current_alpha = hps.alpha_enc * (
                    iteration + 1 - hps.enc_pretrain_iters) / (
                    hps.lat_sched_iters - hps.enc_pretrain_iters)
            else:
                current_alpha = 0
            if iteration >= hps.enc_pretrain_iters:
                n_latent_steps = hps.n_latent_steps \
                    if iteration > hps.enc_pretrain_iters else hps.dis_pretrain_iters
                for step in range(n_latent_steps):
                    # ===================== Train latent discriminator =====================#
                    data = next(self.data_loader)
                    (c_i, c_j), (x_i_t, x_i_tk, x_i_prime, x_j) = self.permute_data(data)
                    # encode
                    enc_i_t, enc_i_tk, enc_i_prime, enc_j = self.encode_step(
                        x_i_t, x_i_tk, x_i_prime, x_j)
                    # latent discriminate
                    latent_w_dis, latent_gp = self.latent_discriminate_step(
                        enc_i_t, enc_i_tk, enc_i_prime, enc_j)
                    lat_loss = -hps.alpha_dis * latent_w_dis + hps.lambda_ * latent_gp
                    reset_grad([self.LatentDiscriminator])
                    lat_loss.backward()
                    grad_clip([self.LatentDiscriminator], self.hps.max_grad_norm)
                    self.lat_opt.step()
                    # print info
                    info = {
                        f'{flag}/D_latent_w_dis': latent_w_dis.data[0],
                        f'{flag}/latent_gp': latent_gp.data[0],
                    }
                    slot_value = (step, iteration + 1, hps.iters) + \
                        tuple([value for value in info.values()])
                    log = 'lat_D-%d:[%06d/%06d], w_dis=%.3f, gp=%.2f'
                    print(log % slot_value)
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration)
            # two stage training
            if iteration >= hps.patch_start_iter:
                for step in range(hps.n_patch_steps):
                    # ===================== Train patch discriminator =====================#
                    data = next(self.data_loader)
                    (c_i, _), (x_i_t, _, _, _) = self.permute_data(data)
                    # encode
                    enc_i_t, = self.encode_step(x_i_t)
                    c_sample = self.sample_c(x_i_t.size(0))
                    x_tilde = self.decode_step(enc_i_t, c_i)
                    # Aux classify loss
                    patch_w_dis, real_logits, fake_logits, patch_gp = \
                        self.patch_discriminate_step(x_i_t, x_tilde, cal_gp=True)
                    c_loss, real_acc, fake_acc = self.classify_step(
                        real_logits, fake_logits, c_i, c_sample)
                    patch_loss = -hps.beta_dis * patch_w_dis + \
                        hps.lambda_ * patch_gp + hps.beta_clf * c_loss
                    reset_grad([self.PatchDiscriminator])
                    patch_loss.backward()
                    grad_clip([self.PatchDiscriminator], self.hps.max_grad_norm)
                    self.patch_opt.step()
                    # print info
                    info = {
                        f'{flag}/D_patch_w_dis': patch_w_dis.data[0],
                        f'{flag}/patch_gp': patch_gp.data[0],
                        f'{flag}/c_loss': c_loss.data[0],
                        f'{flag}/real_acc': real_acc,
                        f'{flag}/fake_acc': fake_acc,
                    }
                    slot_value = (step, iteration + 1, hps.iters) + \
                        tuple([value for value in info.values()])
                    log = 'patch_D-%d:[%06d/%06d], w_dis=%.3f, gp=%.2f, c_loss=%.3f, real_acc=%.2f, fake_acc=%.2f'
                    print(log % slot_value)
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration)
            # ===================== Train G =====================#
            data = next(self.data_loader)
            (c_i, c_j), (x_i_t, x_i_tk, x_i_prime, x_j) = self.permute_data(data)
            # encode
            enc_i_t, enc_i_tk, enc_i_prime, enc_j = self.encode_step(
                x_i_t, x_i_tk, x_i_prime, x_j)
            # decode
            x_tilde = self.decode_step(enc_i_t, c_i)
            loss_rec = torch.mean(torch.abs(x_tilde - x_i_t))
            # latent discriminate
            loss_adv = self.latent_discriminate_step(
                enc_i_t, enc_i_tk, enc_i_prime, enc_j, is_dis=False)
            ae_loss = loss_rec + current_alpha * loss_adv
            reset_grad([self.Encoder, self.Decoder])
            retain_graph = True if hps.n_patch_steps > 0 else False
            ae_loss.backward(retain_graph=retain_graph)
            grad_clip([self.Encoder, self.Decoder], self.hps.max_grad_norm)
            self.ae_opt.step()
            info = {
                f'{flag}/loss_rec': loss_rec.data[0],
                f'{flag}/loss_adv': loss_adv.data[0],
                f'{flag}/alpha': current_alpha,
            }
            slot_value = (iteration + 1, hps.iters) + tuple(
                [value for value in info.values()])
            log = 'G:[%06d/%06d], loss_rec=%.2f, loss_adv=%.2f, alpha=%.2e'
            print(log % slot_value)
            for tag, value in info.items():
                self.logger.scalar_summary(tag, value, iteration + 1)
            # patch discriminate
            if hps.n_patch_steps > 0 and iteration >= hps.patch_start_iter:
                c_sample = self.sample_c(x_i_t.size(0))
                x_tilde = self.decode_step(enc_i_t, c_sample)
                patch_w_dis, real_logits, fake_logits = \
                    self.patch_discriminate_step(x_i_t, x_tilde, cal_gp=False)
                c_loss, real_acc, fake_acc = self.classify_step(
                    real_logits, fake_logits, c_i, c_sample)
                patch_loss = hps.beta_dec * patch_w_dis + hps.beta_clf * c_loss
                reset_grad([self.Decoder])
                patch_loss.backward()
                grad_clip([self.Decoder], self.hps.max_grad_norm)
                self.decoder_opt.step()
                info = {
                    f'{flag}/G_patch_w_dis': patch_w_dis.data[0],
                    f'{flag}/c_loss': c_loss.data[0],
                    f'{flag}/real_acc': real_acc,
                    f'{flag}/fake_acc': fake_acc,
                }
                slot_value = (iteration + 1, hps.iters) + tuple(
                    [value for value in info.values()])
                log = 'G:[%06d/%06d]: patch_w_dis=%.2f, c_loss=%.2f, real_acc=%.2f, fake_acc=%.2f'
                print(log % slot_value)
                for tag, value in info.items():
                    self.logger.scalar_summary(tag, value, iteration + 1)
            if iteration % 1000 == 0 or iteration + 1 == hps.iters:
                self.save_model(model_path, iteration)
import torch
from torchvision import transforms, datasets, models
from torchvision.utils import save_image

from model import Encoder, Decoder

input_nc = 1
output_nc = 1

enc = Encoder(input_nc, output_nc)
dec = Decoder(input_nc, output_nc)
enc.load_state_dict(torch.load("pretrained/enc.pth"))
dec.load_state_dict(torch.load("pretrained/dec.pth"))
enc.eval()
dec.eval()

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize(mean=[0.5], std=[0.5])])
dataset = datasets.MNIST(root='./data', transform=transform, download=True,
                         train=False)
dataset_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=96,
    shuffle=True,
)

for i, (image, _) in enumerate(dataset_loader):
    with torch.no_grad():
        encoded = enc(image)
        # The original loop stopped after encoding; decoding and saving the
        # reconstruction is the natural continuation given the decoder and
        # save_image imports above (the output filename is an assumption).
        decoded = dec(encoded)
    save_image(decoded * 0.5 + 0.5,  # undo the Normalize(0.5, 0.5) above
               'recon_{:03d}.png'.format(i))
def main(args):
    # constant definition
    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    a_dim = 512
    h_dim = 512
    attn_dim = 512
    embed_dim = 512
    regularize_constant = 1.  # lambda * L => lambda = 1/L

    vocabulary = torch.load(args.voca_path)
    vocab_size = len(vocabulary)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder().to(device)
    decoder = Decoder(a_dim, h_dim, attn_dim, vocab_size, embed_dim).to(device)
    # We do not train the encoder
    encoder.eval()

    if not args.test:
        # train
        validation_term = 1
        best_bleu = 0.
        num_of_epochs_since_improvement = 0
        early_stop_criterion = 20

        train_loader = get_train_data_loader(args.path, args.token_path,
                                             args.voca_path, args.batch_size,
                                             pad_idx)
        valid_loader = get_test_data_loader(args.path, args.token_path,
                                            args.voca_path, args.batch_size,
                                            pad_idx, dataset_type='valid')
        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
        optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)

        print('Start training ...')
        for epoch in range(args.epochs):
            # early stopping
            if num_of_epochs_since_improvement > early_stop_criterion:
                print("There's no improvement on BLEU score for %d epochs"
                      % (num_of_epochs_since_improvement))
                print("Stop Training")
                break
            start_epoch = time.time()
            i = 0

            # training
            decoder.train()
            for src_batch, trg_batch in train_loader:
                batch_start = time.time()
                src_batch = src_batch.to(device)
                trg_batch = torch.tensor(trg_batch).to(device)
                trg_input = trg_batch[:, :-1]
                trg_output = trg_batch[:, 1:].contiguous().view(-1)

                a = encoder(src_batch)
                preds, alphas = decoder(a, trg_input)  # [batch, C, vocab_size], [batch, C, L]

                optimizer.zero_grad()
                loss = criterion(preds.view(-1, preds.size(-1)), trg_output)  # NLL loss
                # Doubly stochastic attention regularization: encourage the
                # attention weights at each location to sum to one over time.
                regularize_term = regularize_constant * (
                    (1. - torch.sum(alphas, dim=1)) ** 2).mean()
                total_loss = loss + regularize_term
                total_loss.backward()
                optimizer.step()
                i = i + 1

                # flush the GPU cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                batch_time = time.time() - batch_start
                print('[%d/%d][%d/%d] train loss : %.4f (%.4f / %.4f) | time : %.2fs'
                      % (epoch + 1, args.epochs, i,
                         train_loader.size // args.batch_size + 1,
                         total_loss.item(), loss.item(),
                         regularize_term.item(), batch_time))
            epoch_time = time.time() - start_epoch
            print('Time taken for %d epoch : %.2fs' % (epoch + 1, epoch_time))

            # validation
            if i % validation_term == 0:
                decoder.eval()
                j = 0
                pred, ref = [], []
                for src_batch, trg_batch in valid_loader:
                    start = time.time()
                    batch_size = src_batch.size(0)
                    src_batch = src_batch.to(device)  # [batch, 3, 244, 244]
                    trg_batch = torch.tensor(trg_batch).to(device)  # [batch * 5, C]
                    trg_batch = torch.split(trg_batch, 5)
                    batches = []
                    for k in range(batch_size):
                        batches.append(trg_batch[k].unsqueeze(0))
                    trg_batch = torch.cat(batches, dim=0)  # [batch, 5, C]
                    max_length = trg_batch.size(-1)
                    pred_batch = torch.zeros(batch_size, 1, dtype=int).to(device)  # [[0],[0],...,[0]]
                    # eos_mask[i] = 1 means the i-th sentence already has <eos>
                    eos_mask = torch.zeros(batch_size, dtype=int)

                    a = encoder(src_batch)
                    for _ in range(max_length):
                        output, _ = decoder(a, pred_batch)  # [batch, _+1, vocab_size]
                        # greedy search
                        output = torch.argmax(F.softmax(output, dim=-1), dim=-1)  # [batch_size, _+1]
                        predictions = output[:, -1].unsqueeze(1)
                        pred_batch = torch.cat([pred_batch, predictions], dim=-1)
                        for l in range(batch_size):
                            if predictions[l] == eos_idx:
                                eos_mask[l] = 1
                        # every sentence has <eos>
                        if eos_mask.sum() == batch_size:
                            break

                    # flush the GPU cache
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    pred += seq2sen(pred_batch.cpu().numpy().tolist(), vocabulary)
                    for m in range(batch_size):
                        ref += [seq2sen(trg_batch[m].cpu().numpy().tolist(),
                                        vocabulary)]

                    t = time.time() - start
                    j += 1
                    print("[%d/%d] prediction done | time : %.2fs"
                          % (j, valid_loader.size // args.batch_size + 1, t))

                bleu_1 = corpus_bleu(ref, pred, weights=(1. / 1.,)) * 100
                bleu_2 = corpus_bleu(ref, pred, weights=(1. / 2., 1. / 2.)) * 100
                bleu_3 = corpus_bleu(ref, pred,
                                     weights=(1. / 3., 1. / 3., 1. / 3.)) * 100
                bleu_4 = corpus_bleu(ref, pred,
                                     weights=(1. / 4., 1. / 4., 1. / 4., 1. / 4.)) * 100
                print(f'BLEU-1: {bleu_1:.2f}')
                print(f'BLEU-2: {bleu_2:.2f}')
                print(f'BLEU-3: {bleu_3:.2f}')
                print(f'BLEU-4: {bleu_4:.2f}')

                if bleu_1 > best_bleu:
                    num_of_epochs_since_improvement = 0
                    best_bleu = bleu_1
                    print('Best BLEU-1 has been updated : %.2f' % (best_bleu))
                    save_checkpoint(decoder, 'checkpoints/best')
                else:
                    num_of_epochs_since_improvement += validation_term
                    print("There's no improvement on BLEU score for %d epochs"
                          % (num_of_epochs_since_improvement))

        print('End of the training')
    else:
        if os.path.exists(args.checkpoint):
            decoder_checkpoint = torch.load(args.checkpoint)
            decoder.load_state_dict(decoder_checkpoint['state_dict'])
            print("trained decoder " + args.checkpoint + " is loaded")
        decoder.eval()

        # test
        test_loader = get_test_data_loader(args.path, args.token_path,
                                           args.voca_path, args.batch_size,
                                           pad_idx)
        j = 0
        pred, ref = [], []
        for src_batch, trg_batch in test_loader:
            # Predict pred_batch from src_batch with the model.
            # Every sentence in pred_batch should start with the <sos> token
            # (index: 0) and end with the <eos> token (index: 1).
            # Every <pad> token (index: 2) should be located after <eos>.
            # example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            start = time.time()
            batch_size = src_batch.size(0)
            src_batch = src_batch.to(device)  # [batch, 3, 244, 244]
            trg_batch = torch.tensor(trg_batch).to(device)  # [batch * 5, C]
            trg_batch = torch.split(trg_batch, 5)
            batches = []
            for k in range(batch_size):
                batches.append(trg_batch[k].unsqueeze(0))
            trg_batch = torch.cat(batches, dim=0)  # [batch, 5, C]
            max_length = trg_batch.size(-1)
            pred_batch = torch.zeros(batch_size, 1, dtype=int).to(device)
            # eos_mask[i] = 1 means the i-th sentence already has <eos>
            eos_mask = torch.zeros(batch_size, dtype=int)

            a = encoder(src_batch)
            for _ in range(max_length):
                output, _ = decoder(a, pred_batch)  # [batch, _+1, vocab_size]
                # greedy search
                output = torch.argmax(F.softmax(output, dim=-1), dim=-1)
                predictions = output[:, -1].unsqueeze(1)
                pred_batch = torch.cat([pred_batch, predictions], dim=-1)
                for l in range(batch_size):
                    if predictions[l] == eos_idx:
                        eos_mask[l] = 1
                if eos_mask.sum() == batch_size:
                    break

            # flush the GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            pred += seq2sen(pred_batch.cpu().numpy().tolist(), vocabulary)
            for m in range(batch_size):
                ref += [seq2sen(trg_batch[m].cpu().numpy().tolist(), vocabulary)]

            t = time.time() - start
            j += 1
            print("[%d/%d] prediction done | time : %.2fs"
                  % (j, test_loader.size // args.batch_size + 1, t))

        bleu_1 = corpus_bleu(ref, pred, weights=(1. / 1.,)) * 100
        bleu_2 = corpus_bleu(ref, pred, weights=(1. / 2., 1. / 2.)) * 100
        bleu_3 = corpus_bleu(ref, pred, weights=(1. / 3., 1. / 3., 1. / 3.)) * 100
        bleu_4 = corpus_bleu(ref, pred,
                             weights=(1. / 4., 1. / 4., 1. / 4., 1. / 4.)) * 100
        print(f'BLEU-1: {bleu_1:.2f}')
        print(f'BLEU-2: {bleu_2:.2f}')
        print(f'BLEU-3: {bleu_3:.2f}')
        print(f'BLEU-4: {bleu_4:.2f}')

        with open('results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))
        with open('results/ref.txt', 'w') as f:
            for lines in ref:
                for line in lines:
                    f.write('{}\n'.format(line))
                f.write('_' * 50 + '\n')
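# `seq2sen` converts index sequences back into sentences. A minimal sketch,
# assuming `vocab` supports index lookup and the special indices used above
# (<sos>=0, <eos>=1, <pad>=2):
def seq2sen(batch, vocab):
    sentences = []
    for seq in batch:
        words = []
        for idx in seq[1:]:          # drop <sos>
            if idx == 1:             # stop at <eos>
                break
            if idx != 2:             # skip <pad>
                words.append(vocab[idx])
        sentences.append(' '.join(words))
    return sentences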
def main():
    # Praise argparser!
    parser = argparse.ArgumentParser(
        description="Inference script for performing joint tasks on ATIS datasets.")
    parser.add_argument("--train_path", type=str, help="path of train dataset.")
    parser.add_argument("--test_path", type=str, help="path of test dataset.")
    parser.add_argument("--model_dir", type=str, default="./models/",
                        help='path for saved trained models.')
    parser.add_argument('--max_length', type=int, default=60,
                        help='max sequence length')
    parser.add_argument('--embedding_size', type=int, default=100,
                        help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size', type=int, default=50,
                        help='dimension of lstm hidden states')
    args = parser.parse_args()

    # Load data
    print("Loading data...")
    _, word2index, tag2index, intent2index = preprocessing(args.train_path,
                                                           args.max_length)
    index2tag = {v: k for k, v in tag2index.items()}
    index2intent = {v: k for k, v in intent2index.items()}

    # Load model
    print("Loading model...")
    encoder = Encoder(len(word2index), args.embedding_size, args.hidden_size)
    decoder = Decoder(len(tag2index), len(intent2index), len(tag2index) // 3,
                      args.hidden_size * 2)
    encoder.load_state_dict(
        torch.load(os.path.join(args.model_dir, 'jointnlu-encoder.pkl'),
                   map_location=None if USE_CUDA else "cpu"))
    decoder.load_state_dict(
        torch.load(os.path.join(args.model_dir, 'jointnlu-decoder.pkl'),
                   map_location=None if USE_CUDA else "cpu"))
    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # Switch to evaluation mode
    encoder.eval()
    decoder.eval()

    # Preprocess test data
    test = open(args.test_path, "r").readlines()
    test = [t[:-1] for t in test]
    test = [[t.split("\t")[0].split(" "),
             t.split("\t")[1].split(" ")[:-1],
             t.split("\t")[1].split(" ")[-1]] for t in test]
    # Note: multiple "#"-joined intent labels are split here and only the
    # first one is kept. This could lower the error rate.
    test = [[t[0][1:-1], t[1][1:], t[2].split("#")[0]] for t in test]

    slot_f1 = []
    intent_err = []

    # Test cases.
    for index in range(len(test)):
        test_raw = test[index][0]
        test_in = prepare_sequence(test_raw, word2index).to(
            "cuda" if USE_CUDA else "cpu")
        # The mask marks padding positions (index 0).
        test_mask = Variable(
            torch.BoolTensor(tuple(map(lambda s: s == 0, test_in.data))))
        start_decode = Variable(
            torch.LongTensor([[word2index['<SOS>']] * 1])).transpose(1, 0)
        if USE_CUDA:
            test_mask = test_mask.cuda()
            start_decode = start_decode.cuda()

        output, hidden_c = encoder(test_in.unsqueeze(0), test_mask.unsqueeze(0))
        tag_score, intent_score = decoder(start_decode, hidden_c, output,
                                          test_mask)

        v, i = torch.max(tag_score, 1)
        slot_pred = list(map(lambda ii: index2tag[ii], i.data.tolist()))
        slot_gt = test[index][1]
        # Calculate f1_micro with sklearn. Pretty handy.
        slot_f1.append(f1_score(slot_gt, slot_pred, average="micro"))

        v, i = torch.max(intent_score, 1)
        intent_pred = index2intent[i.data.tolist()[0]]
        intent_gt = test[index][2]
        if intent_pred != intent_gt:
            intent_err.append([test[index][0], intent_gt, intent_pred])

        # Print our results.
        print("Input Sentence\t: ", *test[index][0])
        print("Truth\t\t: ", *slot_gt)
        print("Prediction\t: ", *slot_pred)
        print("Truth\t\t: ", intent_gt)
        print("Prediction\t: ", intent_pred)
        print()

    # Print out everything needed for the report.
    # print("Got slot err ", len(slot_err[0]))
    # print(*slot_err, sep="\n")
    print("Got intent err ", len(intent_err))
    print("--- BEGIN ERR PRINT ---")
    for case in intent_err:
        print("Input  : ", *case[0])
        print("Truth  : ", case[1])
        print("Predict: ", case[2])
        print()
    print("--- ENDOF ERR PRINT ---")
    print("Total ", len(test))
    print("Slot f1_micro avg %f" % np.average(slot_f1))
    print("Intent acc %f" % (1 - len(intent_err) / len(test)))
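# `prepare_sequence` comes from the training utilities. A typical definition,
# mapping tokens to indices with an unknown-word fallback (the "<UNK>" key
# name is an assumption):
def prepare_sequence(seq, to_index):
    idxs = [to_index.get(w, to_index.get("<UNK>", 0)) for w in seq]
    return torch.LongTensor(idxs)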
def main(args):
    device = Config.device
    print("PyTorch running with device {0}".format(device))

    if args.download:
        print("Downloading data")
        download_required_data()

    if args.lemmatize:
        caption_file = 'data/Flickr_Data/Flickr_TextData/Flickr8k.lemma.token.txt'
    else:
        caption_file = 'data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'

    print("Generating word2id")
    word2id = generate_word2id(caption_file)
    id2word = dict([(v, k) for k, v in word2id.items()])

    print("Loading Encoder and Decoder")
    encoder = Encoder(Config.encoded_size, Config.encoder_finetune)
    decoder = Decoder(Config.encoder_dim, Config.decoder_dim,
                      Config.attention_dim, Config.embed_dim,
                      vocab_size=len(word2id), dropout=Config.dropout,
                      embedding_finetune=Config.embedding_finetune)

    if args.model_path:
        print("Loading model from model_path")
        load_model(encoder, decoder, args.model_path)
    else:
        # no model path, so load pretrained embedding
        print("Generating embedding from pretrained embedding file")
        embedding = load_pretrained_embedding(
            'data/glove.6B.{}d.txt'.format(Config.embed_dim), word2id,
            Config.embed_dim)
        decoder.load_embedding(embedding)

    if not args.test:
        # train
        print("Loading DataLoader and Trainer")
        dloader = DataLoader(caption_file, 'data/Flickr_Data/Images')
        trainer = Trainer(encoder, decoder, dloader)
        print("Start Training")
        loss_history = trainer.train(Config.num_epochs)
        plt.plot(np.arange(len(loss_history)), loss_history, label='Loss')
        plt.legend()
        plt.show()
    else:
        # test
        assert args.image_path
        encoder.eval()
        decoder.eval()
        transform = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor()
        ])
        image = transform(Image.open(args.image_path))
        image = image.unsqueeze(0)

        # TODO
        # generate caption from an image
        encoder_output = encoder(image)
        captions, alphas = decoder.generate_caption_greedily(encoder_output)
        caption_in_word = ' '.join(list(map(id2word.get, captions[1:])))

        plt.imshow(image[0].numpy().transpose(1, 2, 0))
        plt.title(caption_in_word)
        plt.axis('off')
        plt.show()
        print(caption_in_word)
def main(args):
    src, tgt = load_data(args.path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    src_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    N = 6
    dim = 512

    # Model construction
    encoder = Encoder(N, dim, pad_idx, src_vocab_size, device).to(device)
    decoder = Decoder(N, dim, pad_idx, tgt_vocab_size, device).to(device)

    if args.model_load:
        ckpt = torch.load("drive/My Drive/checkpoint/best.ckpt")
        encoder.load_state_dict(ckpt["encoder"])
        decoder.load_state_dict(ckpt["decoder"])

    params = list(encoder.parameters()) + list(decoder.parameters())

    if not args.test:
        train_loader = get_loader(src['train'], tgt['train'], src_vocab, tgt_vocab,
                                  batch_size=args.batch_size, shuffle=True)
        valid_loader = get_loader(src['valid'], tgt['valid'], src_vocab, tgt_vocab,
                                  batch_size=args.batch_size)

        # "Noam" warmup schedule from the Transformer paper.
        warmup = 4000
        steps = 1
        lr = (dim ** -0.5) * min(steps ** -0.5, steps * (warmup ** -1.5))
        optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.98), eps=1e-09)

        train_losses = []
        val_losses = []
        best_val_loss = 1e08  # best validation loss seen so far

        start_epoch = 0
        if args.model_load:
            start_epoch = ckpt["epoch"]
            optimizer.load_state_dict(ckpt["optim"])
            # Rough estimate of the step count reached by start_epoch
            # (assumes roughly 30 optimizer steps per epoch).
            steps = start_epoch * 30

        for epoch in range(start_epoch, args.epochs):
            for src_batch, tgt_batch in train_loader:
                encoder.train()
                decoder.train()
                optimizer.zero_grad()

                tgt_batch = torch.LongTensor(tgt_batch)
                src_batch = torch.LongTensor(src_batch).to(device)
                gt = tgt_batch[:, 1:].to(device)          # targets exclude <sos>
                tgt_batch = tgt_batch[:, :-1].to(device)  # inputs exclude the last token

                enc_output, seq_mask = encoder(src_batch)
                dec_output = decoder(tgt_batch, enc_output, seq_mask)

                gt = gt.view(-1)
                dec_output = dec_output.view(gt.size()[0], -1)

                loss = F.cross_entropy(dec_output, gt, ignore_index=pad_idx)
                loss.backward()
                train_losses.append(loss.item())
                optimizer.step()

                steps += 1
                lr = (dim ** -0.5) * min(steps ** -0.5, steps * (warmup ** -1.5))
                update_lr(optimizer, lr)

                if steps % 10 == 0:
                    print("loss : %f" % loss.item())

            for src_batch, tgt_batch in valid_loader:
                encoder.eval()
                decoder.eval()

                src_batch = torch.LongTensor(src_batch).to(device)
                tgt_batch = torch.LongTensor(tgt_batch)
                gt = tgt_batch[:, 1:].to(device)
                tgt_batch = tgt_batch[:, :-1].to(device)

                enc_output, seq_mask = encoder(src_batch)
                dec_output = decoder(tgt_batch, enc_output, seq_mask)

                gt = gt.view(-1)
                dec_output = dec_output.view(gt.size()[0], -1)

                loss = F.cross_entropy(dec_output, gt, ignore_index=pad_idx)
                val_losses.append(loss.item())

            print("[EPOCH %d] Loss %f" % (epoch, loss.item()))

            if val_losses[-1] <= best_val_loss:
                checkpoint = {'encoder': encoder.state_dict(), 'decoder': decoder.state_dict(),
                              'optim': optimizer.state_dict(), 'epoch': epoch}
                torch.save(checkpoint, "drive/My Drive/checkpoint/best.ckpt")
                best_val_loss = val_losses[-1]

            if epoch % 20 == 0:
                plt.figure()
                plt.plot(val_losses)
                plt.xlabel("epoch")
                plt.ylabel("model loss")
                plt.show()
    else:  # test
        test_loader = get_loader(src['test'], tgt['test'], src_vocab, tgt_vocab,
                                 batch_size=args.batch_size)

        pred = []
        for src_batch, tgt_batch in test_loader:
            encoder.eval()
            decoder.eval()

            b_s = min(args.batch_size, len(src_batch))
            tgt_batch = torch.zeros(b_s, 1).to(device).long()  # start every sequence with <sos>
            src_batch = torch.LongTensor(src_batch).to(device)

            enc_output, seq_mask = encoder(src_batch)
            pred_batch = decoder(tgt_batch, enc_output, seq_mask)
            _, pred_batch = torch.max(pred_batch, 2)

            # Greedy decoding: re-run the decoder on the growing prefix until every
            # sequence has emitted <eos> or reached max_length.
            while not is_finished(pred_batch, max_length, eos_idx):
                next_input = torch.cat((tgt_batch, pred_batch.long()), 1)
                pred_batch = decoder(next_input, enc_output, seq_mask)
                _, pred_batch = torch.max(pred_batch, 2)

            # Every sentence in pred_batch should start with <sos> (index 0) and end
            # with <eos> (index 1); any <pad> (index 2) must come after <eos>, e.g.:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            pred_batch = pred_batch.tolist()
            for line in pred_batch:
                line[-1] = 1  # force-terminate sequences that hit max_length without <eos>
            pred += seq2sen(pred_batch, tgt_vocab)

        with open('results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        os.system('bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
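# --- Illustrative sketch (not part of the original script) ---
# The inline learning-rate update above is the "Noam" schedule from
# "Attention Is All You Need": lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5).
# A minimal helper, using d_model=512 and warmup=4000 as in the code above:
def noam_lr(step, d_model=512, warmup=4000):
    """Linear warmup for `warmup` steps, then inverse-square-root decay."""
    step = max(step, 1)
    return (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

# e.g. noam_lr(1) is about 1.7e-7, it peaks near 7.0e-4 at step 4000, then decays.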
def predict(region):
    np.random.seed(0)
    torch.manual_seed(0)

    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    force_teacher = 0.8

    train_dataset, test_dataset, train_max, train_min = create_dataset(input_len, predict_len, region)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len, encoder_rnn_layers, encoder_dropout)
    dec = Decoder(encoder_units * 2, decoder_units, input_len, input_len, decoder_dropout, output_size)
    enc.load_state_dict(torch.load(f"models/{region}_enc.pth"))
    dec.load_state_dict(torch.load(f"models/{region}_dec.pth"))

    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

    p = 4  # index of the prediction step to evaluate
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    for encoder_input, decoder_input, target in test_loader:
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, 0].item()]

    # Invert the min-max scaling; this mirrors the denormalization in train()
    # (the original applied the offset and scale in the wrong order here).
    predicted = np.array(predicted).reshape(1, -1)
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min

    rmse, pearsonr = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {pearsonr[0]}")

    predicted = predicted.reshape(-1)
    true_target = true_target.reshape(-1)
    x = list(range(len(predicted)))
    plt.plot(x, predicted)
    plt.plot(x, true_target)
    plt.show()
    return f"{region} RMSE {rmse} r {pearsonr[0]}"
def train(region):
    np.random.seed(0)
    torch.manual_seed(0)

    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    epochs = 500
    force_teacher = 0.8

    train_dataset, test_dataset, train_max, train_min = create_dataset(input_len, predict_len, region)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len, encoder_rnn_layers, encoder_dropout)
    dec = Decoder(encoder_units * 2, decoder_units, input_len, input_len, decoder_dropout, output_size)

    optimizer = AdaBound(list(enc.parameters()) + list(dec.parameters()), 0.01, final_lr=0.1)
    # optimizer = optim.Adam(list(enc.parameters()) + list(dec.parameters()), 0.01)
    criterion = nn.MSELoss()

    mb = master_bar(range(epochs))
    for ep in mb:
        train_loss = 0
        enc.train()
        dec.train()
        for encoder_input, decoder_input, target in progress_bar(train_loader, parent=mb):
            optimizer.zero_grad()
            enc_vec = enc(encoder_input)
            h = enc_vec[:, -1, :]
            _, c = dec.initHidden(batch_size)
            x = decoder_input[:, 0]
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                rand = np.random.random()
                pred += [x]
                # Teacher forcing: feed the ground truth instead of the model's
                # own output with probability force_teacher.
                if rand < force_teacher:
                    x = decoder_input[:, pi]
            pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        test_loss = 0
        enc.eval()
        dec.eval()
        for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
            with torch.no_grad():
                enc_vec = enc(encoder_input)
                h = enc_vec[:, -1, :]
                _, c = dec.initHidden(batch_size)
                x = decoder_input[:, 0]
                pred = []
                for pi in range(predict_len):
                    x, h, c = dec(x, h, c, enc_vec)
                    pred += [x]
                pred = torch.cat(pred, dim=1)
                # loss = quantile_loss(pred, target)
                loss = criterion(pred, target)
                test_loss += loss.item()

        print(f"Epoch {ep} Train Loss {train_loss/len(train_loader)} Test Loss {test_loss/len(test_loader)}")

    if not os.path.exists("models"):
        os.mkdir("models")
    torch.save(enc.state_dict(), f"models/{region}_enc.pth")
    torch.save(dec.state_dict(), f"models/{region}_dec.pth")

    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)
    p = 0  # evaluate the first prediction step
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, p].item()]

    # Invert the min-max scaling back to the original units.
    predicted = np.array(predicted).reshape(1, -1)
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min

    rmse, pearsonr = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {pearsonr[0]}")
    return f"{region} RMSE {rmse} r {pearsonr[0]}"
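# --- Illustrative sketch (not part of the original script) ---
# Both train() and predict() undo a min-max scaling; keeping the paired
# transforms together avoids the inverted-order bug fixed in predict() above.
# A minimal sketch, assuming values were scaled with the train-split min/max:
import numpy as np

def minmax_scale(x, lo, hi):
    return (x - lo) / (hi - lo)

def minmax_unscale(x, lo, hi):
    return x * (hi - lo) + lo

# Round-trip check:
# v = np.array([3.0, 7.5])
# assert np.allclose(minmax_unscale(minmax_scale(v, 0.0, 10.0), 0.0, 10.0), v)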
def main(params):
    try:
        output_dir = os.path.join(params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M"))
        os.makedirs(output_dir)
    except OSError:
        pass

    if torch.cuda.is_available() and not params['cuda']:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    writer = SummaryWriter(output_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    SOS_token = '<sos>'
    EOS_token = '<eos>'
    PAD_token = '<pad>'

    TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, lower=True,
                 batch_first=True, init_token=SOS_token, eos_token=EOS_token)
    IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True)

    fields = {
        'ans': ('ans', TEXT),
        'img_ind': ('img_ind', IMG_IND),
        'question': ('question', TEXT)
    }

    train, val = TabularDataset.splits(
        path=params['dataroot'],
        train=params['input_train'],
        validation=params['input_test'],
        format='csv',
        skip_header=False,
        fields=fields,
    )

    print("Train data")
    print(train[0].__dict__.keys())
    print(train[0].ans, train[0].img_ind, train[0].question)

    print("Validation data")
    print(val[0].__dict__.keys())
    print(val[0].ans, val[0].img_ind, val[0].question)

    print("Building Vocabulary ..")
    TEXT.build_vocab(train, vectors='glove.6B.100d')
    vocab = TEXT.vocab

    PAD_token_ind = vocab.stoi[PAD_token]
    SOS_token_ind = vocab.stoi[SOS_token]
    EOS_token_ind = vocab.stoi[EOS_token]

    print("Creating Embedding from vocab vectors ..")
    txt_embed = nn.Embedding.from_pretrained(vocab.vectors)
    print("Text Embeddings are generated of size ", txt_embed.weight.size())

    print("Loading Image embeddings ..")
    with open(params['image_embeddings'], 'rb') as f:
        img_embs = pkl.load(f)['image_features']
    img_embed = nn.Embedding.from_pretrained(torch.FloatTensor(img_embs))

    print("Creating Encoder_attn ..")
    encoder = Encoder_attn(img_embed, txt_embed, params)
    print(encoder)

    print("Creating Decoder ..")
    decoder = Decoder(txt_embed, params)
    print(decoder)

    criterion = torch.nn.PairwiseDistance(keepdim=False)
    criterion.to(device)
    encoder.to(device)
    decoder.to(device)

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=params['lr'],
                                         weight_decay=1e-5, amsgrad=True)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=params['lr'],
                                         weight_decay=1e-5, amsgrad=True)
    encoder_LR_scheduler = ReduceLROnPlateau(encoder_optimizer, 'min', patience=1)
    decoder_LR_scheduler = ReduceLROnPlateau(decoder_optimizer, 'min', patience=1)

    if params['use_checkpoint']:
        checkpoint = torch.load(params['enc_dec_model'])
        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder_optimizer.load_state_dict(checkpoint['encoder_optimizer_state_dict'])
        decoder_optimizer.load_state_dict(checkpoint['decoder_optimizer_state_dict'])
        encoder_LR_scheduler.load_state_dict(checkpoint['encoder_LR_scheduler'])
        decoder_LR_scheduler.load_state_dict(checkpoint['decoder_LR_scheduler'])

    for epoch in range(params['niter']):
        train_iter, val_iter = Iterator.splits(
            (train, val),
            batch_sizes=(params['batch_size'], params['batch_size']),
            sort=False, shuffle=True, device=device)

        for is_train in (True, False):
            print('Is Training: ', is_train)
            if is_train:
                encoder.train()
                decoder.train()
                data_iter = train_iter
            else:
                encoder.eval()
                decoder.eval()
                data_iter = val_iter

            total_loss = 0
            total_acc = 0
            with torch.set_grad_enabled(is_train):
                for i, row in enumerate(data_iter, 1):
                    if len(row) < params['batch_size']:
                        continue
                    encoder.zero_grad()
                    decoder.zero_grad()

                    ans, img_ind, question = row.ans, row.img_ind, row.question
                    batch_size = params['batch_size']
                    # target_length - 1 because the <sos> token is not predicted.
                    target_length = ans.shape[1] - 1

                    encoder.hidden = encoder.init_hidden(params)
                    ans = ans.to(device)
                    img_ind = img_ind.to(device)
                    question = question.to(device)
                    encoder.hidden = (encoder.hidden[0].to(device), encoder.hidden[1].to(device))

                    ans_embed = txt_embed(ans)
                    encoder_output = encoder(img_ind, question)

                    decoder_input = ans_embed[:, 0].reshape((batch_size, 1, -1))
                    ans_embed = ans_embed[:, 1:]  # drop the <sos> token
                    ans = ans[:, 1:]              # drop the <sos> token

                    decoder_hidden = decoder.init_hidden(encoder_output, params)
                    if params['cuda']:
                        decoder_hidden = (decoder_hidden[0].cuda(), decoder_hidden[1].cuda())

                    outputs = torch.zeros(batch_size, target_length, params['txt_emb_size'])
                    # Note: the loop stops at target_length - 1, so the final slot of
                    # `outputs` stays zero; padding positions are filtered out below.
                    for di in range(target_length - 1):
                        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                        # TODO(Jay): detach the input from history
                        decoder_input = decoder_output
                        outputs[:, di, :] = decoder_output.reshape(batch_size, -1)

                    filtered_labels, filtered_label_embeds, filtered_outputs = filterOutput(
                        outputs.reshape(batch_size * target_length, -1),
                        ans.reshape(batch_size * target_length, -1),
                        ans_embed.reshape(batch_size * target_length, -1),
                        PAD_token_ind)

                    filtered_label_embeds = filtered_label_embeds.to(device)
                    filtered_outputs = filtered_outputs.to(device)

                    batch_loss = maskedLoss(filtered_label_embeds, filtered_outputs, criterion)
                    batch_acc = word_accuracy(filtered_outputs, vocab.vectors.to(device), filtered_labels)
                    total_loss += batch_loss.item()
                    total_acc += batch_acc

                    if is_train:
                        if i % 1000 == 0:
                            print('[%d/%d][%d/%d] train_loss: %.4f, Accuracy: %.4f'
                                  % (epoch, params['niter'], i, len(data_iter),
                                     total_loss / i, total_acc / i))
                        batch_loss.backward()
                        encoder_optimizer.step()
                        decoder_optimizer.step()

            avg_loss = total_loss / len(data_iter)
            avg_acc = total_acc / len(data_iter)

            if is_train:
                PATH = os.path.join(output_dir, 'enc_dec_model.pth')
                torch.save({
                    'encoder_state_dict': encoder.state_dict(),
                    'decoder_state_dict': decoder.state_dict(),
                    'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                    'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                    'encoder_LR_scheduler': encoder_LR_scheduler.state_dict(),
                    'decoder_LR_scheduler': decoder_LR_scheduler.state_dict(),
                }, PATH)
                writer.add_scalars('data', {'train_loss': avg_loss, 'train_acc': avg_acc}, epoch)
            else:
                print('Calculating Validation loss')
                print('val_loss: %.4f, Accuracy: %.4f' % (avg_loss, avg_acc))
                encoder_LR_scheduler.step(avg_loss)
                decoder_LR_scheduler.step(avg_loss)
                writer.add_scalars('data', {'val_loss': avg_loss, 'val_acc': avg_acc}, epoch)

    writer.close()
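# --- Illustrative sketch (not part of the original script) ---
# filterOutput and maskedLoss are project helpers not shown here. The idea is to
# drop <pad> positions before scoring embedding-space predictions. A minimal
# sketch of the masking step, assuming `labels` holds token ids of shape (N, 1):
def mask_pad_positions(outputs, labels, label_embeds, pad_ind):
    # outputs/label_embeds: (N, emb_dim); labels: (N, 1) token ids.
    keep = (labels.squeeze(-1) != pad_ind)
    return labels[keep], label_embeds[keep], outputs[keep]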
def main():
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for dataset_name in options.dataset:
        results = {"best": {}, "mean": {}, "highest_prob": {}}
        for checkpoint_path in options.checkpoint:
            checkpoint_name, _ = os.path.splitext(os.path.basename(checkpoint_path))
            checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                          if checkpoint_path else default_checkpoint)
            encoder_checkpoint = checkpoint["model"].get("encoder")
            decoder_checkpoint = checkpoint["model"].get("decoder")

            test_set = test_sets[dataset_name]
            dataset = CrohmeDataset(test_set["groundtruth"], tokensfile,
                                    root=test_set["root"], transform=transformers)
            data_loader = DataLoader(dataset, batch_size=options.batch_size, shuffle=False,
                                     num_workers=options.num_workers, collate_fn=collate_batch)

            enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device)
            dec = Decoder(len(dataset.id_to_token), low_res_shape, high_res_shape,
                          checkpoint=decoder_checkpoint, device=device).to(device)
            enc.eval()
            dec.eval()

            result = evaluate(enc, dec, data_loader=data_loader, device=device,
                              checkpoint=checkpoint, beam_width=options.beam_width,
                              prefix=options.prefix)
            results["best"][checkpoint_name] = result["best"]
            results["mean"][checkpoint_name] = result["mean"]
            results["highest_prob"][checkpoint_name] = result["highest_prob"]

        highest_prob_err_table, highest_prob_correct_table = create_markdown_tables(results["highest_prob"])
        best_err_table, best_correct_table = create_markdown_tables(results["best"])
        mean_err_table, mean_correct_table = create_markdown_tables(results["mean"])

        print(("\n# Dataset {name}\n\n"
               "Beam width: {beam_width}\n\n"
               "## Highest Probability\n\n{highest_prob_err_table}\n\n"
               "{highest_prob_correct_table}\n\n"
               "## Best\n\n{best_err_table}\n\n{best_correct_table}\n\n"
               "## Mean\n\n{mean_err_table}\n\n{mean_correct_table}").format(
                   name=dataset_name,
                   beam_width=options.beam_width,
                   highest_prob_err_table=highest_prob_err_table,
                   highest_prob_correct_table=highest_prob_correct_table,
                   best_err_table=best_err_table,
                   best_correct_table=best_correct_table,
                   mean_err_table=mean_err_table,
                   mean_correct_table=mean_correct_table,
               ))
def DDF(cfg):
    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            # librosa.load returns the audio time series and its sampling rate.
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            path = out_dir / out_filename

            # Return the raw recording as a mel-spectrogram, without any filtering.
            if cfg.output_type == "Embedding":
                mel = librosa.feature.melspectrogram(
                    preemphasis(wav, cfg.preprocessing.preemph),
                    sr=cfg.preprocessing.sr,
                    n_fft=cfg.preprocessing.n_fft,
                    n_mels=cfg.preprocessing.n_mels,
                    hop_length=cfg.preprocessing.hop_length,
                    win_length=cfg.preprocessing.win_length,
                    fmin=cfg.preprocessing.fmin,
                    power=1)
                logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
                logmel = logmel / cfg.preprocessing.top_db + 1
                # Keep the array on the CPU; .numpy() fails on a CUDA tensor.
                mel = torch.FloatTensor(logmel).squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".mel.txt"), mel)

            # Return the raw recording as a waveform, without any filtering.
            if cfg.output_type == "Recording":
                librosa.output.write_wav(path.with_suffix(".wav"), wav.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Moderate":
        dataset_path = Path(utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    speaker = decoder.speaker(speaker)
                # Move tensors back to the CPU before converting to NumPy
                # (the original called .to(device).numpy(), which breaks on CUDA).
                vq = vq.squeeze().cpu().numpy()
                speaker = speaker.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
                np.savetxt(path.with_suffix(".speaker.txt"), speaker)

    # ---------------------------------------
    if cfg.privacy_preference == "High":
        dataset_path = Path(utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                vq = vq.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
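# --- Illustrative sketch (not part of the original script) ---
# The same log-mel extraction block appears in all three privacy branches above
# (and in convert()); a small helper would keep the parameters in one place.
# The function name is an assumption; the body mirrors the librosa calls above.
def extract_logmel(wav, cfg):
    mel = librosa.feature.melspectrogram(
        preemphasis(wav, cfg.preprocessing.preemph),
        sr=cfg.preprocessing.sr,
        n_fft=cfg.preprocessing.n_fft,
        n_mels=cfg.preprocessing.n_mels,
        hop_length=cfg.preprocessing.hop_length,
        win_length=cfg.preprocessing.win_length,
        fmin=cfg.preprocessing.fmin,
        power=1)
    logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
    return logmel / cfg.preprocessing.top_db + 1  # scaled to roughly [0, 1]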
        swa = SWA(number_swa_models=number_swa_models)
        scheduler_decoder.curr_iter = iterations
        if finetune_encoder:
            scheduler_encoder.curr_iter = iterations
        print(scheduler_decoder.get_lr()[0])
        print("SWA decoder curr lr", scheduler_decoder.print_lr()[0])
    else:
        swa = SWA(number_swa_models=0)
        print("# of SWA models 0")
else:
    swa = SWA(number_swa_models=0)
    print("# of SWA models 0")

encoder.eval()
decoder.eval()
if swa_params:
    encoder_swa.eval()
    decoder_swa.eval()

criterion = nn.CrossEntropyLoss()
dataset = COCOMultiLabel(train=True, classification=False, image_path=args.image_path,
                         sort_by_freq=args.sort_by_freq)
dataset_val = COCOMultiLabel(train=False, classification=False, image_path=args.image_path,
                             sort_by_freq=args.sort_by_freq)
dataloader = DataLoader(dataset,
class FNM(object):
    def __init__(self, args):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_id

        self.batch_size = args.batch_size
        self.lr = args.lr
        self.profile_list_path = args.profile_list
        self.front_list_path = args.front_list
        self.profile_path = args.profile_path
        self.front_path = args.front_path
        self.test_path = args.test_path
        self.test_list = args.test_list
        self.crop_size = args.ori_height
        self.image_size = args.height
        self.res_n = args.res_n
        self.is_finetune = args.is_finetune
        self.result_name = args.result_name
        self.summary_dir = args.summary_dir
        self.iteration = args.iteration
        self.weight_decay = args.weight_decay
        self.decay_flag = args.decay_flag
        self.print_freq = args.print_freq
        self.save_freq = args.save_freq
        self.img_size = args.width
        self.model_name = args.model_name

        # Hyper parameters
        self.lambda_l1 = args.lambda_l1
        self.lambda_fea = args.lambda_fea
        self.lambda_reg = args.lambda_reg
        self.lambda_gan = args.lambda_gan
        self.lambda_gp = args.lambda_gp
        self.channel = args.channel

        self.device = torch.device("cuda:{}".format(args.device_id))
        self.make_dirs()
        self.build_model()

        """Define Loss"""
        self.L1_loss = nn.L1Loss().to(self.device)
        self.L2_loss = nn.MSELoss().to(self.device)

    def make_dirs(self):
        check_folder(self.summary_dir)
        check_folder(os.path.join("results", self.result_name, "model"))
        check_folder(os.path.join("results", self.result_name, "img"))

    def build_model(self):
        self.expert_net = se50_net("./other_models/arcface_se50/model_ir_se50.pth").to(self.device)
        for param in self.expert_net.parameters():
            param.requires_grad = False

        self.front_loader = get_loader(self.front_list_path, self.front_path, self.crop_size,
                                       self.image_size, self.batch_size, mode="train", num_workers=8)
        self.profile_loader = get_loader(self.profile_list_path, self.profile_path, self.crop_size,
                                         self.image_size, self.batch_size, mode="train", num_workers=8)
        self.test_loader = get_loader(self.test_list, self.test_path, self.crop_size,
                                      self.image_size, self.batch_size, mode="test", num_workers=8)

        # Residual blocks operating on the [b, 512, 7, 7] expert features.
        resnet_block_list = [ResnetBlock(512, use_bias=False) for _ in range(self.res_n)]
        self.body = nn.Sequential(*resnet_block_list).to(self.device)
        self.decoder = Decoder().to(self.device)
        self.dis = Discriminator(self.channel).to(self.device)

        self.G_optim = torch.optim.Adam(itertools.chain(self.body.parameters(), self.decoder.parameters()),
                                        lr=self.lr, betas=(0.5, 0.999), weight_decay=self.weight_decay)
        self.D_optim = torch.optim.Adam(self.dis.parameters(), lr=self.lr, betas=(0.5, 0.999),
                                        weight_decay=self.weight_decay)
        self.downsample112x112 = nn.Upsample(size=(112, 112), mode='bilinear')

    def update_lr(self, start_iter):
        if self.decay_flag and start_iter > (self.iteration // 2):
            self.G_optim.param_groups[0]['lr'] -= (
                self.lr / (self.iteration // 2)) * (start_iter - self.iteration // 2)
            self.D_optim.param_groups[0]['lr'] -= (
                self.lr / (self.iteration // 2)) * (start_iter - self.iteration // 2)

    def train(self):
        self.body.train(), self.decoder.train(), self.dis.train()

        start_iter = 1
        if self.is_finetune:
            model_list = glob(os.path.join("results", self.result_name, "model", "*.pt"))
            if not len(model_list) == 0:
                model_list.sort()
                start_iter = int(model_list[-1].split('_')[-1].split('.')[0])
                self.load(os.path.join("results", self.result_name, 'model'), start_iter)
                print(" [*] Load SUCCESS")
            self.update_lr(start_iter)

        print("training start...")
        start_time = time.time()
        for step in range(start_iter, self.iteration + 1):
            self.update_lr(start_iter)

            # Fetch the next full batch, restarting the loader when it is
            # exhausted, not yet created, or returns a partial batch.
            try:
                front_224, front_112 = next(front_iter)
                if front_224.shape[0] != self.batch_size:
                    raise StopIteration
            except (NameError, StopIteration):
                front_iter = iter(self.front_loader)
                front_224, front_112 = next(front_iter)
            try:
                profile_224, profile_112 = next(profile_iter)
                if profile_224.shape[0] != self.batch_size:
                    raise StopIteration
            except (NameError, StopIteration):
                profile_iter = iter(self.profile_loader)
                profile_224, profile_112 = next(profile_iter)

            profile_224, front_224, profile_112, front_112 = (
                profile_224.to(self.device), front_224.to(self.device),
                profile_112.to(self.device), front_112.to(self.device))

            # Update D
            self.D_optim.zero_grad()
            feature_p = self.expert_net.get_feature(profile_112)
            feature_f = self.expert_net.get_feature(front_112)
            gen_p = self.decoder(self.body(feature_p))
            gen_f = self.decoder(self.body(feature_f))
            feature_gen_p = self.expert_net.get_feature(self.downsample112x112(gen_p))
            feature_gen_f = self.expert_net.get_feature(self.downsample112x112(gen_f))
            d_f = self.dis(front_224)
            d_gen_p = self.dis(gen_p)
            d_gen_f = self.dis(gen_f)

            D_adv_loss = torch.mean(tensor_tuple_sum(d_gen_f) * 0.5 +
                                    tensor_tuple_sum(d_gen_p) * 0.5 -
                                    tensor_tuple_sum(d_f)) / 5

            # WGAN-GP style gradient penalty on interpolated samples, averaged
            # over the discriminator's five outputs.
            alpha = torch.rand(gen_p.size(0), 1, 1, 1).to(self.device)
            inter = (alpha * front_224.data + (1 - alpha) * gen_p.data).requires_grad_(True)
            out_inter = self.dis(inter)
            gradient_penalty_loss = (gradient_penalty(out_inter[0], inter, self.device) +
                                     gradient_penalty(out_inter[1], inter, self.device) +
                                     gradient_penalty(out_inter[2], inter, self.device) +
                                     gradient_penalty(out_inter[3], inter, self.device) +
                                     gradient_penalty(out_inter[4], inter, self.device)) / 5

            d_loss = self.lambda_gan * D_adv_loss + self.lambda_gp * gradient_penalty_loss
            d_loss.backward(retain_graph=True)
            self.D_optim.step()

            # Update G (on a fresh pair of batches)
            self.G_optim.zero_grad()
            try:
                front_224, front_112 = next(front_iter)
                if front_224.shape[0] != self.batch_size:
                    raise StopIteration
            except (NameError, StopIteration):
                front_iter = iter(self.front_loader)
                front_224, front_112 = next(front_iter)
            try:
                profile_224, profile_112 = next(profile_iter)
                if profile_224.shape[0] != self.batch_size:
                    raise StopIteration
            except (NameError, StopIteration):
                profile_iter = iter(self.profile_loader)
                profile_224, profile_112 = next(profile_iter)

            profile_224, front_224, profile_112, front_112 = (
                profile_224.to(self.device), front_224.to(self.device),
                profile_112.to(self.device), front_112.to(self.device))

            feature_p = self.expert_net.get_feature(profile_112)
            feature_f = self.expert_net.get_feature(front_112)
            gen_p = self.decoder(self.body(feature_p))
            gen_f = self.decoder(self.body(feature_f))
            feature_gen_p = self.expert_net.get_feature(self.downsample112x112(gen_p))
            feature_gen_f = self.expert_net.get_feature(self.downsample112x112(gen_f))
            d_f = self.dis(front_224)
            d_gen_p = self.dis(gen_p)
            d_gen_f = self.dis(gen_f)

            # L1 reconstruction on the frontal branch.
            pixel_loss = torch.mean(self.L1_loss(front_224, gen_f))

            # Cosine-style identity-preservation loss in the expert feature space.
            feature_p_norm = l2_norm(feature_p)
            feature_f_norm = l2_norm(feature_f)
            feature_gen_p_norm = l2_norm(feature_gen_p)
            feature_gen_f_norm = l2_norm(feature_gen_f)
            perceptual_loss = torch.mean(
                0.5 * (1 - torch.sum(torch.mul(feature_p_norm, feature_gen_p_norm), dim=(1, 2, 3))) +
                0.5 * (1 - torch.sum(torch.mul(feature_f_norm, feature_gen_f_norm), dim=(1, 2, 3))))

            G_adv_loss = -torch.mean(tensor_tuple_sum(d_gen_f) * 0.5 +
                                     tensor_tuple_sum(d_gen_p) * 0.5) / 5

            g_loss = (self.lambda_gan * G_adv_loss + self.lambda_l1 * pixel_loss +
                      self.lambda_fea * perceptual_loss)
            g_loss.backward()
            self.G_optim.step()

            print("[%5d/%5d] time: %4.4f d_loss: %.8f, g_loss: %.8f"
                  % (step, self.iteration, time.time() - start_time, d_loss, g_loss))
            print("D_adv_loss : %.8f" % (self.lambda_gan * D_adv_loss))
            print("G_adv_loss : %.8f" % (self.lambda_gan * G_adv_loss))
            print("pixel_loss : %.8f" % (self.lambda_l1 * pixel_loss))
            print("perceptual_loss : %.8f" % (self.lambda_fea * perceptual_loss))
            print("gp_loss : %.8f" % (self.lambda_gp * gradient_penalty_loss))

            with torch.no_grad():
                if step % self.print_freq == 0:
                    train_sample_num = 5
                    A2B = np.zeros((self.img_size * 4, 0, 3))
                    self.body.eval(), self.decoder.eval(), self.dis.eval()

                    for _ in range(train_sample_num):
                        try:
                            front_224, front_112 = next(front_iter)
                            if front_224.shape[0] != self.batch_size:
                                raise StopIteration
                        except (NameError, StopIteration):
                            front_iter = iter(self.front_loader)
                            front_224, front_112 = next(front_iter)
                        try:
                            profile_224, profile_112 = next(profile_iter)
                            if profile_224.shape[0] != self.batch_size:
                                raise StopIteration
                        except (NameError, StopIteration):
                            profile_iter = iter(self.profile_loader)
                            profile_224, profile_112 = next(profile_iter)

                        profile_224, front_224, profile_112, front_112 = (
                            profile_224.to(self.device), front_224.to(self.device),
                            profile_112.to(self.device), front_112.to(self.device))

                        feature_p = self.expert_net.get_feature(profile_112)
                        feature_f = self.expert_net.get_feature(front_112)
                        gen_p = self.decoder(self.body(feature_p))
                        gen_f = self.decoder(self.body(feature_f))

                        A2B = np.concatenate((A2B, np.concatenate(
                            (RGB2BGR(tensor2numpy(denorm(profile_224[0]))),
                             RGB2BGR(tensor2numpy(denorm(gen_p[0]))),
                             RGB2BGR(tensor2numpy(denorm(front_224[0]))),
                             RGB2BGR(tensor2numpy(denorm(gen_f[0])))), 0)), 1)

                    for _ in range(train_sample_num):
                        show_list = []
                        for i in range(2):
                            try:
                                test_profile_224, test_profile_112 = next(test_iter)
                                if test_profile_224.shape[0] != self.batch_size:
                                    raise StopIteration
                            except (NameError, StopIteration):
                                test_iter = iter(self.test_loader)
                                test_profile_224, test_profile_112 = next(test_iter)

                            test_profile_224, test_profile_112 = (
                                test_profile_224.to(self.device), test_profile_112.to(self.device))
                            test_feature_p = self.expert_net.get_feature(test_profile_112)
                            test_gen_p = self.decoder(self.body(test_feature_p))
                            show_list.append(test_profile_224[0])
                            show_list.append(test_gen_p[0])

                        A2B = np.concatenate((A2B, np.concatenate(
                            (RGB2BGR(tensor2numpy(denorm(show_list[0]))),
                             RGB2BGR(tensor2numpy(denorm(show_list[1]))),
                             RGB2BGR(tensor2numpy(denorm(show_list[2]))),
                             RGB2BGR(tensor2numpy(denorm(show_list[3])))), 0)), 1)

                    cv2.imwrite(os.path.join("results", self.result_name, 'img',
                                             'A2B_%07d.png' % step), A2B * 255.0)
                    self.body.train(), self.decoder.train(), self.dis.train()

                if step % self.save_freq == 0:
                    self.save(os.path.join("results", self.result_name, "model"), step)

                if step % 1000 == 0:
                    params = {}
                    params['body'] = self.body.state_dict()
                    params['decoder'] = self.decoder.state_dict()
                    params['dis'] = self.dis.state_dict()
                    torch.save(params, os.path.join("results", self.result_name,
                                                    self.model_name + "_params_latest.pt"))

    def load(self, dir, step):
        params = torch.load(os.path.join(dir, self.model_name + '_params_%07d.pt' % step))
        self.body.load_state_dict(params['body'])
        self.decoder.load_state_dict(params['decoder'])
        self.dis.load_state_dict(params['dis'])

    def save(self, dir, step):
        params = {}
        params['body'] = self.body.state_dict()
        params['decoder'] = self.decoder.state_dict()
        params['dis'] = self.dis.state_dict()
        torch.save(params, os.path.join(dir, self.model_name + '_params_%07d.pt' % step))

    def demo(self):
        # front_iter / profile_iter are created on first use via the
        # NameError fallback below, mirroring train().
        try:
            front_224, front_112 = next(front_iter)
            if front_224.shape[0] != self.batch_size:
                raise StopIteration
        except (NameError, StopIteration):
            front_iter = iter(self.front_loader)
            front_224, front_112 = next(front_iter)
        try:
            profile_224, profile_112 = next(profile_iter)
            if profile_224.shape[0] != self.batch_size:
                raise StopIteration
        except (NameError, StopIteration):
            profile_iter = iter(self.profile_loader)
            profile_224, profile_112 = next(profile_iter)

        profile_224, front_224, profile_112, front_112 = (
            profile_224.to(self.device), front_224.to(self.device),
            profile_112.to(self.device), front_112.to(self.device))

        D_face, D_eye, D_nose, D_mouth, D_map = self.dis(profile_224)
        cv2.imwrite("profile.jpg", cv2.cvtColor(tensor2im(profile_112), cv2.COLOR_BGR2RGB))
        cv2.imwrite("front.jpg", cv2.cvtColor(tensor2im(front_112), cv2.COLOR_BGR2RGB))
        feature = self.expert_net.get_feature(profile_224)
        print(feature.shape)
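# --- Illustrative sketch (not part of the original script) ---
# gradient_penalty() is defined elsewhere in this project. A minimal WGAN-GP
# style penalty under the usual formulation (gradient norm pulled toward 1):
import torch

def gradient_penalty_sketch(d_out, x_interp, device):
    grad = torch.autograd.grad(outputs=d_out, inputs=x_interp,
                               grad_outputs=torch.ones_like(d_out).to(device),
                               create_graph=True, retain_graph=True)[0]
    grad = grad.view(grad.size(0), -1)
    return ((grad.norm(2, dim=1) - 1) ** 2).mean()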
print(translate(captions))

with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
vocab_size = len(vocab)
print('vocab_size:', vocab_size)

dataloader = get_loader(image_dir, caption_path, vocab, batch_size, crop_size,
                        shuffle=True, num_workers=num_workers)

encoder = Encoder().to(device)
encoder.fine_tune(fine_tune_encoder)
decoder = Decoder(attention_dim, embedding_size, lstm_size, vocab_size).to(device)

print('Start loading models.')
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))
encoder.eval()
decoder.eval()

sample('data/surf.jpg', vocab, dataloader, encoder, decoder)
sample('data/giraffe.png', vocab, dataloader, encoder, decoder)
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path  # zerospeech/datasets/2019/english
    with open(dataset_path / "speakers.json") as file:  # file listing the speaker names
        speakers = sorted(json.load(file))  # stored as the `speakers` object

    # Marked ??? in the config, so it must be specified when running from Python.
    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)  # see synthesis.json under datasets/2019/english

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))    # ??? in the config; the zerospeech folder (./) should work
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))  # ??? in the config; where the converted voices are saved
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU if no GPU

    encoder = Encoder(**cfg.model.encoder)  # encoder config from ZeroSpeech/config/model/default
    decoder = Decoder(**cfg.model.decoder)  # decoder config from ZeroSpeech/config/model/default
    encoder.to(device)
    decoder.to(device)

    # Marked ??? in the config; point this at a pretrained model or the latest trained checkpoint.
    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    # Load the weights stored in the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    # Loudness meter at sr=16000; see
    # https://www.christiansteinmetz.com/projects-blog/pyloudnorm
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    # Each entry looks like ("english/test/S002_0379088085", "V002", "V002_0379088085").
    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path  # e.g. ./english/test/S002_0379088085
        wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)  # measure the loudness of the input
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        # unsqueeze() inserts a new dimension at the given position:
        # https://subinium.github.io/pytorch-Tensor-Variable/#%EB%8D%94%EB%AF%B8-%EC%B0%A8%EC%9B%90-%EC%B6%94%EA%B0%80%EC%99%80-%EC%82%AD%EC%A0%9C--squeeze--unsqueeze
        # https://datascienceschool.net/view-notebook/4f3606fd839f4320a4120a56eec1e228/
        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        # Likewise a tensor, but integer-typed: tensors carry a dtype, e.g.
        # torch.FloatTensor for 32-bit floats and torch.LongTensor for 64-bit signed
        # integers (CUDA variants such as torch.cuda.FloatTensor also exist).
        # mel holds floating-point values while speaker indices are integers.
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)

        # Disable autograd tracking for inference; https://bob3rdnewbie.tistory.com/315
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        output_loudness = meter.integrated_loudness(output)  # measure the loudness of the output
        # Match the output loudness to the loudness of the input wav.
        output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
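# --- Illustrative sketch (not part of the original script) ---
# The loudness matching above in isolation: measure the reference loudness,
# then scale the converted output to the same integrated loudness. Uses the
# same pyloudnorm API as the code above; the function name is an assumption.
import pyloudnorm

def match_loudness(reference_wav, output_wav, sr):
    meter = pyloudnorm.Meter(sr)
    ref_loudness = meter.integrated_loudness(reference_wav)
    out_loudness = meter.integrated_loudness(output_wav)
    return pyloudnorm.normalize.loudness(output_wav, out_loudness, ref_loudness)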
def main(args):
    # ==============================
    # Create folders and files for saving
    # ==============================
    if not os.path.exists(args.root_folder):
        os.mkdir(args.root_folder)

    loss_path = args.loss_path
    mertics_path = args.mertics_path
    epoch_model_path = args.epoch_model_path
    best_model_path = args.best_model_path
    generated_captions_path = args.generated_captions_folder_path
    sentences_show_path = args.sentences_show_path

    # Transforms for the images; these functions are in utils.general_tools.py.
    train_transform = get_train_transform()
    val_transform = get_val_trainsform()

    # Load vocabulary
    print("*** Load Vocabulary ***")
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Create the datasets; these functions are in data_load.py.
    train_data = train_load(root=args.train_image_dir, json=args.train_caption_path,
                            vocab=vocab, transform=train_transform,
                            batch_size=args.batch_size, shuffle=True,
                            num_workers=args.num_workers)
    val_data = val_load(root=args.val_image_dir, json=args.val_caption_path,
                        transform=val_transform, batch_size=1, shuffle=False,
                        num_workers=args.num_workers)

    # Build the model
    encoder = Encoder(args.hidden_dim, args.fine_tuning).to(device)
    decoder = Decoder(args.embedding_dim, args.hidden_dim, vocab, len(vocab),
                      args.max_seq_length).to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    if args.fine_tuning == True:
        params = list(decoder.parameters()) + list(encoder.parameters())
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)
    else:
        params = decoder.parameters()
        # Note: this branch also uses fine_tuning_lr as the learning rate.
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)

    # Load a pretrained model if resuming
    if args.resume == True:
        checkpoint = torch.load(best_model_path)
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        if args.fine_tuning == False:
            optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']
        best_epoch = checkpoint['best_epoch']
    # Otherwise start from a fresh epoch and score
    else:
        start_epoch = 1
        best_score = 0
        best_epoch = 0

    for epoch in range(start_epoch, 10000):
        print("-" * 20)
        print("epoch:{}".format(epoch))

        # Shrink the learning rate whenever the gap to the best epoch is a
        # positive multiple of 4 (adjust_lr is in utils.general_tools.py).
        if (epoch - best_epoch) > 0 and (epoch - best_epoch) % 4 == 0:
            adjust_lr(optimizer, args.shrink_factor)

        # Stop early after more than 10 epochs without improvement.
        if (epoch - best_epoch) > 10:
            print("*** Training complete ***")
            break

        # =============
        # Training
        # =============
        print(" *** Training ***")
        decoder.train()
        encoder.train()
        total_step = len(train_data)
        epoch_loss = 0
        for (images, captions, lengths, img_ids) in tqdm(train_data):
            images = images.to(device)
            captions = captions.to(device)

            # Shorten lengths by 1 and drop the first column of captions,
            # because the leading <start> symbol is not predicted.
            lengths = list(np.array(lengths) - 1)
            targets = pack_padded_sequence(captions[:, 1:], lengths, batch_first=True)[0]

            features = encoder(images)
            predictions = decoder(features, captions, lengths)
            predictions = pack_padded_sequence(predictions, lengths, batch_first=True)[0]

            loss = criterion(predictions, targets)
            epoch_loss += loss.item()
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

        # Save the loss information (save_loss is in utils.save_tools.py).
        save_loss(round(epoch_loss / total_step, 3), epoch, loss_path)

        # =============
        # Evaluating
        # =============
        print("*** Evaluating ***")
        encoder.eval()
        decoder.eval()
        generated_captions = []
        for image, img_id in tqdm(val_data):
            image = image.to(device)
            img_id = img_id[0]
            features = encoder(image)
            sentence = decoder.generate(features)
            sentence = ' '.join(sentence)
            item = {'image_id': int(img_id), 'caption': sentence}
            generated_captions.append(item)

        print('*** Computing metrics ***')
        # Save the generated captions (save_generated_captions is in utils.save_tools.py).
        captions_json_path = save_generated_captions(generated_captions, epoch,
                                                     generated_captions_path, args.fine_tuning)

        # Compute the metric scores (coco_metrics is in utils.general_tools.py).
        results = coco_metrics(args.val_caption_path, captions_json_path, epoch,
                               sentences_show_path)

        # Save the metric results (save_metrics is in utils.save_tools.py).
        epoch_score = save_metrics(results, epoch, mertics_path)

        # Update the best score
        if best_score < epoch_score:
            best_score = epoch_score
            best_epoch = epoch
            save_best_model(encoder, decoder, optimizer, epoch, best_score,
                            best_epoch, best_model_path)

        print("*** Best score:{} Best epoch:{} ***".format(best_score, best_epoch))

        # Save a checkpoint for every epoch
        save_epoch_model(encoder, decoder, optimizer, epoch, best_score, best_epoch,
                         epoch_model_path, args.fine_tuning)
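# --- Illustrative sketch (not part of the original script) ---
# adjust_lr lives in utils.general_tools.py and is not shown here; a common
# implementation simply multiplies every parameter group's lr by the factor:
def adjust_lr(optimizer, shrink_factor):
    for group in optimizer.param_groups:
        group['lr'] = group['lr'] * shrink_factor
    print("New learning rate: {}".format(optimizer.param_groups[0]['lr']))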
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the test dataset.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    encoder = Encoder(src_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'], config['model']['dropout'],
                      config['model']['layer'], config['model']['mode'])
    decoder = Decoder(tgt_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'], config['model']['dropout'],
                      config['model']['layer'], config['model']['mode'])
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    ckpt_path = os.path.join(config['data']['ckpt'], config['experiment_name'], 'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for the first few sentences
    # in the test corpus.
    for i in range(10):
        src, tgt = test_data[i]
        translation = greedy_translator(src)
        src_sentence = [src_vocab.id2word(id) for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
        print('---')

    # Quantitative evaluation - compute corpus-level BLEU scores.
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start- and end-of-sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        # The ground truth goes into references and the model output into
        # hypotheses; the original had them swapped.
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
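# --- Illustrative sketch (not part of the original script) ---
# nltk's corpus_bleu expects, per sentence, a *list* of reference token lists
# and a single hypothesis token list, which is why the ordering above matters:
from nltk.translate.bleu_score import corpus_bleu

references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]  # one sentence, one reference
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]    # model output for that sentence
score = corpus_bleu(references, hypotheses)  # exact match scores 1.0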
def eval_reward(args, shared_model, writer_dir=None):
    """
    For evaluation.

    Arguments:
    - writer_dir: the tensorboard summary writer directory (note: can't get it
      working directly with the SummaryWriter object)
    """
    writer = SummaryWriter(log_dir=os.path.join(writer_dir, 'eval')) if writer_dir is not None else None

    # Current episode stats
    episode_reward = episode_value_mse = episode_td_error = episode_pg_loss = episode_length = 0

    # Global stats
    i_episode = 0
    total_episode = total_steps = 0
    num_goals_achieved = 0

    # Initialize the env and models
    torch.manual_seed(args.seed)
    env = create_env(args.env_name, framework=args.framework, args=args)
    set_seed(args.seed, env, args.framework)

    shared_enc, shared_dec, shared_d_module, shared_r_module = shared_model
    enc = Encoder(env.observation_space.shape[0], args.dim, use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0], args.dim, use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    r_module = R_Module(env.action_space.shape[0], args.dim, discrete=args.discrete,
                        baseline=False, state_space=env.observation_space.shape[0])
    all_params = chain(enc.parameters(), dec.parameters(), d_module.parameters(),
                       r_module.parameters())

    if args.from_checkpoint is not None:
        model_state, _ = torch.load(args.from_checkpoint)
        # NOTE: `model` is not defined in this scope; restoring from a checkpoint
        # likely needs to load each of enc/dec/d_module/r_module individually.
        model.load_state_dict(model_state)

    # Set the models to evaluation mode
    enc.eval()
    dec.eval()
    d_module.eval()
    r_module.eval()

    # Reset the state
    state = env.reset()
    state = Variable(torch.from_numpy(state).float())

    start = time.time()
    while total_episode < args.num_episodes:
        # Sync with the shared model
        r_module.load_state_dict(shared_r_module.state_dict())
        d_module.load_state_dict(shared_d_module.state_dict())
        enc.load_state_dict(shared_enc.state_dict())
        dec.load_state_dict(shared_dec.state_dict())

        # Reset the recurrent states of the dynamics module ...
        cd_p = Variable(torch.zeros(1, args.lstm_dim))
        hd_p = Variable(torch.zeros(1, args.lstm_dim))
        # ... and of the reward module
        cr_p = Variable(torch.zeros(1, args.lstm_dim))
        hr_p = Variable(torch.zeros(1, args.lstm_dim))

        i_episode += 1
        episode_length = 0
        episode_reward = 0
        args.local = True
        args.d = 0
        succ, _, episode_reward, episode_length = test(
            1, args, args, args, d_module, r_module, enc)
        log("Eval: succ {:.2f}, reward {:.2f}, length {:.2f}".format(
            succ, episode_reward, episode_length))

        # The episode has ended; write the summaries here.
        if writer_dir is not None:
            writer.add_scalar('eval/episode_reward', episode_reward, i_episode)
            writer.add_scalar('eval/episode_length', episode_length, i_episode)
            writer.add_scalar('eval/success', succ, i_episode)

        time.sleep(args.eval_every)
        print("sleep")
def convert():
    # dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    # with open(dataset_path / "speakers.json") as file:
    #     speakers = sorted(json.load(file))
    dataset_path = Path('./cfg').absolute()
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))
    with open(Path("./cfg/cfg.json").absolute()) as file:
        para = json.load(file)

    synthesis_list_path = Path('./dataset/english/synthesis.txt').absolute()
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path('./dataset/english').absolute()
    out_dir = Path('./output').absolute()
    out_dir.mkdir(exist_ok=True, parents=True)
    print(synthesis_list)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=para['encoder']['in_channels'],
                      channels=para['encoder']['channels'],
                      n_embeddings=para['encoder']['n_embeddings'],
                      embedding_dim=para['encoder']['embedding_dim'],
                      jitter=para['encoder']['jitter'])
    decoder = Decoder(in_channels=para['decoder']['in_channels'],
                      conditioning_channels=para['decoder']['conditioning_channels'],
                      n_speakers=para['decoder']['n_speakers'],
                      speaker_embedding_dim=para['decoder']['speaker_embedding_dim'],
                      mu_embedding_dim=para['decoder']['mu_embedding_dim'],
                      rnn_channels=para['decoder']['rnn_channels'],
                      fc_channels=para['decoder']['fc_channels'],
                      bits=para['decoder']['bits'],
                      hop_length=para['decoder']['hop_length'])
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format('./checkpoint/model.pt'))
    checkpoint_path = Path('./checkpoint/model.pt').absolute()
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()
    # meter = pyloudnorm.Meter(16000)  # original had 160000, likely a typo for the 16 kHz rate
    print('load finish')

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=para['preprocess']['sr'])
        # ref_loudness = meter.integrated_loudness(wav)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, para['preprocess']['preemph']),
            sr=para['preprocess']['sr'],
            n_fft=para['preprocess']['n_fft'],
            n_mels=para['preprocess']['n_mels'],
            hop_length=para['preprocess']['hop_length'],
            win_length=para['preprocess']['win_length'],
            fmin=para['preprocess']['fmin'],
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=para['preprocess']['top_db'])
        logmel = logmel / para['preprocess']['top_db'] + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)
        # output_loudness = meter.integrated_loudness(output)
        # output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32),
                                 sr=para['preprocess']['sr'])
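# --- Illustrative sketch (not part of the original script) ---
# preemphasis() is a project helper used throughout these scripts; the standard
# definition is a first-order high-pass filter y[t] = x[t] - coef * x[t-1].
# A minimal sketch using scipy (the function name is an assumption):
import numpy as np
from scipy.signal import lfilter

def preemphasis_sketch(wav, coef=0.97):
    return lfilter([1, -coef], [1], wav)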