def main(args):
    """Evaluate a trained encoder/decoder captioning model.

    Runs ``test(...)`` and, when ``args.msm == "co"``, computes COCO
    metric scores and pickles them to ``args.filepath``.
    """
    # Load vocabulary wrapper
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models (fixed sizes match the training configuration)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
        # BUG FIX: pickle streams are binary; opening with 'w+' (text
        # mode) raises TypeError on Python 3. Use 'wb' instead.
        with open(args.filepath, 'wb') as f:
            pickle.dump((scores, scores_u), f)
def main(args):
    """Generate captions with a trained layout-encoder/decoder pair and
    dump them as JSON to ``bsl_output.txt``."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build models — the CNN encoder is intentionally not used here.
    # encoder = EncoderCNN(args.embed_size)
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Restore the trained weights
    layout_encoder.load_state_dict(torch.load(args.layout_encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Move everything to the GPU when one is present
    if torch.cuda.is_available():
        layout_encoder.cuda()
        decoder.cuda()

    # validation(layout_encoder, decoder, args, vocab, transform, args.batch_size)
    out = save_output(layout_encoder, decoder, args, vocab, transform,
                      args.batch_size)
    with open('bsl_output.txt', 'w') as sink:
        json.dump(out, sink)
def main(args):
    """Caption every image in the resized val2014 folder.

    Prints one "beam_size_1\\t<name>\\t<caption>" line per image; images
    the model fails on are counted in ``badsize``.
    """
    # Val images folder
    filepath = ('/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/'
                'image_captioning/data/resizedval2014')
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode by moving mean and variance
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # BUG FIX: move models to the GPU once, instead of calling .cuda()
    # inside the per-image loop.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # BUG FIX: the failure counter used to be reset to 0 on every loop
    # iteration, so it could never accumulate; initialize it once here.
    badsize = 0

    # Load all images in val folder
    for i in onlyfiles:
        args_image = filepath + '/' + i  # val folder path with image name

        # transform image and wrap it to tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        # generate caption from image
        try:
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids.cpu().data.numpy()

            # decode word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # print out image and generated caption without start and end
            print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])
        except Exception:  # narrowed from bare except; keep best-effort skip
            badsize = badsize + 1  # count images that failed to caption
def main(): # Load vocabulary wrapper. with open(vocab_path) as f: vocab = pickle.load(f) encoder = EncoderCNN(4096, embed_dim) decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.linear.parameters()) optimizer = torch.optim.Adam(params, lr=0.001) #load data with open(image_data_file) as f: image_data = pickle.load(f) image_features = si.loadmat(image_feature_file) img_features = image_features['fc7'][0] img_features = np.concatenate(img_features) print 'here' iteration = 0 save_loss = [] for i in range(10): # epoch use_caption = i % 5 print 'Epoch', i for x, y in make_mini_batch(img_features, image_data, use_caption=use_caption): word_padding, lengths = make_word_padding(y, vocab) x = Variable(torch.from_numpy(x).cuda()) word_index = Variable(torch.from_numpy(word_padding).cuda()) encoder.zero_grad() decoder.zero_grad() features = encoder(x) targets = pack_padded_sequence(word_index, lengths, batch_first=True)[0] outputs = decoder(features, word_index, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() if iteration % 100 == 0: print 'loss', loss.data[0] save_loss.append(loss.data[0]) iteration += 1 torch.save(decoder.state_dict(), 'decoder.pkl') torch.save(encoder.state_dict(), 'encoder.pkl') with open('losses.txt', 'w') as f: print >> f, losses
def main(image):
    """Generate, print, and return a caption for the image at path ``image``.

    Uses the teacher CNN/LSTM checkpoints referenced by ``Config``.
    """
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size,
                         len(vocab), config.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))

    # Prepare Image
    image = Image.open(image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial LSTM states (zeros, batch of 1)
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        # NOTE(review): id 96 is treated as an implicit end token —
        # presumably the vocab index of '<end>'; confirm against vocab.pkl.
        if word_id == 96:
            sampled_caption.append('<end>')
            break
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
    return sentence
def main(args): vectore_dir = '/root/server/best_model/' # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build Models # encoder = EncoderCNN(args.embed_size) qvecs_pca = np.load( os.path.join(vectore_dir, "q_2{}.npy".format(args.embed_size))) # encoder.eval() # evaluation mode (BN uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) # Load the trained model parameters # encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) # Prepare Image #image = load_image(args.image, transform) #image_tensor = to_var(image, volatile=True) # If use gpu if torch.cuda.is_available(): # encoder.cuda() decoder.cuda() data = [] # img_path = args.image # # Prepare Image # image = load_image(img_path, transform) # image_tensor = to_var(image, volatile=True) # Generate caption from image # feature = encoder(image_tensor) num = 29 feature = torch.from_numpy(qvecs_pca[num:num + 1, :]).cuda() #pdb.set_trace() sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids.cpu().data.numpy() # Decode word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] if word == '<start>': continue if word == '<end>': break sampled_caption.append(word) sentence = ' '.join(sampled_caption) # Print out image and generated caption. print(sentence)
def main(args):
    """Caption every bitmap under ``args.root_path``/bitmap, rebuild an
    SVG document from each predicted caption, and rasterize it to PNG."""
    # Image preprocessing (dataset-specific normalization statistics)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.033, 0.032, 0.033),
                             (0.027, 0.027, 0.027))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = AttnEncoder(ResidualBlock, [3, 3, 3])
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    # decoder = AttnDecoderRnn(args.feature_size, args.hidden_size,
    #                          len(vocab), args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print('load')

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    print('load')

    # If use gpu — models are pinned to GPU device 1
    if torch.cuda.is_available():
        encoder.cuda(1)
        decoder.cuda(1)

    # Input bitmaps and output directories for generated SVG/PNG files
    trg_bitmap_dir = args.root_path + 'bitmap/'
    save_directory = 'predict_base/'
    svg_from_out = args.root_path + save_directory + 'svg/'  # svg from output caption
    bitmap_from_out = args.root_path + save_directory + 'bitmap/'  # bitmap from out caption
    if not os.path.exists(bitmap_from_out):
        os.makedirs(bitmap_from_out)
    if not os.path.exists(svg_from_out):
        os.makedirs(svg_from_out)

    test_list = os.listdir(trg_bitmap_dir)
    for i, fname in enumerate(test_list):
        print(fname)
        test_path = trg_bitmap_dir + fname
        test_image = load_image(test_path, transform)
        image_tensor = to_var(test_image)

        # Predict a caption, then render it back into an SVG document
        in_sentence = gen_caption_from_image(image_tensor, encoder, decoder,
                                             vocab)
        print(in_sentence)
        image_matrix = cv2.imread(test_path)
        doc = gen_svg_from_predict(in_sentence.split(' '), image_matrix)

        # Save the SVG, then rasterize it to a PNG bitmap
        with open(os.path.join(svg_from_out, fname.split('.')[0] + '.svg'),
                  'w+') as f:
            f.write(doc)
        cairosvg.svg2png(url=svg_from_out + fname.split('.')[0] + '.svg',
                         write_to=bitmap_from_out + fname)
def main(args):
    """Caption ``args.image``, speak an Indonesian translation of the
    caption aloud, and display the image."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build the models and restore the trained weights
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare the input image
    image_tensor = to_var(load_image(args.image, transform), volatile=True)

    # Move to the GPU when available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate a caption and decode the ids back into words
    sampled_ids = decoder.sample(encoder(image_tensor)).cpu().data.numpy()
    tokens = []
    for word_id in sampled_ids:
        token = vocab.idx2word[word_id]
        tokens.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(tokens)

    # Strip the start/end markers and periods
    sentence = (sentence.replace('<start> ', '')
                        .replace(' <end>', '')
                        .replace('.', '')
                        .strip())

    # Translate to Indonesian, announce it, and play it as speech
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    # Show the original image
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
def main(args):
    """Caption ``args.image`` using AlexNet fc7 features fed through a
    trained encoder/decoder, and print the resulting sentence."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Pretrained AlexNet wrapped so its fc7 activations can be captured
    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)

    # Build Models (encoder projects 4096-d fc7 features to embed_size)
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial LSTM states (zeros, batch of 1)
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image: the forward pass stores fc7 activations
    # on the wrapper (read via alexnet2.fc7_value below).
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
def main(args):
    """Caption the single image at ``args.image`` and print the sentence.

    On any failure the image path is printed instead (best-effort mode).
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    # NOTE(review): the bare except below swallows every error (even
    # KeyboardInterrupt); consider narrowing it to Exception.
    try:
        img_path = args.image
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image
        feature = encoder(image_tensor)
        # pdb.set_trace()
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words, skipping the start/end markers
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except:
        print(img_path)
def main(args):
    """Caption every ``.jpg`` in the directory ``args.image``.

    Prints one tab-separated "<image_id>\\t<sentence>" line per image;
    images the model fails on are skipped silently.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image; skip images the model fails on
        # (note: this encoder also returns intermediate CNN features)
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except:
            continue

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out image and generated caption.
        print(image_id + '\t' + sentence)
def main(args):
    """Interactive teacher-forcing demo (Python 2).

    Captions ``args.image``; if the user rejects the caption, they feed
    ground-truth words one at a time and the caption is re-decoded with
    teacher forcing, reporting the BLEU score after each word.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image (free-running decode first)
    feature = encoder(image_tensor)
    sentence = decode(feature, [], decoder, vocab)
    print (sentence)

    user_input = raw_input("Does it make sense to you?(y/n)\n")
    if str(user_input) == "n":
        # Ground-truth caption for the fixed demo image
        f = open('data/step_1/caption_1.txt', 'r')
        ground_true = f.read()
        teach_wordid = []
        teach_wordid.append(vocab.word2idx["<start>"])
        # Loop forever: show BLEU, ask for the next ground-truth word,
        # and re-decode with the accumulated forced prefix.
        while(True):
            print "This is the ground true:\n"+ground_true+"\n"+\
                "###################################################\n"
            reference = ground_true.split()
            hypothesis = sentence.split()
            BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                [reference], hypothesis)
            print "Current BLEU score is "+str(BLEUscore)
            word = raw_input("next word:\n")
            word_idx = vocab.word2idx[word]
            teach_wordid.append(word_idx)
            sentence = decode(feature, teach_wordid, decoder, vocab)
            print "###################################################\n"
            print "Current Translated sentence is: \n"+sentence+"\n"
def main(args):
    """Caption ``args.image``, print the sentence, and show the image."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # NOTE(review): this progress bar is purely cosmetic — it is advanced
    # to 100% before any inference work actually happens.
    bar = Bar('Processing', max=100)
    for i in range(100):
        bar.next()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    bar.finish()

    # Print out image and generated caption.
    print("\n")
    print(sentence)
    image = Image.open(args.image)
    imgplot = plt.imshow(np.asarray(image))
    plt.show()
def main(args):
    """Print a generated caption for ``args.image`` and render the image."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build the models and restore their trained parameters
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare the image tensor
    image_tensor = to_var(load_image(args.image, transform), volatile=True)

    # Move models to the GPU when available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Sample a caption and translate the ids back to words
    sampled_ids = decoder.sample(encoder(image_tensor)).cpu().data.numpy()
    tokens = []
    for idx in sampled_ids:
        token = vocab.idx2word[idx]
        tokens.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(tokens)

    # Print the generated caption and show the image
    print (sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    """Caption ``args.image`` and print the tokens joined WITHOUT spaces
    (presumably a character/ideogram-level vocabulary — TODO confirm)."""
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        #transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()

    # Generate caption from image (sampling capped at args.length steps)
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, args.length)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words, dropping the start/end markers
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word != '<start>' and word != '<end>':
            sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ''.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, input,
            corpus):
    """Load a seq2seq checkpoint and evaluate it on random pairs.

    Note: the ``input`` parameter shadows the builtin and is unused here;
    kept for interface compatibility with callers.
    """
    voc, pairs, valid_pairs, test_pairs = loadPrepareData(corpus)
    print('Building encoder and decoder ...')
    # (A large commented-out AttributeEncoder variant with user/item
    # attribute embeddings was removed here.)

    # word embedding shared by encoder and decoder; index 0 is padding
    embedding = nn.Embedding(voc.n_words, hidden_size, padding_idx=0)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'concat'
    decoder = DecoderRNN(embedding, hidden_size, voc.n_words, n_layers)

    # Restore trained weights from the checkpoint file
    checkpoint = torch.load(modelFile)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 2)
def main(args):
    """Train the captioning encoder/decoder, optionally streaming metrics
    to a Crayon/TensorBoard server, checkpointing every ``save_step``.
    """
    # setup tensorboard (Crayon client talking to a local server)
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        # if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except:
            print("experiment didnt exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory and persist the run's hyper-parameters
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(), transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper, or build one fresh from the image dir
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loaders (main loader plus a train/val split)
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    # params = list(decoder.parameters())  # + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy over the packed targets
            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            # correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity",
                                           np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # NOTE(review): `1 == 2` makes this validation branch dead
            # code — it never runs; presumably disabled intentionally.
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                # decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    # correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    print("finished training, runtime: %d", [(end_time - start_time)])
def main():
    """Distill a teacher CNN into a student CNN + LSTM captioner.

    The student CNN is trained to match the teacher's features (MSE)
    while the student LSTM is trained on captioning (cross-entropy).
    """
    # Configuration for hyper-parameters
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, transform,
                                   config.batch_size, shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models: frozen teacher CNN, trainable student CNN + LSTM
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    # Load the best teacher model
    teachercnn.load_state_dict(
        torch.load(os.path.join('../TrainedModels/TeacherCNN',
                                config.trained_encoder)))
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size/2,
                             len(vocab), config.num_layers/2)

    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and Optimizer
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(),
                                     lr=config.cnn_learningrate)

    print('entering in to training loop')

    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths,
                img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            # BUG FIX: the original called an undefined name `criterion`;
            # the feature-matching term is the MSE loss defined above.
            # The teacher features are detached so no gradient flows into
            # the (frozen) teacher.
            loss = (criterion_cnn(features_st, features_tr.detach())
                    + criterion_lstm(outputs, targets))
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the Model
            if (i+1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(),
                           os.path.join(config.student_lstm_path,
                                        'decoder-%d-%d.pkl' % (epoch+1, i+1)))
                torch.save(studentcnn.state_dict(),
                           os.path.join(config.student_cnn_path,
                                        'encoder-%d-%d.pkl' % (epoch+1, i+1)))
def main(args):
    """Train the captioning encoder/decoder on vanilla cross-entropy
    loss, appending progress to a logfile and checkpointing periodically.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Validation loader (batch 1, no shuffle) — not consumed below
    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json',
                            vocab, transform, 1, False, 1)

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Resolve which checkpoints (if any) to resume from
    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    if decoder_state != 'new':
        # checkpoint names look like 'decoder-<epoch>-<step>.pkl'
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n".
                format(str(datetime.now())))
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])
            loss.backward()
            optimizer.step()

            # # Print log info
            # if i % args.log_step == 0:
            #     print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f'
            #           %(epoch, args.num_epochs, i, total_step,
            #             loss.data[0], np.exp(loss.data[0]), acc, gt_acc))
            #     with open(args.model_path + args.logfile, 'a') as f:
            #         f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f\n'
            #                 %(epoch, args.num_epochs, i, total_step,
            #                   loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                # NOTE(review): pickling into a text-mode ('w+') file
                # fails under Python 3 — should be 'wb'; confirm the
                # intended runtime before changing.
                with open(args.model_path + 'training_loss.pkl', 'w+') as f:
                    pickle.dump(batch_loss, f)
                with open(args.model_path + 'training_val.pkl', 'w+') as f:
                    pickle.dump(batch_acc, f)

    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
# Specify values for embed_size and hidden_size - we use the same values as in training step embed_size = 256 hidden_size = 512 # Get the vocabulary and its size vocab = data_loader.dataset.vocab vocab_size = len(vocab) # Initialize the encoder and decoder, and set each to inference mode encoder = EncoderCNN(embed_size) encoder.eval() decoder = DecoderRNN(embed_size, hidden_size, vocab_size) decoder.eval() # Load the pre-trained weights encoder.load_state_dict(checkpoint['encoder']) decoder.load_state_dict(checkpoint['decoder']) # Move models to GPU if CUDA is available. if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # In[5]: x = get_prediction(data_loader, encoder, decoder, vocab) # In[6]: print(x)
def main():
    """Caption every .jpg in args['image_path'] with the trained model.

    Writes one `image_id<TAB>caption` line per image to
    caption_ncrt_class5.txt under args['generate_caption_path'].
    """
    # write predicted caption
    if not os.path.exists(args['generate_caption_path']):
        os.makedirs(args['generate_caption_path'])
    caption_string = os.path.join(args['generate_caption_path'],
                                  "caption_ncrt_class5.txt")

    # Image preprocessing — normalization constants presumably match training;
    # TODO(review): confirm against the training transform.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.9638, 0.9638, 0.9638),
                             (0.1861, 0.1861, 0.1861))])

    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build Models in evaluation mode (BN uses moving mean/variance)
    encoder = EncoderCNN(args['embed_size'])
    encoder.eval()
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], len(vocab),
                         args['num_layers'], max_seq_length=50)
    decoder.eval()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args['image_path']
    images = os.listdir(image_dir)

    # FIX: open the output file with a context manager so it is flushed and
    # closed even if an exception escapes the loop (the original `fp = open(...)`
    # leaked the handle on any uncaught error).
    with open(caption_string, "w+") as fp:
        i = 1
        for image_id in images:
            if not image_id.endswith('.jpg'):
                continue
            image = os.path.join(image_dir, image_id)
            image = load_image(image, transform)
            image_tensor = image.cuda()

            # Generate caption from image; skip images the model cannot
            # process. FIX: `except Exception` instead of a bare `except:`
            # that also swallowed KeyboardInterrupt/SystemExit.
            try:
                feature, cnn_features = encoder(image_tensor)
                sampled_ids = decoder.sample(feature, cnn_features)
                sampled_ids = sampled_ids.cpu().data.numpy()
            except Exception:
                continue

            # Decode word_ids to words, stopping at the <end> token
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            print('i->', i, image_id + '\t' + sentence)
            fp.write(image_id)
            fp.write('\t')
            fp.write(sentence)
            # NOTE(review): hard-coded 398 suppresses the trailing newline on
            # the final expected image — confirm against the dataset size.
            if i < 398:
                fp.write("\n")
            i = i + 1
def main(args):
    """Train the layout-encoder + RNN-decoder captioning model.

    Builds the data loader from COCO captions/detections, then optimizes the
    layout encoder and decoder with cross-entropy over packed caption targets.
    Checkpoints are written every args.save_step batches.
    """
    # Seed CPU and GPU RNGs for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.MSCOCO_result, args.coco_detection_result,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers, dummy_object=99,
                             yolo=False)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # the layout encoder hidden state size must be the same with decoder input size
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    # NOTE(review): encoder.linear/encoder.bn params are optimized, but in the
    # current configuration the encoder's features are never used (see
    # `comb_features = layout_encoding` below) and the encoder is never saved —
    # presumably a leftover from the visual-features variant; confirm.
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
        list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs,
                visual_seqs, layout_lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            # Flatten padded captions to the packed target sequence expected
            # by CrossEntropyLoss.
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            # decoder.zero_grad()
            # layout_encoder.zero_grad()
            # encoder.zero_grad()

            # Modify This part for using visual features or not
            # features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs,
                                             layout_lengths)
            # comb_features = features + layout_encoding
            comb_features = layout_encoding

            outputs = decoder(comb_features, captions, lengths)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models (decoder + layout encoder only; see note above)
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    layout_encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'layout_encoding-%d-%d.pkl' % (epoch + 1, i + 1)))
def train(batch_size=32,
          vocab_threshold=5,
          vocab_from_file=True,
          embed_size=256,
          hidden_size=512,
          num_epochs=10,
          latest_model=None,
          cocoapi_dir="./Coco/"):
    """Train the encoder/decoder captioning pair on COCO.

    If `latest_model` is given, resumes from that checkpoint (weights,
    optimizer state, loss history and epoch counter). One checkpoint file
    is written to ./models per epoch via `save_epoch`.
    """
    # Keep track of train and validation losses and validation Bleu-4 scores by epoch
    train_losses = []

    # Define a transform to pre-process the training images
    transform_train = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Build data loader, applying the transforms
    train_loader = get_loader(transform=transform_train,
                              mode='train',
                              batch_size=batch_size,
                              vocab_threshold=vocab_threshold,
                              vocab_from_file=vocab_from_file,
                              cocoapi_loc=cocoapi_dir)

    # The size of the vocabulary
    vocab_size = len(train_loader.dataset.vocab)

    # Initialize the encoder and decoder
    checkpoint = None
    if latest_model:
        checkpoint = torch.load(latest_model)
    start_epoch = 1
    if checkpoint:
        train_losses = checkpoint['train_losses']
        # NOTE(review): val_losses is loaded but never used in this function.
        val_losses = checkpoint['val_losses']
        # NOTE(review): resuming at checkpoint['epoch'] re-runs the epoch that
        # was last saved (save_epoch stores the completed epoch number) —
        # confirm whether `+ 1` was intended.
        start_epoch = checkpoint['epoch']
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    if checkpoint:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])

    # Move models to GPU if CUDA is available
    if torch.cuda.is_available():
        # NOTE(review): GPU index 1 is hard-coded — verify on single-GPU hosts.
        torch.cuda.set_device(1)
        encoder.cuda()
        decoder.cuda()

    # Define the loss function
    loss = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available(
    ) else nn.CrossEntropyLoss()

    # Specify the learnable parameters of the model
    # (decoder plus only the encoder's embedding and batch-norm layers —
    # the pretrained CNN backbone stays frozen)
    params = list(decoder.parameters()) + list(
        encoder.embed.parameters()) + list(encoder.bn.parameters())

    # Define the optimizer
    optimizer = torch.optim.Adam(params=params, lr=0.001)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Set the total number of training and validation steps per epoch
    total_train_step = math.ceil(
        len(train_loader.dataset.caption_lengths) /
        train_loader.batch_sampler.batch_size)

    start_time = time.time()
    for epoch in range(start_epoch, num_epochs + 1):
        train_loss = train_one(train_loader, encoder, decoder, loss,
                               optimizer, vocab_size, epoch, total_train_step)
        train_losses.append(train_loss)
        # Save the entire model anyway, regardless of being the best model so far or not
        filename = os.path.join("./models", "model-{}.pkl".format(epoch))
        save_epoch(filename, encoder, decoder, optimizer, train_losses, epoch)
        print("Epoch [%d/%d] took %ds" %
              (epoch, num_epochs, time.time() - start_time))
        start_time = time.time()
def main(args):
    """Train the ResNet encoder / RNN decoder on args.root_path data.

    Builds the vocabulary from the raw data, persists it to args.vocab_path,
    then trains with cross-entropy over packed caption targets; every
    args.log_step batches it prints per-category match statistics, and every
    args.save_step batches it checkpoints both models.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.033, 0.032, 0.033), (0.027, 0.027, 0.027))
    ])

    # Build vocab from the raw data and persist it for later inference runs
    vocab = build_vocab(args.root_path, threshold=0)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    len_vocab = vocab.idx
    print(vocab.idx2word)

    # Build data loader
    data_loader = get_loader(args.root_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Build atten models
    if torch.cuda.is_available():
        # NOTE(review): hard-coded to GPU 1 — confirm the device index.
        encoder.cuda(1)
        decoder.cuda(1)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # make one hot
            # cap_ = torch.unsqueeze(captions,2)
            # one_hot_ = torch.FloatTensor(captions.size(0),captions.size(1),len_vocab).zero_()
            # one_hot_caption = one_hot_.scatter_(2, cap_, 1)

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            #captions_ = to_var(one_hot_caption)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            optimizer.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            captions = captions.view(-1)  # flattened copy; loss uses `targets`
            outputs = outputs.view(-1, len_vocab)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

                # test set accuracy
                outputs_np = outputs.max(1)[1].cpu().data.numpy()
                targets_np = targets.cpu().data.numpy()
                print(outputs_np)
                print(targets_np)

                location_match = 0
                size_match = 0
                shape_match = 0
                exact_match = 0
                # FIX: the original reused `i` as this inner loop's variable,
                # clobbering the batch index and corrupting the
                # `(i + 1) % args.save_step` check below; renamed to `k`.
                for k in range(len(targets_np)):
                    if outputs_np[k] == targets_np[k]:
                        exact_match += 1
                    if (k >= args.batch_size and k < args.batch_size * 2
                            and outputs_np[k] == targets_np[k]):
                        shape_match += 1
                    elif (k >= args.batch_size * 2
                          and k < args.batch_size * 3
                          and outputs_np[k] == targets_np[k]):
                        location_match += 1
                    elif (k >= args.batch_size * 3
                          and k < args.batch_size * 4
                          and outputs_np[k] == targets_np[k]):
                        size_match += 1
                print(
                    'location match : %.4f, shape match : %.4f, exact_match: %.4f'
                    % (location_match / (args.batch_size),
                       shape_match / args.batch_size,
                       exact_match / len(targets_np)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main():
    """Train the teacher CNN/LSTM captioning pair on COCO 2017.

    Runs 25 epochs over the train split, checkpoints periodically, and after
    each epoch estimates a running validation loss and appends a
    `val,train` row to train1_log.txt.
    """
    # Configuration for hyper-parameters
    torch.cuda.set_device(0)
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path, json_path, vocab,
                                   transform, config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)
    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path, json_path, vocab, transform,
                                 config.batch_size, shuffle=False,
                                 num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    # NOTE(review): encoder stays in eval mode even though its resnet.fc is in
    # the optimizer — presumably to freeze BatchNorm statistics while
    # fine-tuning only the head; confirm this is intentional.
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer (decoder + only the encoder's final fc layer)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)
    print('entering in to training loop')

    # Train the Models
    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error')
        # NOTE(review): epoch count 25 is hard-coded here while log messages
        # use config.num_epochs — confirm they agree.
        for epoch in range(0, 25):
            for i, (images, captions, lengths,
                    img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                # Flatten padded captions to the packed targets expected by
                # CrossEntropyLoss.
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % config.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, config.num_epochs, i, total_step,
                           loss.data[0], np.exp(loss.data[0])))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(
                            config.teacher_cnn_path,
                            'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(
                            config.teacher_lstm_path,
                            'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just Completed an Epoch, Initite Validation Error Test')
            avgvalloss = 0
            for j, (images, captions, lengths,
                    img_ids) in enumerate(val_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                valloss = criterion(outputs, targets)
                if j == 0:
                    avgvalloss = valloss.data[0]
                # Exponential-style running average: halves the weight of all
                # previous batches each step (not an arithmetic mean).
                avgvalloss = (avgvalloss + valloss.data[0]) / 2
                if ((j + 1) % 1000 == 0):
                    # Log once after 1000 validation batches, then stop
                    # validating for this epoch.
                    print('Average Validation Loss: %.4f' % (avgvalloss))
                    logfile.write(
                        str(avgvalloss) + ',' + str(loss.data[0]) + str('\n'))
                    break
def main():
    """Train or probe the LSTM language model.

    sys.argv[1] selects the mode ('train' or 'test'); sys.argv[2] == '1'
    resumes from the LMcheckpoint(1) checkpoint. In test mode, compares the
    LM loss of a degenerate sentence against a well-formed one and samples
    sentences from the model.
    """
    # load vocab Data here!
    with open('VocabData.pkl', 'rb') as f:
        VocabData = pickle.load(f)
    with open('FullImageCaps.pkl', 'rb') as f:
        FullImageCaps = pickle.load(f)
    # FullImageCaps_sub = loadData("full_image_descriptions.json")
    coco = loadCoco('captions_train2017.json')
    data = FullImageCaps + coco
    print(len(data) / 128)
    recovery = sys.argv[2]
    mode = sys.argv[1]
    lmdata = LMDataset(VocabData, data)
    lmloader = lmdata.getLoader(batchSize=128, shuffle=True)
    testloader = lmdata.getLoader(batchSize=1, shuffle=False)
    embedding = torch.Tensor(lmdata.embedding)
    vocab_size = len(lmdata.wordDict)
    max_len = 100
    hidden_size = 1024
    embedding_size = 300
    max_epoch = 10
    sos_id = lmdata.sos_id
    eos_id = lmdata.eos_id
    pad_id = lmdata.pad_id
    wordDict = VocabData['word_dict']

    # Inverse vocabulary: index -> word, for decoding sampled sentences.
    rev_vocab = [''] * vocab_size
    for word in wordDict:
        rev_vocab[wordDict[word]] = word

    def one_hot(word):
        # 1 x vocab_size one-hot row for `word`.
        # FIX: replaces six verbatim copies of the zeros/lookup/assign
        # boilerplate in the original with a single helper.
        row = torch.zeros(1, vocab_size)
        row[0, wordDict[word]] = 1
        return row

    they = one_hot('they')
    are = one_hot('are')
    students = one_hot('students')
    _from = one_hot('from')
    that = one_hot('that')
    school = one_hot('school')

    # Probe sentences: a degenerate repetition vs. a grammatical sentence.
    strange_sentence = torch.cat([they, are, are, are, are, are],
                                 0).unsqueeze(0)
    regular_sentence = torch.cat([they, are, students, _from, that, school],
                                 0).unsqueeze(0)

    PATH = 'LMcheckpoint(1)'
    model = DecoderRNN(vocab_size, max_len, hidden_size, embedding_size,
                       sos_id, eos_id, embedding_parameter=embedding,
                       rnn_cell='lstm')
    if recovery == '1':
        model = loadCheckpoint(PATH, model)
    optimizer = optim.Adam(model.parameters(), lr=0.0002)
    criterion = nn.NLLLoss(ignore_index=pad_id)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
    if mode == 'train':
        train_LM(lmloader, model, optimizer, criterion, pad_id, max_epoch,
                 max_len)
    if mode == 'test':
        lm_loss = LanguageModelLoss(PATH, vocab_size, max_len, hidden_size,
                                    embedding_size, sos_id, eos_id,
                                    use_prob_vector=True)
        loss1 = lm_loss(strange_sentence)
        loss2 = lm_loss(regular_sentence)
        print(loss1.item(), loss2.item())
        sampleSentence(model, testloader, rev_vocab)
def main(args):
    """Evaluate a trained captioning model under the metric chosen by args.msm.

    Modes: "ps" (ground-truth prediction score), "ce" (cross-entropy),
    "co" (COCO evaluation); plus early-exit flags for the prop-0 sanity
    check and the c_step sweep.
    """
    with open('./data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    if args.test_prop0:
        decoder.test_h_from_c()
        return

    if args.test_c_step:
        data_points = test(encoder, decoder, vocab, args.num_samples,
                           args.num_hints)
        with open(args.filepath, 'w+') as f:
            pickle.dump(data_points, f)
        print("Done sampling for c_step evaluation. Data saved to {}".format(
            args.filepath))
        return

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    # FIX: parenthesized all prints — the original mixed Python-2 `print`
    # statements with print() calls in the same function; the parenthesized
    # single-argument form behaves identically under Python 2 and is also
    # valid Python 3.
    if args.msm == "ps":
        if not args.no_avg:
            print("ground truth prediction score without update\n" + str(
                measurement_score[0]))
            print("ground truth prediction score with update\n" + str(
                measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'w+') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "ce":
        if not args.no_avg:
            print("Cross Entropy Loss without update\n" + str(
                measurement_score[0]))
            print("Cross Entropy Loss with update\n" + str(
                measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'w+') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
def main(args):
    """Run QA-generation inference: for each batch, embed the caption with
    UniSkip, attend over image + caption embeddings, sample a question/answer
    sequence from the decoder, and print it next to the ground truth.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]

    # Build data loader
    data_loader = get_loader(args.image_dir, args.img_embeddings_dir,
                             args.data_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    #im_encoder = preprocess_get_model.model()
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.dropout)
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)
    # NOTE(review): only the decoder is put in eval mode — attention stays in
    # train mode (dropout active, if any). Confirm this is intentional for
    # inference.
    decoder.eval()

    if torch.cuda.is_available():
        #im_encoder.cuda()
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    attention.load_state_dict(torch.load(args.attention_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    for i, (images, captions, cap_lengths, qa, qa_lengths,
            vocab_words) in enumerate(data_loader):
        # # Set mini-batch dataset
        # `images` here are precomputed image embeddings (see
        # img_embeddings_dir above); volatile=True disables autograd tracking
        # for inference.
        img_embeddings = to_var(images.data, volatile=True)
        captions = to_var(captions)
        # qa = to_var(qa)
        # targets = pack_padded_sequence(qa, qa_lengths, batch_first=True)[0]
        # # Forward, Backward and Optimize
        # decoder.zero_grad()
        # attention.zero_grad()
        # #features = encoder(images)
        #img_embeddings = im_encoder(images)
        #uniskip = UniSkip('/Users/tushar/Downloads/code/data/skip-thoughts', vocab_list)

        # Skip-thought embedding of the caption, then attention-derived
        # context vector feeds the decoder's sampler.
        cap_embeddings = uniskip(captions, cap_lengths)
        cap_embeddings = cap_embeddings.data
        img_embeddings = img_embeddings.data
        ctx_vec = attention(img_embeddings, cap_embeddings)
        outputs = decoder.sample(ctx_vec)
        output_ids = outputs.cpu().data.numpy()

        # Only the first ground-truth QA sequence of the batch is decoded.
        qa = qa.numpy()
        qa = qa[0]
        # predicted_q = []
        # predicted_a = []
        sample = []
        # flag = -1
        for word_id in output_ids:
            word = vocab.idx2word[word_id]
            sample.append(word)
            # if word == '<end>':
            #     if flag == -1:
            #         predicted_q = sample
            #         sample = []
            #         flag = 0
            #     else:
            #         predicted_a = sample
        # predicted_q = ' '.join(predicted_q[1:])
        # predicted_a = ' '.join(predicted_a[1:])
        sample = ' '.join(sample)
        actual = []
        # print("predicted q was : " + predicted_q)
        for word_id in qa:
            word = vocab.idx2word[word_id]
            actual.append(word)
        actual = ' '.join(actual)
        #print(im_id)
        print("actual_qa : " + actual + " | predicted_qa : " + sample)
def main(args):
    """Train the CNN encoder / RNN decoder captioning pair.

    Standard cross-entropy training over packed caption targets; the
    pretrained CNN backbone is frozen — only the encoder's linear + BN head
    and the decoder are optimized. Checkpoints every args.save_step batches.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) \
        + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset.
            # FIX: the original wrapped images with volatile=True, which marks
            # the whole forward graph as gradient-free during TRAINING and
            # prevents loss.backward() from training the encoder head.
            images = to_var(images)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch+1, i+1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' % (epoch+1, i+1)))
def main(args):
    """Train the CNN encoder / RNN decoder captioning pair.

    Only the decoder and the encoder's final resnet.fc layer are optimized.
    Checkpoints both models every args.save_step batches.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # Compose all processing together: a tensor with (C,H,W), values in 0-1
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer (decoder + only the encoder's final fc layer)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            # FIX: removed the leftover per-batch debug prints (caption size,
            # full packed-target tensor, CNN feature shape) — they dumped
            # whole tensors to stdout on every iteration, flooding the log
            # and slowing training.
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train with a render-service round trip.

    For each batch: caption every image with the current model, POST each
    caption to a local rendering service (worker threads), reassemble the
    rendered images in batch order, and optimize an L1 reconstruction loss
    between rendered and original images.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    worker_thread_count = 1
    retry_for_failed = 2

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    #transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.L1Loss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            processed_items = []
            threads = []
            has_data_to_process = True

            def do_request(item):
                # POST one caption to the render service; on success, store
                # the transformed rendered image at its batch position.
                position = item['position']
                retry = retry_for_failed
                while retry:
                    r = requests.post('http://localhost:4567/', data=item)
                    if r.status_code == 200:
                        pil = Image.open(io.BytesIO(r.content)).convert('RGB')
                        processed_items[position] = transform(pil)
                        break
                    else:
                        print("shouldb be here")
                        time.sleep(2)
                        retry -= 1

            # Set mini-batch dataset (inference only — no gradients needed
            # for the captioning pass)
            image_tensors = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_tensors)
            outputs = decoder(features, captions, lengths)
            codes = []

            def worker():
                # Drain the shared queue until the producer is done.
                while items_to_process.qsize() > 0 or has_data_to_process:
                    item = items_to_process.get()
                    if item is None:
                        break
                    do_request(item)
                    items_to_process.task_done()
                print("ended thread processing")

            for _ in range(worker_thread_count):
                t = threading.Thread(target=worker)
                # thread dies when main thread (only non-daemon thread) exits.
                t.daemon = True
                t.start()
                threads.append(t)

            # Produce: caption each image and enqueue it for rendering.
            for ii, image in enumerate(images):
                image_tensor = to_var(image.unsqueeze(0), volatile=True)
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)
                payload = {'code': sentence}
                data = {'position': ii, 'code': sentence}
                items_to_process.put(data)
                # placeholder; do_request overwrites it on success
                processed_items.append('failed')
                codes.append(sentence)
            has_data_to_process = False

            print(codes)
            print(items_to_process.qsize())
            print(image.size())
            print("waiting for threads")
            for t in threads:
                t.join()
            print("done reassembling images")
            for t in threads:
                t.shutdown = True
                t.join()

            # Skip the batch if any render failed after all retries.
            bad_value = False
            for pi in processed_items:
                if isinstance(pi, str) and pi == "failed":
                    bad_value = True
            if bad_value == True:
                print("failed conversion,skipping batch")
                continue

            output_tensor = torch.FloatTensor(len(processed_items), 3,
                                              images.size()[2],
                                              images.size()[3])
            for ii, image_tensor in enumerate(processed_items):
                output_tensor[ii] = processed_items[ii]
            output_var = to_var(output_tensor, False)
            target_var = to_var(images, False)
            # FIX: the loss computation was commented out in the original
            # while `print(loss)` / `loss.backward()` remained, raising a
            # NameError on the first batch — restore it.
            loss = criterion(output_var, target_var)
            print("loss")
            print(loss)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Evaluate the captioning model with BLEU (Python 2 script).

    Samples captions for validation batches, translates ids to sentences,
    pickles (hypotheses, references) to hypo_out.txt, and prints several
    BLEU variants plus a token-level exact-match count.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Load the trained model parameters
    #print args.encoder_path
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    total_num = len(data_loader)*args.batch_size
    print total_num
    num_correct=0
    tested=0
    hypotheses=[]
    references=[]
    for i, (images, captions, lengths) in enumerate(data_loader):
        tested += args.batch_size
        # NOTE(review): stops after the first batch (i==1 breaks before
        # processing) — looks like a debugging limit; confirm before trusting
        # the reported scores.
        if i==1:
            break;
        # If use gpu
        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()

        # Prepare Image (volatile=True: inference only, no autograd tracking)
        images = to_var(images, volatile=True)
        captions = to_var(captions)
        #targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
        # Batch is sorted by length, so the last caption is the shortest —
        # its length caps the sampled sentence length.
        max_sent_length=captions[-1].size(0)
        print max_sent_length,'length'
        print captions.size(),'caption_size'
        #print captions[0].size()
        #print captions[0]
        #print targets.size()

        # Generate caption from image
        features=encoder(images)
        sampled_captions = decoder.sample(features,max_sent_length)
        # Reshape flat samples back to (batch, max_sent_length)
        targets=torch.transpose(sampled_captions.view(max_sent_length,-1),0,1);
        print targets.size(),'ans'
        #print targets
        #print captions
        ref_sents=translate(captions,vocab)
        hypo_sents=translate(targets,vocab)
        references.extend(ref_sents)
        hypotheses.extend(hypo_sents)
        # Token-level exact-match count between sampled and reference ids
        num_correct_t = targets.data.eq(captions.data).sum()
        print num_correct_t,'num correct'
        num_correct += num_correct_t
        #feature = encoder(image_tensor)
        #sampled_ids = decoder.sample(feature)
        #sampled_ids = sampled_ids.cpu().data.numpy()
        # Decode word_ids to words
        #sampled_caption = []
        #for word_id in sampled_ids:
        #    word = vocab.idx2word[word_id]
        #    sampled_caption.append(word)
        #    if word == '<end>':
        #        break
        #sentence = ' '.join(sampled_caption)
        # Print out image and generated caption.
        #print (sentence)

    # Persist hypotheses/references for offline scoring.
    hypo_ref_out=(hypotheses,references)
    with open('hypo_out.txt', 'wb') as handle:
        pickle.dump(hypo_ref_out,handle)
    print len(hypotheses)
    print hypotheses[0:10]
    print references[0:10]
    bleu_score=bleu.BLEU(hypotheses,[references])
    print bleu_score
    print 'num_correct',num_correct,'total',tested,total_num
    # NLTK-style corpus BLEU with several n-gram weightings.
    score = BLEU.corpus_bleu(references,hypotheses)
    score1 = BLEU.corpus_bleu(references,hypotheses,weights=[1,0,0,0])
    score2 = BLEU.corpus_bleu(references, hypotheses,weights=[0.5,0.5,0,0])
    score3 = BLEU.corpus_bleu(references, hypotheses,weights=[0.3,0.3,0.3,0])
    score4 = BLEU.corpus_bleu(references, hypotheses, weights=[0.25,0.25,0.25,0.25])
    print score,score1,score2,score3,score4