Example #1
    def __init__(self):
        self.transform = transforms.Compose([ 
            transforms.Resize(256),                          # smaller edge of image resized to 256
            transforms.CenterCrop(224),                      # get 224x224 crop from the center
            transforms.ToTensor(),                           # convert the PIL Image to a tensor
            transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                                 (0.229, 0.224, 0.225))])
        
        # Load checkpoint with best model
        self.checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'), 'cpu')
        # Specify values for embed_size and hidden_size - we use the same values as in training step
        self.embed_size = 512
        self.hidden_size = 512

        # Get the vocabulary and its size
        self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>", "<unk>", "<pad>", "", "", True)
        self.vocab_size = len(self.vocab)

        # Initialize the encoder and decoder, and set each to inference mode
        self.encoder = EncoderCNN(self.embed_size)
        self.encoder.eval()
        self.decoder = DecoderRNN(self.embed_size, self.hidden_size, self.vocab_size)
        self.decoder.eval()

        # Load the pre-trained weights
        self.encoder.load_state_dict(self.checkpoint['encoder'])
        self.decoder.load_state_dict(self.checkpoint['decoder'])
Example #2
 def __init__(self, args, vocab_len):
     super(BFM, self).__init__()
     self.encoder = EncoderCNN(args.embed_size).eval().cpu() 
     self.encoder.load_state_dict(torch.load('encoder.ckpt', map_location=torch.device('cpu')))
     self.decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_len, args.num_layers).eval().cpu() 
     self.decoder.forward = self.decoder.sample
     self.decoder.load_state_dict(torch.load('decoder.ckpt', map_location=torch.device('cpu')))
Example #3
def test(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, len(vocab), args.hidden_size,
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path, map_location='cpu'))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location='cpu'))

    image = load_img(args.img_path, transform)

    feature = encoder(image)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    print(sentence)
    image = Image.open(args.img_path)
    plt.imshow(np.asarray(image))
    plt.show()
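
Note: load_img is not defined in this snippet. A minimal sketch of what such a helper usually does (open the image, apply the transform, add a batch dimension); the exact project version may differ:

from PIL import Image

def load_img(image_path, transform=None):
    # Open the image and force three channels.
    image = Image.open(image_path).convert('RGB')
    if transform is not None:
        # Apply the preprocessing pipeline and add a batch dimension.
        image = transform(image).unsqueeze(0)
    return image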
Example #4
  def __init__(self):
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    #vars
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    with open(vocab_path, 'rb') as f:
      self.vocab = pickle.load(f)

    print("Building Model")
    # Build models
    self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab), num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # Load the trained model parameters
    self.encoder.load_state_dict(torch.load(encoder_path))
    self.decoder.load_state_dict(torch.load(decoder_path))
Example #5
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Displays an original image from the COCO test dataset and prints its associated caption.

    encoder_file:   Name of the encoder to load.
    decoder_file:   Name of the decoder to load.
    embed_size:     Word embedding size for the encoder.
    hidden_size:    Size of the LSTM hidden layer.
    from_cpu:       Whether the model has been saved on CPU.
    """
    # Define transform
    transform_test = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.ToTensor(),  # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225))
    ])

    # Device to use for inference
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the data loader.
    data_loader = get_loader(transform=transform_test, mode='test')

    # Obtain sample image
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    if from_cpu:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file),
                       map_location='cpu'))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file),
                       map_location='cpu'))
    else:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
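
Note: get_prediction is defined elsewhere in that project. A rough sketch of the usual logic, assuming a clean_sentence helper like the one sketched after Example #17:

def get_prediction(encoder, decoder, data_loader, device):
    # Pull one original/preprocessed image pair from the test loader.
    orig_image, image = next(iter(data_loader))
    image = image.to(device)
    # Embed the image and let the decoder sample a word-id sequence.
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)
    # Assumed helper: turns the id sequence into a readable caption.
    sentence = clean_sentence(output, data_loader.dataset.vocab)
    print(sentence)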
Example #6
class Neuraltalk2:

  def __init__(self):
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    #vars
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    with open(vocab_path, 'rb') as f:
      self.vocab = pickle.load(f)

    print("Building Model")
    # Build models
    self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab), num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # Load the trained model parameters
    self.encoder.load_state_dict(torch.load(encoder_path))
    self.decoder.load_state_dict(torch.load(decoder_path))

  def eval_image(self, image_path):
    # Prepare an image
    image = load_image(image_path, self.transform)
    image_tensor = image.to(self.device)
    
    # Generate a caption from the image
    feature = self.encoder(image_tensor)
    sampled_ids = self.decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
      word = self.vocab.idx2word[word_id]
      if word == '<end>':
        break
      if word == '<start>':
        continue
      sampled_caption.append(word)
        
    sentence = ' '.join(sampled_caption)
    return sentence
Example #7
def epoch_training(train_iter,
                   val_iter,
                   num_epoch=100,
                   learning_rate=1e-4,
                   hidden_size=100,
                   early_stop=False,
                   patience=2,
                   epsilon=1e-4):
    # define model
    encoder = EncoderRNN(input_size=len(EN.vocab), hidden_size=hidden_size)
    decoder = DecoderRNN(hidden_size=hidden_size, output_size=len(DE.vocab))

    # define loss criterion
    criterion = nn.NLLLoss(ignore_index=PAD_token)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    losses = np.ndarray(patience)

    res_loss = 13
    res_encoder = None
    res_decoder = None
    res_epoch = 0
    base_bleu = 0
    not_updated = 0

    for epoch in range(num_epoch):
        tl = train(train_iter, encoder, decoder, encoder_optimizer,
                   decoder_optimizer, criterion)
        loss, val_bleu = evaluate(val_iter, encoder, decoder, criterion)
        logging.warning('******Epoch: ' + str(epoch) + ' Training Loss: ' +
                        str(tl) + ' Validation Loss: ' + str(loss) +
                        ' Validation Bleu: ' + str(val_bleu) + '*********')
        # save the model with the best validation BLEU
        if base_bleu <= val_bleu:
            base_bleu = val_bleu
            res_loss = loss
            res_encoder = encoder
            res_decoder = decoder
            res_epoch = epoch
            not_updated = 0
            logging.warning('Updated validation loss as ' + str(res_loss) +
                            ' with validation Bleu as ' + str(base_bleu) +
                            ' at epoch ' + str(res_epoch))
        else:
            not_updated += 1
        if not_updated == patience:
            break
    print('Stop at Epoch: ' + str(res_epoch) + ", With Validation Loss: " +
          str(res_loss) + ", Validation Bleu: " + str(base_bleu))
    logging.warning('Stop at Epoch: ' + str(res_epoch) +
                    ", With Validation Loss: " + str(res_loss) +
                    ", Validation Bleu: " + str(base_bleu))
    return res_loss, res_encoder, res_decoder, base_bleu
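
A typical call, assuming train_iter and val_iter are torchtext iterators and the EN/DE fields are already built:

best_loss, best_encoder, best_decoder, best_bleu = epoch_training(
    train_iter, val_iter, num_epoch=50, hidden_size=256,
    early_stop=True, patience=3)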
Example #8
class BFM(nn.Module):
    def __init__(self, args, vocab_len):
        super(BFM, self).__init__()
        self.encoder = EncoderCNN(args.embed_size).eval().cpu() 
        self.encoder.load_state_dict(torch.load('encoder.ckpt', map_location=torch.device('cpu')))
        self.decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_len, args.num_layers).eval().cpu() 
        self.decoder.forward = self.decoder.sample
        self.decoder.load_state_dict(torch.load('decoder.ckpt', map_location=torch.device('cpu')))
        
    def forward(self, image):
        feature = self.encoder(image)
        sampled_ids = self.decoder(feature)
        return sampled_ids
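
Rebinding decoder.forward to decoder.sample makes the encoder-decoder pair callable as a single nn.Module, which is convenient for tracing or export. A usage sketch with a dummy ImageNet-shaped input (sizes assumed):

model = BFM(args, len(vocab))
image = torch.rand(1, 3, 224, 224)  # dummy input
with torch.no_grad():
    sampled_ids = model(image)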
Example #9
def main():
    # Load vocabulary wrapper.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    #load data
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)

    img_features = image_features['fc7'][0]
    img_features = np.concatenate(img_features)

    print('here')
    iteration = 0
    save_loss = []
    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch', i)
        for x, y in make_mini_batch(img_features,
                                    image_data,
                                    use_caption=use_caption):
            word_padding, lengths = make_word_padding(y, vocab)

            x = Variable(torch.from_numpy(x).cuda())
            word_index = Variable(torch.from_numpy(word_padding).cuda())

            encoder.zero_grad()
            decoder.zero_grad()

            features = encoder(x)
            targets = pack_padded_sequence(word_index,
                                           lengths,
                                           batch_first=True)[0]
            outputs = decoder(features, word_index, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('loss', loss.item())
                save_loss.append(loss.item())

            iteration += 1

        torch.save(decoder.state_dict(), 'decoder.pkl')
        torch.save(encoder.state_dict(), 'encoder.pkl')
        with open('losses.txt', 'w') as f:
            print(save_loss, file=f)
Example #10
def main():
    ### load word embedding
    pickle_file = open(embedding_path, "rb")
    word_embedding = pickle.load(pickle_file)
    pickle_file.close()

    word_index = word_embedding[0]
    embedding_map = word_embedding[1]
    output_size = len(word_index)

    ### initialize model
    hidden_size = 100
    encoder = EncoderRNN(hidden_size)
    decoder = DecoderRNN(hidden_size, output_size)

    ### load train data
    parser = AcademicParser("../train_data/Academic_papers/docs.json")
    abstracts = parser.get_paperAbstract()
    titles = parser.get_title()
    assert (len(abstracts) == len(titles))

    ### prepare train data
    train_set = []
    for i in range(len(abstracts)):
        abstract = abstracts[i]
        title = titles[i]
        new_pair = variablesFromPair((abstract, title), word_index,
                                     embedding_map)
        if (len(new_pair[1]) > 0):
            train_set.append(new_pair)

    trainIters(encoder, decoder, 20000, train_set)
Example #11
def main():
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()

    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return

    content = file.getvalue()

    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'

    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image)
    image = image.to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)

    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)

    file.close()
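
Note: a Streamlit script like this one is launched from the shell with "streamlit run <script>.py" rather than executed directly with the Python interpreter.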
Example #12
def main(args):
    # Image preprocessing
    transform = transforms.Compose([ 
        transforms.ToTensor(), 
        transforms.Normalize((0.033, 0.032, 0.033), 
                             (0.027, 0.027, 0.027))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    len_vocab = vocab.idx 

    # Build Models
    encoder = ResNet(ResidualBlock, [3, 3, 3], len_vocab)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)

    decoder = DecoderRNN(len_vocab, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    attn_encoder = AttnEncoder(ResidualBlock, [3, 3, 3])
    attn_encoder.eval()
    attn_decoder = SANDecoder(args.feature_size, args.hidden_size, 
                         len(vocab), args.num_layers)

    # Load the trained model parameters
    attn_encoder.load_state_dict(torch.load(args.encoder_path))
    attn_decoder.load_state_dict(torch.load(args.decoder_path))


    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        attn_encoder.cuda(1)
        attn_decoder.cuda(1)
    
    # Generate caption from image
    feature = attn_encoder(image_tensor)
    sampled_ids = attn_decoder.sample(feature)
    ids_arr = []
    for element in sampled_ids: 
        temp = element.cpu().data.numpy()
        ids_arr.append(int(temp))

    # Decode word_ids to words
    sampled_caption = []
    for word_id in ids_arr:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out image and generated caption.
    print(sentence)
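
Note: several of the older snippets here rely on a to_var helper. In the pre-0.4 PyTorch code they come from, it conventionally wraps a tensor in an autograd Variable and moves it to the GPU when one is available; a sketch of that conventional definition:

import torch
from torch.autograd import Variable

def to_var(x, volatile=False):
    # Move to GPU when available (pre-0.4 PyTorch idiom).
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)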
Example #13
def main(args):

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    example = torch.rand(1, 3, 224, 224)
    encoder = EncoderCNN(args.embed_size).eval().cpu()
    encoder.load_state_dict(torch.load('encoder.ckpt'))
    traced_script_module = torch.jit.trace(encoder, example)
    traced_script_module.save("./encoder.pt")

    example = torch.rand(1, 256)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).eval().cpu()
    decoder.load_state_dict(torch.load('decoder.ckpt'))
    traced_script_module = torch.jit.trace(decoder, example)
    traced_script_module.save("./decoder.pt")
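
Once saved, the traced modules can be reloaded without the Python class definitions via torch.jit.load. Note that tracing the decoder with a single (1, 256) tensor assumes its forward takes only the feature vector (e.g., rebound to sample as in Example #8). A minimal reload sketch:

import torch

encoder = torch.jit.load('./encoder.pt')
decoder = torch.jit.load('./decoder.pt')
with torch.no_grad():
    feature = encoder(torch.rand(1, 3, 224, 224))
    sampled_ids = decoder(feature)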
Example #14
def main(image):
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))
    # Prepare Image
    image = Image.open(image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word_id == 96:
            sampled_caption.append('<end>')
            break
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
    return sentence
Example #15
def main(args):
    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Load all images in val folder
    for i in onlyfiles:

        badsize = 0  # count the unloaded images
        args_image = filepath + '/'  # val folder path with image names
        args_image = args_image + i

        # transform image and wrap it to tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        if torch.cuda.is_available():  # load GPU
            encoder.cuda()
            decoder.cuda()

            # generate caption from image
            try:
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()

                # decode word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)

                # print out image and generated caption without start and end
                print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])

            except:
                badsize = badsize + 1  # count images that failed to caption
Example #16
def main(args):
    vectore_dir = '/root/server/best_model/'

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = EncoderCNN(args.embed_size)
    qvecs_pca = np.load(
        os.path.join(vectore_dir, "q_2{}.npy".format(args.embed_size)))
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    # encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        # encoder.cuda()
        decoder.cuda()

    data = []
    # img_path = args.image
    # # Prepare Image
    # image = load_image(img_path, transform)
    # image_tensor = to_var(image, volatile=True)
    # Generate caption from image
    # feature = encoder(image_tensor)
    num = 29
    feature = torch.from_numpy(qvecs_pca[num:num + 1, :]).cuda()
    #pdb.set_trace()
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<start>':
            continue
        if word == '<end>':
            break
        sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)
    # Print out image and generated caption.
    print(sentence)
Example #17
class Annotator():
    def __init__(self):
        self.transform = transforms.Compose([ 
            transforms.Resize(256),                          # smaller edge of image resized to 256
            transforms.CenterCrop(224),                      # get 224x224 crop from the center
            transforms.ToTensor(),                           # convert the PIL Image to a tensor
            transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                                 (0.229, 0.224, 0.225))])
        
        # Load checkpoint with best model
        self.checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'), 'cpu')
        # Specify values for embed_size and hidden_size - we use the same values as in training step
        self.embed_size = 512
        self.hidden_size = 512

        # Get the vocabulary and its size
        self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>", "<unk>", "<pad>", "", "", True)
        self.vocab_size = len(self.vocab)

        # Initialize the encoder and decoder, and set each to inference mode
        self.encoder = EncoderCNN(self.embed_size)
        self.encoder.eval()
        self.decoder = DecoderRNN(self.embed_size, self.hidden_size, self.vocab_size)
        self.decoder.eval()

        # Load the pre-trained weights
        self.encoder.load_state_dict(self.checkpoint['encoder'])
        self.decoder.load_state_dict(self.checkpoint['decoder'])

        # Move models to GPU if CUDA is available.
        #if torch.cuda.is_available():
        #   encoder.cuda()
        #   decoder.cuda()

    def annotate(self, image):
        transformed = self.transform(image).unsqueeze(0)
        features = self.encoder(transformed).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        output = self.decoder.sample_beam_search(features)
        print('example output:', output)
        sentence = clean_sentence(output[0], self.vocab)
        print('example sentence:', sentence)
        return sentence
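
Note: clean_sentence is project-specific and not shown here. A minimal sketch that drops the special tokens, assuming the vocabulary exposes idx2word:

def clean_sentence(output, vocab):
    # Map sampled ids to words, stopping at <end> and skipping specials.
    words = []
    for idx in output:
        word = vocab.idx2word[idx]
        if word == '<end>':
            break
        if word not in ('<start>', '<pad>', '<unk>'):
            words.append(word)
    return ' '.join(words)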
Example #18
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)
    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
Example #19
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    
    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #20
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    sentence = sentence.replace('<start> ',
                                '').replace(' <end>', '').replace('.',
                                                                  '').strip()
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Example #21
    def setUpClass(cls):
        cls.pre_processing = PreProcessing(sentences)
        cls.dataset = ds.process(cls.pre_processing)
        cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

        encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
        decoder = DecoderRNN(300, cls.word_embedding, 0.0,
                             1).to(settings.device)
        cls.model = Model(encoder, decoder)
        cls.model.train(cls.dataset)
Example #22
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    try:
        img_path = args.image
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)
        # Generate caption from image
        feature = encoder(image_tensor)
        #pdb.set_trace()
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except:
        print(img_path)
Example #23
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except:
            continue
        
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        
        # Print out image and generated caption.
        print(image_id + '\t' + sentence)
Example #24
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = EncoderCNN(args.embed_size)
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size, 100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    layout_encoder.load_state_dict(torch.load(args.layout_encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    
    # If use gpu
    if torch.cuda.is_available():
        layout_encoder.cuda()
        decoder.cuda()

    # validation(layout_encoder,decoder, args,vocab,transform,args.batch_size)
    out = save_output(layout_encoder,decoder, args,vocab,transform,args.batch_size)
    with open('bsl_output.txt', 'w') as outfile:
        json.dump(out, outfile)
Example #25
File: test.py  Project: qiz2/dl_project
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    total_step = len(data_loader)

    # List to score the BLEU scores
    bleu_scores = []

    for i, (images, captions, lengths) in enumerate(data_loader):
        
        # Set mini-batch dataset
        images = images.to(device)
        # captions = captions.to(device)
        
        # Generate a caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        score = sentence_bleu(captions, sentence, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                  .format(i, total_step, np.mean(bleu_scores)))

    np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
Example #26
    def getCaption(self,
                   imgs,
                   output_path='',
                   vocab_path='data/vocab.pkl',
                   decoder_path='models/decoder-5-3000.pkl',
                   encoder_path='models/encoder-5-3000.pkl',
                   embed_size=256,
                   hidden_size=512,
                   num_layers=1):
        if (output_path == ''):
            output_path = self.DEFAULT_OUTPUT_PATH
        device = self.device
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build models
        encoder = EncoderCNN(embed_size).eval(
        )  # eval mode (batchnorm uses moving mean/variance)
        decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(torch.load(encoder_path))
        decoder.load_state_dict(torch.load(decoder_path))

        CAPTIONS = []

        for img in imgs:
            # Prepare an image
            image = self.load_image(img, transform=transform)
            image_tensor = image.to(device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy(
            )  # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # Print out the image and the generated caption
            CAPTIONS.append(self.prune_caption(sentence))

        json_captions = self.writeJSON(imgs, CAPTIONS, output_path=output_path)

        return json_captions
Example #27
def main(args):
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)

        with open(args.filepath, 'wb') as f:
            pickle.dump((scores, scores_u), f)
Example #28
def main():
    args = parse_arguments()
    hidden_size = 300
    embed_size = 50
    kld_weight = 0.05
    temperature = 0.9
    use_cuda = torch.cuda.is_available()

    print("[!] preparing dataset...")
    TEXT = data.Field(lower=True, fix_length=30)
    LABEL = data.Field(sequential=False)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, max_size=250000)
    LABEL.build_vocab(train_data)
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data), batch_size=args.batch_size, repeat=False)
    vocab_size = len(TEXT.vocab) + 2

    print("[!] Instantiating models...")
    encoder = EncoderRNN(vocab_size,
                         hidden_size,
                         embed_size,
                         n_layers=2,
                         dropout=0.5,
                         use_cuda=use_cuda)
    decoder = DecoderRNN(embed_size,
                         hidden_size,
                         vocab_size,
                         n_layers=2,
                         dropout=0.5,
                         use_cuda=use_cuda)
    vae = VAE(encoder, decoder)
    optimizer = optim.Adam(vae.parameters(), lr=args.lr)
    if use_cuda:
        print("[!] Using CUDA...")
        vae.cuda()

    best_val_loss = None
    for e in range(1, args.epochs + 1):
        train(e, vae, optimizer, train_iter, vocab_size, kld_weight,
              temperature, args.grad_clip, use_cuda, TEXT)
        val_loss = evaluate(vae, test_iter, vocab_size, kld_weight, use_cuda)
        print("[Epoch: %d] val_loss:%5.3f | val_pp:%5.2f" %
              (e, val_loss, math.exp(val_loss)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            print("[!] saving model...")
            if not os.path.isdir("snapshot"):
                os.makedirs("snapshot")
            torch.save(vae.state_dict(), './snapshot/vae_{}.pt'.format(e))
            best_val_loss = val_loss
Example #29
def run_inference(image_path,
                  encoder_path,
                  decoder_path,
                  vocab_path,
                  embed_size=256,
                  hidden_size=512,
                  num_layers=1):
    print('sample.py running ...')
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        print("using " + vocab_path)
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(encoder_path, map_location=torch.device('cpu')))
    decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device('cpu')))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        print(word)
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption).replace('<start>', '')
    sentence = sentence.replace('<end>', '')
    sentence = sentence.replace('_', ' ')

    # Print out the image and the generated caption
    print(sentence)

    print('debug: finished running')
    return sentence.strip().capitalize()
Example #30
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)
    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args)
    encoder.to(device)
    decoder.to(device)

    encoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['encoder_name'])))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['decoder_name'])))

    test_caption_list = []
    for file_name in os.listdir(
            os.path.join(args['data_dir'], args['image_dir'])):
        if os.path.isfile(
                os.path.join(args['data_dir'], args['image_dir'], file_name)):
            image = load_image(
                os.path.join(args['data_dir'], args['image_dir'], file_name),
                transform)
            image_tensor = image.to(device)
        else:
            continue

        feature = encoder(image_tensor)
        sample_ids = decoder.sample(feature)
        sample_ids = sample_ids[0].cpu().numpy()

        sample_caption = []
        for word_id in sample_ids:
            word = vocab.idx2word[word_id]
            sample_caption.append(word)
            if word == '<end>':
                break

        sentence = ' '.join(sample_caption)
        print(sentence)
        test_caption_list.append((file_name, sentence))


#        image=Image.open(os.path.join(args['data_dir'],args['image_dir'],file_name))
#        plt.imshow(np.asarray(image))

    with open(os.path.join(args['data_dir'], 'test_caption.txt'), 'w') as f:
        for item in test_caption_list:
            f.write('image_name:{} ---- generated_caption:{}\n'.format(
                item[0], item[1]))
            f.write('\n')
Example #31
    def get_caption(self, img_tensor):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(device)
        print("running")

        # Models
        encoder_file = 'legit_model/encoder_1.pkl'
        decoder_file = 'legit_model/decoder_1.pkl'

        # Embed and hidden
        embed_size = 512
        hidden_size = 512

        # The size of the vocabulary.
        vocab_size = 8856

        # Initialize the encoder and decoder, and set each to inference mode.
        encoder = EncoderCNN(embed_size)
        encoder.eval()

        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
        decoder.eval()

        # Load the trained weights.
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

        # Move models to GPU if CUDA is available.
        encoder.to(device)
        decoder.to(device)

        img_d = img_tensor.to(device)

        # Obtain the embedded image features.
        features = encoder(img_d).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        img_output = decoder.sample(features)

        sentence = self.clean_sentence(img_output)

        return sentence
Example #32
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out image and generated caption.
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #33
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
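
pack_padded_sequence flattens the padded caption batch time-major so the loss only sees real tokens. A small self-contained illustration of that effect:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 4, 5, 2, 0],
                         [1, 7, 2, 0, 0]])  # 0 = <pad>
lengths = [4, 3]                            # true lengths, sorted descending
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 1, 4, 7, 5, 2, 2]) -- padding dropped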
Example #34
# Image preprocessing (the top of this snippet was cut off; transform_train
# is reconstructed here under the usual conventions of these examples)
transform_train = transforms.Compose([
    transforms.Resize(256),                 # assumed
    transforms.RandomCrop(224),             # assumed
    transforms.RandomHorizontalFlip(),      # assumed
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) +\
         list(encoder.embed.parameters())  # We don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)
Example #35
# Build data loader.
data_loader = get_loader(transform=transform_test,
                         mode='test_small',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

vocab = data_loader.dataset.vocab
# The size of the vocabulary.
vocab_size = len(vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cpu")
# encoder.to(device)
# decoder.to(device)

# Load the pretrained model
encoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('encoder')))
decoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('decoder')))

encoder.eval()
decoder.eval()

images, conv_images = next(iter(data_loader))
features = encoder(conv_images).unsqueeze(1)
output = decoder.sample(features, max_len=max_len)
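
decoder.sample returns raw word ids; under the same conventions as the other examples, they can be turned back into text with the loader's vocabulary:

words = []
for idx in output:
    word = vocab.idx2word[idx]
    if word == '<end>':
        break
    if word != '<start>':
        words.append(word)
print(' '.join(words))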
Example #36
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))