Example #1
def directory_to_sentence_matrix(directory):
    sentence_dict = {}
    for j in range(1, 7):
        filename = directory + "/" + str(j) + ".jpg"
        checkpoint = torch.load(
            '/home/yerlan/HackNU/a-PyTorch-Tutorial-to-Image-Captioning/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'
        )
        decoder = checkpoint['decoder']
        decoder = decoder.to(device)
        decoder.eval()
        encoder = checkpoint['encoder']
        encoder = encoder.to(device)
        encoder.eval()

        # Load word map (word2ix)
        with open(
                '/home/yerlan/HackNU/a-PyTorch-Tutorial-to-Image-Captioning/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json',
                'r') as t:
            word_map = json.load(t)
        rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

        sentence_array = []
        # Encode, decode with attention and beam search (beam sizes 1-5)
        for i in range(1, 6):
            seq, alphas = caption.caption_image_beam_search(
                encoder, decoder, filename, word_map, i)
            alphas = torch.FloatTensor(alphas)

            # Visualize caption and attention of best sequence
            sentence_array.append(
                caption.return_sentence(filename, seq, alphas, rev_word_map))
        sentence_dict[j] = sentence_array

    return sentence_dict
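The caption.return_sentence helper used above is not shown in this snippet. Below is a minimal sketch of what such a helper might do, assuming the word map uses the usual <start>, <end> and <pad> tokens; the name return_sentence_sketch and its exact behaviour are assumptions, not the project's actual code.

def return_sentence_sketch(seq, rev_word_map):
    # Map predicted token indices back to words and drop special tokens.
    words = []
    for ind in seq:
        word = rev_word_map[ind]
        if word == '<end>':
            break
        if word not in ('<start>', '<pad>'):
            words.append(word)
    return ' '.join(words)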
Example #2
def capting():
    try:
        img_obj = request.files['picture']
    except:
        report(traceback.format_exc())
        logging.exception('Error with image upload')
        return 'Error with image upload', 500
    try:
        beam_arg = request.args['beam_size']
        #beam = request.files['beam_size']
        assert 0 < int(beam_arg) < 10
        beam = int(beam_arg)
    except:
        report(traceback.format_exc())
        logging.exception('Invalid beam input')
        beam = 5
    seq, alphas = caption.caption_image_beam_search(encoder,
                                                    decoder,
                                                    img_obj,
                                                    word_map,
                                                    beam_size=beam)
    # seq is a list of numbers
    try:
        words = [rev_word_map[ind] for ind in seq]
    except:
        report(traceback.format_exc())
        return 'can not get word from seq', 500
    return jsonify(words)
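A possible client-side call for an endpoint like capting() above, assuming it is served at /capting; the URL is an assumption, and only the 'picture' field and the beam_size query parameter come from the code.

import requests

url = 'http://localhost:5000/capting'  # hypothetical route; the decorator is not shown
with open('example.jpg', 'rb') as f:
    resp = requests.post(url, files={'picture': f}, params={'beam_size': 5})
print(resp.json())  # list of words, still including the <start>/<end> tokens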
Example #3
def caption(image_path, args):
    if not os.path.exists(args['model']) or not os.path.exists(args['word_map']):
        print('Pretrained model files not found.\n', args)
        return None

    # Load model
    checkpoint = torch.load(args['model'], map_location=torch.device('cpu'))
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()
    
    # Load word map (word2idx)
    with open(args['word_map'], 'r') as j:
        word_map = json.load(j)
        
    # idx2word
    rev_word_map = {v: k for k, v in word_map.items()}  
    

    seq, alphas = caption_image_beam_search(encoder, decoder, image_path, word_map, args['beam'])
    
    # Convert token indices to words, skipping <start> and stopping at <end>
    sampled_caption = []
    for ind in seq[1:]:
        word = rev_word_map[ind]
        if word == '<end>':
            break
        sampled_caption.append(word)
        
    sentence = ' '.join(sampled_caption)

    return sentence
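A short usage sketch for caption() above; the checkpoint and word map paths are placeholders, and the keys 'model', 'word_map' and 'beam' are taken directly from the code.

args = {
    'model': 'BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar',
    'word_map': 'WORDMAP_coco_5_cap_per_img_5_min_word_freq.json',
    'beam': 5,
}
sentence = caption('example.jpg', args)
print(sentence)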
Example #4
 def getDescription(self, img_path, beam_size=5):
     # Encode, decode with attention and beam search
     seq, alphas = caption_image_beam_search(self.encoder, self.decoder,
                                             img_path, self.word_map,
                                             beam_size)
     alphas = torch.FloatTensor(alphas)
     #Final predicted sentence
     words = [self.rev_word_map[ind] for ind in seq]
     return words
Example #5
def main():
    print("Initializing...")
    config = Config()
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(config.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded")

    #Build models
    encoder = Encoder().eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = Decoder(vocab_size=len(vocab),
                      use_glove=False,
                      use_bert=config.bert_model,
                      vocab=vocab,
                      device=device,
                      BertTokenizer=tokenizer,
                      BertModel=BertModel)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    print("Model built")
    encoder_path = config.encoder_path
    decoder_path = config.decoder_path
    encoder.load_state_dict(torch.load(encoder_path), strict=False)
    decoder.load_state_dict(torch.load(decoder_path), strict=False)

    print("Model loaded")
    images = get_val_images(config.validation_path)
    print(f"Length of images: {len(images)}")

    print("Validation file loaded")
    results_data = []
    curr_id = 0

    for index, image_data in enumerate(images):
        try:
            print(f"Index: {index}")
            image_path = config.val_img_path + image_data['file_name']
            image = load_image(image_path, transform=transform)
            image_tensor = image.to(device)

            caption_idx, _ = caption_image_beam_search(encoder=encoder,
                                                       decoder=decoder,
                                                       word_map=vocab.word2idx,
                                                       image=image_tensor,
                                                       device=device)
            print(f"Caption index: {caption_idx}")
        except Exception as e:
            print(e)
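The load_image helper referenced above is not shown. A minimal sketch of what it might do, assuming a PIL-based loader that applies the torchvision transform and adds a batch dimension; the 224x224 resize is an assumption.

from PIL import Image

def load_image(image_path, transform=None):
    # Open the image, force RGB, and resize to the encoder's assumed input size.
    image = Image.open(image_path).convert('RGB')
    image = image.resize((224, 224))
    if transform is not None:
        image = transform(image)    # tensor of shape (3, 224, 224)
        image = image.unsqueeze(0)  # add batch dimension -> (1, 3, 224, 224)
    return image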
Example #6
def captionGen():
    response = request.files
    image = np.array(Image.open(response['image']))
    seq, alphas = caption_image_beam_search(encoder, decoder, image, word_map,
                                            5)
    words = [rev_word_map[ind] for ind in seq]
    # Join all words except the <start> and <end> tokens
    output = ' '.join(words[1:-1])
    return output
Example #7
def predict():
    # beam = None

    try:
        img_obj = request.files['picture']
    except:
        report(traceback.format_exc())
        logging.exception('Error with image upload')
        return 'Error with image upload', 500
    try:
        beam_arg = request.args['beam_size']
        #beam = request.files['beam_size']
        assert 0 < int(beam_arg) < 10
        beam = int(beam_arg)
    except:
        report(traceback.format_exc())
        logging.exception('Invalid beam input')
        beam = 5

    try:
        translate_api = request.args['translate_api']
    except:
        report(traceback.format_exc())
        logging.exception(
            'no translator api specified, using the one in the conf file')
    start = time.time()

    seq, alphas = caption.caption_image_beam_search(encoder,
                                                    decoder,
                                                    img_obj,
                                                    word_map,
                                                    beam_size=beam)
    end = time.time()
    cap_elapse = end - start
    print('caption used ', cap_elapse, 'seconds')
    # seq is a list of numbers
    try:
        words = [rev_word_map[ind] for ind in seq]
    except:
        report(traceback.format_exc())
        return 'can not get word from seq', 500
    # words is a list of string
    try:
        start = time.time()
        r = translate(words, translate_api)
        end = time.time()
        trans_elapse = end - start
        print('translate used:', trans_elapse)
    except:
        report(traceback.format_exc())
        return 'translate failed', 500
    if r.status_code == 500:
        return 'translation server give 500'

    return r.text
Example #8
def run_samples(encoder, decoder, fs, n, path_prefix, word_map, rev_word_map):
    all_chosen = np.random.choice(len(fs), n)
    for i in all_chosen:
        f = fs[i]
        # Encode, decode with attention and beam search
        seq, alphas = caption_image_beam_search(encoder, decoder, f, word_map,
                                                5)
        alphas = torch.FloatTensor(alphas)

        # Visualize caption and attention of best sequence
        visualize_att(f, seq, alphas, rev_word_map,
                      f'{path_prefix}_{i}_result.png')
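visualize_att is not shown in the snippet above. A rough sketch of such a helper, assuming alphas holds one 14x14 attention map per predicted token; the grid size and the plotting layout are assumptions, not the project's actual code.

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

def visualize_att_sketch(image_path, seq, alphas, rev_word_map, out_path):
    # Plot each predicted word with its attention map overlaid on the image.
    image = Image.open(image_path).convert('RGB').resize((336, 336))
    words = [rev_word_map[ind] for ind in seq]
    cols = 5
    rows = int(np.ceil(len(words) / cols))
    for t, word in enumerate(words):
        plt.subplot(rows, cols, t + 1)
        plt.text(0, 1, word, color='black', backgroundcolor='white', fontsize=10)
        plt.imshow(image)
        att_map = np.array(alphas[t]).reshape(14, 14)
        # Stretch the coarse attention grid over the whole image.
        plt.imshow(att_map, alpha=0.6, extent=(0, 336, 336, 0), cmap='gray')
        plt.axis('off')
    plt.savefig(out_path)
    plt.close()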
Example #9
def hello_world():
    print(request.files['file'])
    file = request.files['file']
    if file:
        filename = secure_filename(file.filename)
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))

    img_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)  # use the secured filename that was actually saved
    #print(img_path)
    seq, alphas = caption_image_beam_search(encoder, decoder, img_path,
                                            word_map, 1)
    words = [rev_word_map[ind] for ind in seq]
    words = ' '.join(words[1:-1])  # drop the <start> and <end> tokens
    print(words)
    f = open("./templates/result.html", "w")
    f.write(words)
    return words
Example #10
def main_image_caption(image_file, beam_size=5, dont_smooth=''):
    """
    Generate a caption for an image; called as a script entry point.
    :param image_file: URI of a local file, or the URL of an image
    :return:
    """
    logging.info('get a new query\t{}'.format(image_file))
    res = {'en': '', 'zh': ''}
    # Encode, decode with attention and beam search
    seq, alphas = caption.caption_image_beam_search(encoder, decoder, image_file, word_map, beam_size)
    alphas = torch.FloatTensor(alphas)
    # Decode the image2text result and translate it into Chinese
    words_l = [rev_word_map[ind] for ind in seq]
    en_words = ' '.join(words_l[1: -1])
    bdt = baidu_translate.BaiDuTranslate()
    zh_word_dict = bdt.translate(en_words, 'en', 'zh')
    zh_words = zh_word_dict['trans_result'][0]['dst']
    res['en'] = en_words
    res['zh'] = zh_words
    logging.info('return a caption\t{}'.format(json.dumps(res, ensure_ascii=False)))
    return res
Example #11
def infer_caption_by_master(img_path, json_path, model, vocab_path,
                            prediction_path, id2class_path):
    """
    Run caption inference over every image listed in a COCO-style annotation file.

    :param img_path: directory containing the images referenced in json_path
    :param json_path: path to the annotation JSON with an 'images' list
    :param model: checkpoint path (overridden by a hard-coded path below)
    :param vocab_path: unused in this snippet
    :param prediction_path: unused in this snippet
    :param id2class_path: unused in this snippet
    :return: list of {'image_id', 'caption'} prediction entries
    """

    model = '/home/dexter/show_attend_tell/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'

    # Load model
    checkpoint = torch.load(model)
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()

    word_map = '/home/dexter/show_attend_tell/caption data/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json'

    # Load word map (word2ix)
    with open(word_map, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    annotation_path = json_path
    with open(annotation_path) as json_file:
        data = json.load(json_file)

    images = data['images']

    # Prediction for every class
    prediction = []
    # Prediction splitted by class
    prediction_class = {}

    img_num = len(images)
    img_gray_num = 0
    for i, img in enumerate(images, 1):
        image_id = img['id']
        path = img_path + img['file_name']
        _, _, sentence = caption.caption_image_beam_search(rev_word_map,
                                                           encoder,
                                                           decoder,
                                                           path,
                                                           word_map,
                                                           beam_size=3)
        entry = {}
        entry['image_id'] = image_id
        entry['caption'] = sentence
        prediction.append(entry)
        if i % 100 == 0:
            print("Inferred on {}/{} images on test set".format(i, img_num))

    return prediction
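prediction_path is accepted but not used in the snippet above. A hedged sketch of how the returned predictions could be written out in the COCO captioning results format; this is an assumption about the intended use, not the project's actual code.

import json

predictions = infer_caption_by_master(img_path, json_path, model, vocab_path,
                                      prediction_path, id2class_path)
# Each entry is {'image_id': ..., 'caption': ...}, which matches the results
# format expected by the COCO caption evaluation tools.
with open(prediction_path, 'w') as f:
    json.dump(predictions, f)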
Example #12
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Batches
    for i, (img, caps, caplen) in enumerate(val_loader):

        seq, alphas = caption_image_beam_search(encoder, decoder, img,
                                                word_map, beam_size)

        if i % (args.log_step / 10) == 0:
            print('Validation: [{0}/{1}]\t'.format(i, len(val_loader)))

        # References
        # caps = caps[sort_ind]  # because images were sorted in the decoder
        img_caps = caps[0].tolist()

        # img_captions = list(
        #     map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
        #         img_caps))  # remove <start> and pads
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in {
                        word_map['<start>'],
                        word_map['<end>'],
                        word_map['<pad>']
                    }
                ], [img_caps]))  # remove <start>, <end> and pads

        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            w for w in seq if w not in
            {word_map['<start>'],
             word_map['<end>'],
             word_map['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses, emulate_multibleu=True)

    print('\n * BLEU-4 - {bleu}\n'.format(bleu=bleu4))

    return bleu4
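The emulate_multibleu argument used above appears to exist only in older NLTK releases. A small illustration of the reference/hypothesis nesting that corpus_bleu expects, using toy word tokens instead of the integer token IDs produced by validate().

from nltk.translate.bleu_score import corpus_bleu

# One entry per image: a list of reference captions (each a token list),
# paired with a single hypothesis token list.
references = [[['a', 'dog', 'runs', 'on', 'grass'],
               ['a', 'dog', 'is', 'running', 'outside']]]
hypotheses = [['a', 'dog', 'runs', 'outside']]
print(corpus_bleu(references, hypotheses))  # corpus-level BLEU-4 by default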