    # Pipeline for image regions
    image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=50, load_in_tmp=False)

    # Pipeline for text
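    # (spaCy tokenization, lowercased, punctuation removed; '<bos>'/'<eos>'
    # delimit every caption)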
    text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy',
                           remove_punctuation=True, nopoints=False)

    # Create the dataset
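    # args.annotation_folder is passed twice because, in the reference
    # implementation, the COCO wrapper reads the caption JSONs and the
    # Karpathy-split id files from separate (here identical) roots.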
    dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder)
    train_dataset, val_dataset, test_dataset = dataset.splits

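    # Build the vocabulary once and cache it; later runs with the same
    # experiment name reuse the pickled vocab. Tokens seen fewer than 5 times
    # in train+val map to '<unk>'.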
    if not os.path.isfile('vocab_%s.pkl' % args.exp_name):
        print("Building vocabulary")
        text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
        pickle.dump(text_field.vocab, open('vocab_%s.pkl' % args.exp_name, 'wb'))
    else:
        text_field.vocab = pickle.load(open('vocab_%s.pkl' % args.exp_name, 'rb'))

    # Model and dataloaders
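    # Encoder: 3 layers, padding index 0, with `m` learned memory slots per
    # attention layer. Decoder: vocab-sized output, captions up to 54 tokens,
    # 3 layers, padding on '<pad>'.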
    encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory,
                                     attention_module_kwargs={'m': args.m})
    decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])
    model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

    dict_dataset_train = train_dataset.image_dictionary({'image': image_field, 'text': RawField()})
    ref_caps_train = list(train_dataset.text)
    cider_train = Cider(PTBTokenizer.tokenize(ref_caps_train))
    dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()})
    dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()})
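
    # A minimal sketch (the helper name `scst_reward` and its caps_gt/caps_gen
    # inputs are illustrative, not part of this listing) of how cider_train is
    # consumed during self-critical training: both generated and reference
    # captions are PTB-tokenized, and the per-sample CIDEr score is the reward.
    def scst_reward(caps_gt, caps_gen):
        caps_gen = PTBTokenizer.tokenize(caps_gen)
        caps_gt = PTBTokenizer.tokenize(caps_gt)
        return cider_train.compute_score(caps_gt, caps_gen)[1]  # per-sample CIDEr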


    def lambda_lr(s):
        # Noam-style warmup for the Transformer, as in the reference train.py;
        # args.warmup is assumed to come from the omitted argparse block above.
        warm_up = args.warmup
        s += 1
        return (model.d_model ** -.5) * min(s ** -.5, s * warm_up ** -1.5)

Example #2

    # Pipeline for image regions
    image_field = ImageDetectionsField(detections_path=args.features_path,
                                       max_detections=50,
                                       load_in_tmp=False)

    # Pipeline for text
    text_field = TextField(init_token='<bos>',
                           eos_token='<eos>',
                           lower=True,
                           tokenize='spacy',
                           remove_punctuation=True,
                           nopoints=False)

    # Create the dataset
    dataset = ScanNet(image_field, text_field,
                      "/cluster/sorona/dchen/ScanNet_frames/",
                      get_image_ids(args.features_path))
    _, _, test_dataset = dataset.splits
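    # Reuse the vocabulary built during COCO training so token indices line up
    # with the pretrained embedding and output layers.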
    text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))

    # Model and dataloaders
    encoder = MemoryAugmentedEncoder(
        3,
        0,
        attention_module=ScaledDotProductAttentionMemory,
        attention_module_kwargs={'m': 40})
    decoder = MeshedDecoder(len(text_field.vocab), 54, 3,
                            text_field.vocab.stoi['<pad>'])
    model = Transformer(text_field.vocab.stoi['<bos>'], encoder,
                        decoder).to(device)

    data = torch.load('meshed_memory_transformer.pth')
    model.load_state_dict(data['state_dict'])
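    # If evaluation runs on a different device than the one the checkpoint was
    # saved from, pass map_location=device to torch.load above.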

Example #3

    # Pipeline for image regions
    image_field = ImageDetectionsField(
        detections_path=args.features_path, max_detections=50, load_in_tmp=False
    )

    # Pipeline for text
    text_field = TextField(
        init_token="<bos>",
        eos_token="<eos>",
        lower=True,
        tokenize="spacy",
        remove_punctuation=True,
        nopoints=False,
    )

    # Create the dataset
    dataset = COCO(
        image_field,
        text_field,
        "coco/images/",
        args.annotation_folder,
        args.annotation_folder,
    )
    _, _, test_dataset = dataset.splits
    text_field.vocab = pickle.load(open("vocab.pkl", "rb"))

    # Model and dataloaders
    encoder = MemoryAugmentedEncoder(
        3,
        0,
        attention_module=ScaledDotProductAttentionMemory,
        attention_module_kwargs={"m": 40},
    )
    decoder = MeshedDecoder(len(text_field.vocab), 54, 3,
                            text_field.vocab.stoi["<pad>"])
    model = Transformer(text_field.vocab.stoi["<bos>"], encoder,
                        decoder).to(device)

    data = torch.load("meshed_memory_transformer.pth")
    model.load_state_dict(data["state_dict"])

Example #4

    # (the preceding, omitted lines build an argparse parser that also defines
    #  --features_path, --annotation_folder, --weights, --batch_size and
    #  --workers, all of which are used below)
    parser.add_argument('--d_in', type=int, default=2048)
    parser.add_argument('--vocab', type=str, default='vocab.pkl')
    args = parser.parse_args()

    print('Meshed-Memory Transformer Evaluation')

    # Pipeline for image regions
    image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=50, load_in_tmp=False)

    # Pipeline for text
    text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True, nopoints=False)

    # Create the dataset
    dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder)
    _, _, test_dataset = dataset.splits
    text_field.vocab = pickle.load(open(args.vocab, 'rb'))

    # Model and dataloaders
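    # --d_in sets the dimensionality of the input region features (2048 matches
    # the usual bottom-up Faster R-CNN features); this variant also reuses the
    # same value as the feed-forward width d_ff.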
    encoder = MemoryAugmentedEncoder(3, 0, args.d_in, d_ff=args.d_in, attention_module=ScaledDotProductAttentionMemory, attention_module_kwargs={'m': 40})
    decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'], d_ff=args.d_in)
    model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

    data = torch.load(args.weights)
    model.load_state_dict(data['state_dict'])

    dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()})
    dict_dataloader_test = DataLoader(dict_dataset_test, batch_size=args.batch_size, num_workers=args.workers)

    scores = predict_captions(model, dict_dataloader_test, text_field)
    print(scores)
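
# For reference, a minimal sketch of what a `predict_captions` helper like the
# one called above typically does: beam-search decoding followed by metric
# computation. The beam size (5), max length (20), the `model.beam_search`
# call, and the `compute_scores` helper are assumptions based on the reference
# implementation, not taken from this listing.
def predict_captions(model, dataloader, text_field):
    import itertools

    model.eval()
    gen, gts = {}, {}
    for it, (images, caps_gt) in enumerate(iter(dataloader)):
        images = images.to(device)
        with torch.no_grad():
            # decode up to 20 tokens with beam size 5, stopping at '<eos>'
            out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'],
                                       5, out_size=1)
        caps_gen = text_field.decode(out, join_words=False)
        for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
            # collapse immediate word repetitions in the generated caption
            gen['%d_%d' % (it, i)] = [' '.join(k for k, _ in itertools.groupby(gen_i))]
            gts['%d_%d' % (it, i)] = gts_i
    gen = PTBTokenizer.tokenize(gen)
    gts = PTBTokenizer.tokenize(gts)
    scores, _ = compute_scores(gts, gen)  # e.g. BLEU, METEOR, ROUGE-L, CIDEr
    return scores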