Example #1
def main(loader, vocab, opt, model=None):
    if model is None:
        vocab_size = len(vocab)
        model = MultimodalAtt(vocab_size, opt['max_len'], opt['dim_hidden'],
                              opt['dim_word'])

        model = nn.DataParallel(model)

        if opt['beam']:
            bw = opt['beam_size']
            print(f'Using beam search with beam width = {bw}')
        model_path = opt['checkpoint_path']
        for i in os.listdir(model_path):
            if i.endswith('.pth'):
                print(i)
                path = os.path.join(model_path, i)
                model.load_state_dict(torch.load(path))
                crit = NLUtils.LanguageModelCriterion()

                eval(model, crit, loader, vocab, opt)
    else:
        '''
        Running from inside train.py
        '''
        crit = NLUtils.LanguageModelCriterion()
        scores = eval(model, crit, loader, vocab, opt)
        return scores
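For reference, a minimal sketch of the opt dictionary this entry point reads. Only the key names are taken from the code above; the values are illustrative assumptions (the sizes echo the hard-coded call in Example #2).

# Illustrative only: key names from Example #1, values assumed.
opt = {
    'max_len': 28,                      # maximum caption length
    'dim_hidden': 1024,                 # RNN hidden size
    'dim_word': 512,                    # word embedding size
    'beam': False,                      # decode with beam search?
    'beam_size': 5,                     # only read when 'beam' is True
    'checkpoint_path': 'save/vanilla',  # directory scanned for *.pth files
}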
Example #2
def main():
    # video_path = input('In the shell you should enter the path to the video: ')
    video_path = '../video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save/vanilla/model_2190.pth'
    wav_path = vToA(video_path)
    audio_mfcc = split_audio(wav_path)
    audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    model = MultimodalAtt(16860, 28, 1024, 512, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
Example #3
def main():
    # video_path = input('In the shell you should enter the path to the video: ')
    #video_path = '/home/cxu-serve/p1/ytian21/dat/msrvtt_2017/train-video/video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save_new/model_225.pth'
    #wav_path = vToA(video_path)
    #wav_path = '/home/cxu-serve/p1/rohan27/research/audiocaps/code2/audios_msrvtt/video9295.wav'
    #audio_mfcc = split_audio(wav_path)
    #audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    #image_feats = extract_image_feats(video_path)
    #image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    feat_dir = './audios_msrvtt/features'
    video_id = 9292
    c4_dir = os.path.join(feat_dir, 'conv4', f'video{video_id}.npy')
    fc2_dir = os.path.join(feat_dir, 'fc2', f'video{video_id}.npy')
    c4_feat = torch.from_numpy(np.load(c4_dir)).type(torch.FloatTensor)
    fc2_feat = torch.from_numpy(np.load(fc2_dir)).type(torch.FloatTensor)
    c4_feat.unsqueeze_(0)
    fc2_feat.unsqueeze_(0)
    model = MultimodalAtt(16860, 28, 512, 300, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    with torch.no_grad():
        _, seq_preds = model(c4_feat, fc2_feat, mode='inference', opt=opt)
    vocab = json.load(open('data/info_new.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
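The four tensor-preparation lines above repeat one pattern; a small hypothetical helper capturing it (the name load_feat is not from the original code):

import numpy as np
import torch

def load_feat(path):
    """Load a saved .npy feature array as a FloatTensor with a batch dimension of 1."""
    return torch.from_numpy(np.load(path)).type(torch.FloatTensor).unsqueeze(0)

# c4_feat = load_feat(c4_dir)
# fc2_feat = load_feat(fc2_dir)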
Example #4
def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # move the batch inputs to the GPU
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    validation_file_name = opt['model_directory'].split('/')[-1]+'_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1]+': '+json.dumps(results[0]) + "\n")
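For clarity, the samples dictionary built by the loop above maps each video id to a single-element list of caption records, which is the shape scorer.score(gts, samples, samples.keys()) consumes. The id and caption below are illustrative only.

samples = {
    'video9295': [{'image_id': 'video9295',
                   'caption': 'a man is playing a guitar'}],
}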
Example #5
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    ip_json = open(opt['input_json'])
    gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    ip_json.close()
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # move the batch inputs to the GPU
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()

        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2,
                                         video_feat,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    return valid_score
Example #6
def main(opt):
    video_path = opt["video_path"]

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)

    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                            input_dropout_p=opt["input_dropout_p"],
                            rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()

    model.load_state_dict(torch.load(opt["saved_model"]))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
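A minimal sketch of the opt dictionary this main() expects. The key names come from the code above; every value is an assumption (dim_hidden, dim_word, max_len and rnn_dropout_p echo the hard-coded values in Example #3, dim_vid is a typical frame-feature size).

# Illustrative only: key names from Example #6, values assumed.
opt = {
    'video_path': '../video9295.mp4',
    'saved_model': 'save/vanilla/model_2190.pth',
    'dim_vid': 2048,           # assumed frame-feature dimension
    'dim_hidden': 512,
    'dim_word': 300,
    'max_len': 28,
    'bidirectional': 0,
    'input_dropout_p': 0.2,    # assumed
    'rnn_dropout_p': 0,
}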
Example #7
def main(opt):
    dataset = VideoAudioDataset(opt, 'val')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'],
                          opt['dim_word'], dim_vid=opt['dim_vid'],
                          n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    crit = NLUtils.LanguageModelCriterion()
    for model_path in tqdm(glob.glob(os.path.join(opt['model_directory'],'*.pth'))):
        model.load_state_dict(torch.load(model_path))
        eval(model, crit, dataset, dataset.get_vocab(), opt, model_path)
Example #8
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # move the batch inputs to the GPU
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()

        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4,
                                         audio_fc2,
                                         sem_feats,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"],
                           'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
    '''
    return valid_score
Example #9
def main(opt):
    dataset = VideoAudioDataset(opt, 'test')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'],
                          opt['max_len'],
                          opt['dim_hidden'],
                          opt['dim_word'],
                          dim_vid=opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt['model_path']))
    crit = NLUtils.LanguageModelCriterion()

    eval(model, crit, dataset, dataset.get_vocab(), opt)
    def main(self, opt):
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        video_path = self.ent1.get().replace("/", "\\")
        image_feats = self.extract_image_feats(video_path)
        image_feats = torch.from_numpy(image_feats).type(
            torch.FloatTensor).unsqueeze(0)

        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(16860,
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=bool(opt["bidirectional"]))
        model = S2VTAttModel(encoder, decoder).cuda()
        model.load_state_dict(torch.load("data/save/model_500.pth"))
        model.eval()
        opt = dict()
        opt['child_sum'] = True
        opt['temporal_attention'] = True
        opt['multimodel_attention'] = True
        with torch.no_grad():
            _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
        vocab = json.load(open('data/info.json'))['ix_to_word']
        self.sent = NLUtils.decode_sequence(vocab, seq_preds)
        hasil = self.translator.translate(self.sent[0], dest='id')
        print(self.sent[0])
        self.hasilPred.configure(text=self.sent[0])
        self.hasiltrans.configure(text=hasil.text)
        # coba = self.sent[0]
        self.textToSpeech(self.sent[0], hasil.text)
        del seq_preds
        torch.cuda.empty_cache()