def main(loader, vocab, opt, model=None):
    if model is None:
        vocab_size = len(vocab)
        model = MultimodalAtt(vocab_size, opt['max_len'], opt['dim_hidden'], opt['dim_word'])
        model = nn.DataParallel(model)
        if opt['beam']:
            bw = opt['beam_size']
            print(f'Using beam search with beam width = {bw}')
        model_path = opt['checkpoint_path']
        # evaluate every checkpoint found in the checkpoint directory
        for i in os.listdir(model_path):
            if i.endswith('.pth'):
                print(i)
                path = os.path.join(model_path, i)
                model.load_state_dict(torch.load(path))
                crit = NLUtils.LanguageModelCriterion()
                eval(model, crit, loader, vocab, opt)
    else:
        ''' Running from inside train.py '''
        crit = NLUtils.LanguageModelCriterion()
        scores = eval(model, crit, loader, vocab, opt)
        return scores
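# A minimal sketch of how this entry point might be driven from a script. All
# values below are placeholders, and the dataset/loader wiring is assumed from
# the other snippets in this repo (eval() will additionally need keys such as
# 'input_json' and 'batch_size'):
if __name__ == '__main__':
    opt = {
        'max_len': 28, 'dim_hidden': 512, 'dim_word': 300,
        'beam': False, 'beam_size': 5,
        'checkpoint_path': 'save_new/',  # directory scanned for *.pth checkpoints
    }
    dataset = VideoAudioDataset(opt, 'val')
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    main(loader, dataset.get_vocab(), opt)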
def main():
    # video_path = input('enter the path to the video: ')
    video_path = '../video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save/vanilla/model_2190.pth'
    wav_path = vToA(video_path)
    audio_mfcc = split_audio(wav_path)
    audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    model = MultimodalAtt(16860, 28, 1024, 512, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
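# vToA and split_audio are imported from a module not shown in this snippet. A
# rough sketch of what they plausibly do, assuming ffmpeg is on PATH and
# librosa is installed; the sample rate, channel count, and MFCC size here are
# illustrative, not the repo's exact preprocessing:
import subprocess
import librosa
import numpy as np

def vToA_sketch(video_path, wav_path='tmp.wav'):
    # strip the audio track to a mono 16 kHz wav with ffmpeg
    subprocess.call(['ffmpeg', '-y', '-i', video_path, '-vn',
                     '-ac', '1', '-ar', '16000', wav_path])
    return wav_path

def split_audio_sketch(wav_path, n_mfcc=40):
    # load the waveform and compute an (n_frames, n_mfcc) MFCC matrix
    y, sr = librosa.load(wav_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfcc.T.astype(np.float32)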
def main():
    # video_path = input('enter the path to the video: ')
    # video_path = '/home/cxu-serve/p1/ytian21/dat/msrvtt_2017/train-video/video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save_new/model_225.pth'
    # wav_path = vToA(video_path)
    # wav_path = '/home/cxu-serve/p1/rohan27/research/audiocaps/code2/audios_msrvtt/video9295.wav'
    # audio_mfcc = split_audio(wav_path)
    # audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # image_feats = extract_image_feats(video_path)
    # image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    # load precomputed audio features instead of extracting them on the fly
    feat_dir = './audios_msrvtt/features'
    video_id = 9292
    c4_dir = os.path.join(feat_dir, 'conv4', f'video{video_id}.npy')
    fc2_dir = os.path.join(feat_dir, 'fc2', f'video{video_id}.npy')
    c4_feat = torch.from_numpy(np.load(c4_dir)).type(torch.FloatTensor)
    fc2_feat = torch.from_numpy(np.load(fc2_dir)).type(torch.FloatTensor)
    c4_feat.unsqueeze_(0)
    fc2_feat.unsqueeze_(0)
    model = MultimodalAtt(16860, 28, 512, 300, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    with torch.no_grad():
        _, seq_preds = model(c4_feat, fc2_feat, mode='inference', opt=opt)
    vocab = json.load(open('data/info_new.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
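# The conv4/fc2 .npy files read above are precomputed audio features (the
# layer names suggest an audio CNN such as SoundNet, though the extractor is
# not shown in this snippet). A sketch of a hypothetical helper that writes
# features in the directory layout main() reads from:
import os
import numpy as np

def save_audio_feats(video_id, c4_feat, fc2_feat, feat_dir='./audios_msrvtt/features'):
    # mirror the feat_dir/{conv4,fc2}/video{id}.npy structure used above
    for name, feat in (('conv4', c4_feat), ('fc2', fc2_feat)):
        out_dir = os.path.join(feat_dir, name)
        os.makedirs(out_dir, exist_ok=True)
        np.save(os.path.join(out_dir, f'video{video_id}.npy'), feat)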
def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # move the batch features to the GPU
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all collected captions against the ground truth
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    validation_file_name = opt['model_directory'].split('/')[-1] + '_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1] + ': ' + json.dumps(results[0]) + "\n")
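# suppress_stdout_stderr is imported from a utility module not shown here. One
# common implementation of such a helper silences the C-level stdout/stderr
# chatter the COCO scorers produce by temporarily swapping the underlying file
# descriptors; this is a sketch, not necessarily the repo's version:
import os

class suppress_stdout_stderr(object):
    def __init__(self):
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        self.save_fds = [os.dup(1), os.dup(2)]

    def __enter__(self):
        # point fds 1 and 2 at /dev/null for the duration of the block
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)

    def __exit__(self, *args):
        # restore the original descriptors and release everything
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        for fd in self.null_fds + self.save_fds:
            os.close(fd)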
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    ip_json = open(opt['input_json'])
    gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    ip_json.close()
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # move the batch features to the GPU
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()
        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2, video_feat, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all collected captions against the ground truth
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    return valid_score
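# convert_data_to_coco_scorer_format is also imported from elsewhere. A
# plausible implementation, grouping the flattened 'sentences' annotations by
# video id into the {image_id, cap_id, caption} records COCOScorer expects;
# the 'caption' and 'video_id' column names are assumed from an MSR-VTT-style
# input_json:
def convert_data_to_coco_scorer_format(data_frame):
    gts = {}
    for caption, video_id in zip(data_frame['caption'], data_frame['video_id']):
        entry = {'image_id': video_id,
                 'cap_id': len(gts.get(video_id, [])),
                 'caption': caption}
        gts.setdefault(video_id, []).append(entry)
    return gts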
def main(opt):
    video_path = opt["video_path"]
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()
    model.load_state_dict(torch.load(opt["saved_model"]))
    model.eval()
    # rebind opt to the inference-time flags the model's forward pass expects
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
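# extract_image_feats is not defined in this snippet. A minimal sketch of what
# it plausibly does, assuming OpenCV for frame decoding and a torchvision CNN
# as the frame encoder; the frame count and backbone are illustrative, and the
# output width must match opt["dim_vid"]:
import cv2
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms

def extract_image_feats_sketch(video_path, n_frames=40):
    # sample n_frames frames uniformly across the video
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    for idx in np.linspace(0, max(total - 1, 0), n_frames).astype(int):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if ok:
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    preprocess = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    cnn = models.resnet152(pretrained=True)
    cnn.fc = torch.nn.Identity()   # keep the 2048-d pooled features
    cnn.eval().cuda()
    batch = torch.stack([preprocess(f) for f in frames]).cuda()
    with torch.no_grad():
        feats = cnn(batch)
    return feats.cpu().numpy()     # shape (n_frames, 2048)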
def main(opt):
    dataset = VideoAudioDataset(opt, 'val')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'], opt['dim_word'],
                          dim_vid=opt['dim_vid'], n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    crit = NLUtils.LanguageModelCriterion()
    for model_path in tqdm(glob.glob(os.path.join(opt['model_directory'], '*.pth'))):
        model.load_state_dict(torch.load(model_path))
        eval(model, crit, dataset, dataset.get_vocab(), opt, model_path)
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # move the batch features to the GPU
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()
        # forward the model to get generated captions for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4, audio_fc2, sem_feats, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all collected captions against the ground truth
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"], 'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
    '''
    return valid_score
def main(opt):
    dataset = VideoAudioDataset(opt, 'test')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'], opt['dim_word'],
                          dim_vid=opt['dim_vid'], n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt['model_path']))
    crit = NLUtils.LanguageModelCriterion()
    eval(model, crit, dataset, dataset.get_vocab(), opt)
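# A sketch of how this test entry point might be invoked; the config file name
# and checkpoint path are placeholders, not the repo's actual setup:
if __name__ == '__main__':
    opt = json.load(open('save_new/opt_info.json'))  # assumed training-time config dump
    opt['model_path'] = 'save_new/model_225.pth'     # checkpoint to score
    main(opt)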
def main(self, opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    video_path = self.ent1.get().replace("/", "\\")
    image_feats = self.extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()
    model.load_state_dict(torch.load("data/save/model_500.pth"))
    model.eval()
    # rebind opt to the inference-time flags the model's forward pass expects
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    self.sent = NLUtils.decode_sequence(vocab, seq_preds)
    # translate the English caption to Indonesian and show both in the GUI
    hasil = self.translator.translate(self.sent[0], dest='id')
    print(self.sent[0])
    self.hasilPred.configure(text=self.sent[0])
    self.hasiltrans.configure(text=hasil.text)
    # coba = self.sent[0]
    self.textToSpeech(self.sent[0], hasil.text)
    del seq_preds
    torch.cuda.empty_cache()
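# textToSpeech is a method on this GUI class and is not shown here. A sketch
# of what it might look like with gTTS; the function name, output file names,
# and the choice of library are assumptions, not the repo's implementation:
from gtts import gTTS

def textToSpeech_sketch(english_text, indonesian_text):
    # synthesize both captions to mp3 files for playback from the GUI
    gTTS(english_text, lang='en').save('caption_en.mp3')
    gTTS(indonesian_text, lang='id').save('caption_id.mp3')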