# Shared imports assumed by the standalone scripts collected below; project
# modules (VideoDataset, VideoActDataset, the model classes, `utils`,
# `language_eval`, etc.) come from each script's own repository.
import os
import os.path as osp
import time

import matplotlib.pyplot as plt
import numpy as np
import PIL
import skvideo.io
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader


def main(opt):
    # Build the test split first so its vocab/seq-length stats are available.
    opt_test = dict(opt)  # shallow copy so test-only keys do not leak into the train opts
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(
            opt["vocab_size"],
            opt["max_len"],
            opt["dim_hidden"],
            opt["dim_word"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
            # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt,
          rl_crit, opt_test, test_dataset)

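# A minimal sketch of driving the training entry point above. Every value in
# this `opt` dict is a hypothetical placeholder (the real scripts build it from
# an options parser); it is shown only to make explicit which keys `main` reads.
if __name__ == '__main__':
    opt = {
        'model': 'S2VTAttModel',
        'batch_size': 128,
        'max_len': 28,
        'dim_vid': 2048,
        'dim_hidden': 512,
        'dim_word': 512,
        'rnn_type': 'lstm',
        'num_layers': 1,
        'input_dropout_p': 0.2,
        'rnn_dropout_p': 0.5,
        'learning_rate': 4e-4,
        'weight_decay': 5e-4,
        'learning_rate_decay_every': 200,
        'learning_rate_decay_rate': 0.8,
    }
    main(opt)
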
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)

    # Setup the model: restore the trained weights.
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    get_caption(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
    #                   opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'],
    #                   n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    # model = model.cuda()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)

def main(opt):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    if opt["model"] == "S2VTModel":
        dataset = VideoDataset(opt, "test")
    elif opt["model"] == "S2VTACTModel":
        dataset = VideoActDataset(opt, "test")
    else:
        print('Currently not supported: {}'.format(opt["model"]))
        raise ValueError
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt["model"] == "S2VTModel":
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTACTModel":
        model = S2VTACTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTAttModel":
        print('Currently not supported: {}'.format(opt["model"]))
        raise ValueError
    # model = nn.DataParallel(model)

    # Setup the model; map_location keeps CPU-only runs working when the
    # checkpoint was saved on GPU.
    model.load_state_dict(torch.load(opt["saved_model"], map_location=device))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), device, opt)

def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1, 'beam search requires batch_size == 1'

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)
            for sent in sents:
                print(sent)

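# `create_batches` is not defined in this snippet. A minimal sketch of what it
# plausibly does, assuming it chunks the raw frame array, converts each frame
# to a PIL image via `load_img_fn`, applies the pretrainedmodels transform
# `tf_img_fn`, and stacks the results; the batch size here is hypothetical.
def create_batches(frames, load_img_fn, tf_img_fn, batch_size=32):
    batches = []
    for start in range(0, len(frames), batch_size):
        chunk = frames[start:start + batch_size]
        # PIL conversion followed by the model-specific tensor transform.
        imgs = [tf_img_fn(load_img_fn(frame)) for frame in chunk]
        batches.append(torch.stack(imgs).cuda())
    return batches
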
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length

    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)

    # Setup the model: restore the trained weights and switch to eval mode.
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)

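# Several of these test scripts wrap the model in nn.DataParallel before calling
# load_state_dict, which only works when the checkpoint was saved from a
# DataParallel-wrapped model too (its keys then carry a "module." prefix).
# A hedged helper for the mismatched case; the function name is ours, not from
# the original code.
def load_flexible_state_dict(model, path):
    state = torch.load(path, map_location='cpu')
    wrapped = isinstance(model, nn.DataParallel)
    has_prefix = next(iter(state)).startswith('module.')
    if wrapped and not has_prefix:
        state = {'module.' + k: v for k, v in state.items()}
    elif not wrapped and has_prefix:
        state = {k[len('module.'):]: v for k, v in state.items()}
    model.load_state_dict(state)
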
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True,
                            num_workers=0, pin_memory=True)

    global dataset_val
    global dataloader_val
    dataset_val = VideoDataset(opt, 'val')
    dataloader_val = DataLoader(dataset_val, batch_size=opt["batch_size"],
                                shuffle=True, num_workers=0, pin_memory=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder)
    model = model.cuda()
    model = nn.DataParallel(model)
    # Warm-start from a previous checkpoint.
    model.load_state_dict(torch.load('data/save_vatex_batch_noc3d/model_500.pth'))

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()  # required by train() below
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    print("Data Loaded")
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)

def main(opt):
    # DataLoader
    if opt["model"] == 'S2VTModel':
        dataset = VideoDataset(opt, 'train')
    elif opt["model"] == 'S2VTACTModel':
        dataset = VideoActDataset(opt, 'train')
    else:
        print('Currently not supported: {}'.format(opt["model"]))
        raise ValueError
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        print(opt)
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == 'S2VTACTModel':
        print(opt)
        model = S2VTACTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"], opt['dim_vid'],
                             rnn_cell=opt['rnn_type'],
                             n_layers=opt['num_layers'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        print('Currently not supported.')
        raise ValueError

    # Load model
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)

    # Criterions
    LMCriterion = utils.LanguageModelCriterion()  # passed to train() below
    # rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, optimizer, exp_lr_scheduler, opt, device, LMCriterion)

def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))

    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"],
                            num_workers=8, shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    # Resume from a previously saved checkpoint before continuing training.
    model.load_state_dict(torch.load(
        "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)

def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Setup the model: restore the trained weights.
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.get_vocab_size()
    opt.seq_length = train_dataset.seq_length

    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset, batch_size=120, shuffle=True)

    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p)
        model = S2VTAttModel(encoder, decoder).cuda()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)

    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)

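# utils.LanguageModelCriterion is used throughout these scripts but not shown.
# A minimal sketch of the masked cross-entropy such a criterion typically
# computes, assuming (batch, seq_len, vocab) log-probabilities, integer
# targets, and a 0/1 mask zeroing out padding positions; this is our
# reconstruction, not the repo's exact implementation.
class MaskedLanguageModelCriterion(nn.Module):
    def forward(self, logprobs, target, mask):
        # Align targets/mask with the predicted steps.
        target = target[:, :logprobs.size(1)]
        mask = mask[:, :logprobs.size(1)]
        # Negative log-likelihood of each gold token, masked and averaged
        # over the non-padding positions.
        nll = -logprobs.gather(2, target.unsqueeze(2)).squeeze(2) * mask
        return torch.sum(nll) / torch.sum(mask)
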
def train(opt, EncoderRNN, DecoderCNN, Convcap, itow):
    '''
    Training: initialize the models and pass each batch through the Convcap
    model. Output: checkpoint/model.pth (the trained model).
    '''
    t_start = time.time()
    train_data = VideoDataset(opt, 'train')
    train_loader = DataLoader(train_data, batch_size=opt["batch_size"],
                              num_workers=opt['num_workers'], shuffle=True)
    print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))

    # Initialize encoder, decoder and the combined convcap model.
    encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                    bidirectional=opt['bidirectional'],
                                    rnn_cell=opt['rnn_type']).cuda()
    decoder = DecoderCNN.DecoderCNN(train_data.get_vocab_size()).cuda()
    convcap = Convcap.Convcap(encoder, decoder).cuda()

    # Optimizer, schedule and bookkeeping.
    optimizer = optim.RMSprop(convcap.parameters(), lr=opt["learning_rate"])
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=opt["learning_rate_decay_every"],
                                    gamma=opt["learning_rate_decay_rate"])
    batchsize = opt['batch_size']
    nbatches = np.int_(np.floor((len(train_data) * 1.) / batchsize))
    max_tokens = opt['max_len']

    itr = 0
    loss_graph = []
    graph_x = []
    for epoch in range(opt['epochs']):
        loss_train = 0.
        for data in train_loader:
            print("iteration " + str(itr))
            itr += 1
            vid_feat = Variable(data['c3d_feats']).cuda()
            labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
            mask = Variable(data['masks']).cpu()
            word_embed = Variable(data['word_embed']).cuda()
            cap = data['cap']

            optimizer.zero_grad()
            wordact = convcap(vid_feat, labels, word_embed, 'train')

            # Drop the prediction at the final position and shift the
            # targets/mask by one, so position t predicts token t+1.
            wordact = wordact[:, :, :-1]
            labels = labels[:, 1:]
            mask = mask[:, 1:].contiguous()

            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            wordclass_t = labels.contiguous().view(
                batchsize * (max_tokens - 1), 1)

            # Cross-entropy only over the unmasked (non-padding) positions.
            maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)
            loss = F.cross_entropy(
                wordact_t[maskids, ...],
                wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))

            # Visual sanity check: print the argmax word at every position,
            # next to the ground-truth caption.
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            wordids = np.argmax(wordprobs, axis=1)
            for i in wordids:
                print(itow[i])
            print(cap)

            if itr % 500 == 0:
                graph_x.append(itr)
                loss_graph.append(loss.item())

            loss_train = loss_train + loss.item()
            loss.backward()
            optimizer.step()
        scheduler.step()  # step once per epoch, after the optimizer updates
        print("loss " + str(loss_train))
        loss_train = (loss_train * 1.) / nbatches
        print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))

        modelfn = osp.join(opt['checkpoint_path'],
                           'model_j_19_' + str(itr) + '.pth')
        torch.save({
            'epoch': epoch,
            'state_dict': convcap.state_dict(),
            'optimizer': optimizer.state_dict(),
            'loss': loss_train,
        }, modelfn)
        print('time for epoch %f' % (time.time() - t_start))

    plt.plot(graph_x, loss_graph, 'ro')
    plt.show()

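# A short sketch of resuming from one of the checkpoints saved above, reusing
# the exact keys written by torch.save; the path is a placeholder, and
# `convcap`/`optimizer` are assumed to be constructed as in train().
checkpoint = torch.load('checkpoint/model_j_19_500.pth')
convcap.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch'] + 1
print('resumed at epoch %d with loss %f' % (start_epoch, checkpoint['loss']))
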
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # # Audio encoder
        # encoder_voice = EncoderRNN(
        #     opt["dim_voice"],
        #     opt["dim_hidden"],
        #     bidirectional=opt["bidirectional"],
        #     input_dropout_p=opt["input_dropout_p"],
        #     rnn_cell=opt['rnn_type'],
        #     rnn_dropout_p=opt["rnn_dropout_p"])

        # Sign-language (hand) encoder
        if opt['with_hand'] == 1:
            encoder_hand = EncoderRNN(opt["dim_hand"], opt["dim_hand_hidden"],
                                      bidirectional=opt["bidirectional"],
                                      input_dropout_p=opt["input_dropout_p"],
                                      rnn_cell=opt['rnn_type'],
                                      rnn_dropout_p=opt["rnn_dropout_p"])
            # The decoder consumes the concatenated video + hand hidden states.
            decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                                 opt["dim_hidden"] + opt["dim_hand_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, encoder_hand, decoder)
        else:
            decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                                 opt["dim_hidden"], opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, None, decoder)
        # model = S2VTAttModel(encoder, encoder_voice, encoder_hand, decoder)
    model = model.cuda()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)

def test(opt, EncoderRNN, DecoderCNN, Convcap, itow, wtoi, modelfn=None):
    '''
    Input: options, model classes, checkpoint/model.pth.
    Output: language-evaluation scores.
    '''
    t_start = time.time()
    test_data = VideoDataset(opt, 'test')
    test_loader = DataLoader(test_data, batch_size=opt["batch_size"],
                             num_workers=30, shuffle=False)
    print('[DEBUG] Loading test data ... %f secs' % (time.time() - t_start))

    batchsize = opt['batch_size']
    max_tokens = opt['max_len']

    if modelfn is not None:
        encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                        bidirectional=opt['bidirectional'],
                                        rnn_cell=opt['rnn_type']).cuda()
        decoder = DecoderCNN.DecoderCNN(test_data.get_vocab_size()).cuda()
        convcap = Convcap.Convcap(encoder, decoder).cuda()
        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        convcap.load_state_dict(checkpoint['state_dict'])
    convcap.train(False)

    pred_captions = []
    itr = 0
    for data in test_loader:
        print("iteration " + str(itr))
        print("\ngt\n")
        for i in data['labels'].data[0]:
            print(itow[int(i)])
        itr += 1

        vid_feat = Variable(data['c3d_feats']).cuda()
        labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
        mask = Variable(data['masks']).cpu()
        word_embed = Variable(data['word_embed']).cuda()
        vid_id = data['video_ids']

        # Greedy decoding: feed the argmax of step j back in as input j+1.
        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:, 0] = wtoi['<sos>']  # index of <sos>

        outcaps = np.empty((batchsize, 0)).tolist()
        x_outcaps = np.empty((batchsize, 0)).tolist()
        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()
            wordact = convcap(vid_feat, wordclass, word_embed, 'test')
            # Teacher-forced pass over the ground-truth labels, for comparison.
            x = convcap(vid_feat, labels, word_embed, 'test')
            x = x[:, :, :-1]
            x_t = x.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            x_prob = F.softmax(x_t, dim=1).cpu().data.numpy()

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            x_id = np.argmax(x_prob, axis=1)
            wordids = np.argmax(wordprobs, axis=1)

            for k in range(batchsize):
                word = itow[wordids[j + k * (max_tokens - 1)]]
                x_word = itow[x_id[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                x_outcaps[k].append(x_word)
                if j < max_tokens - 1:
                    wordclass_feed[k, j + 1] = wordids[j + k * (max_tokens - 1)]

        for j in range(batchsize):
            num_words = len(outcaps[j])
            x_n_words = len(x_outcaps[j])
            if 'eos' in x_outcaps[j]:
                x_n_words = x_outcaps[j].index('eos')
            x_outcap = ' '.join(x_outcaps[j][:x_n_words])
            if 'eos' in outcaps[j]:
                num_words = outcaps[j].index('eos')
            outcap = ' '.join(outcaps[j][:num_words])
            pred_captions.append({'vid_id': vid_id[0][5:], 'caption': outcap})
            print("-" * 78)
            print("videoID \t" + str(vid_id))
            print("caption \n")
            print(x_outcap)
            print("-" * 78)

    scores = language_eval(pred_captions,
                           '/home/sanjay/Documents/Video_convcap/output', 'test')
    return scores

# Tail of a caption-generation script; the code that opens `fp` (the list of
# video ids) and fills `results` precedes this fragment in the original file
# and is not shown here.
for line in fp:
    video_id.append(line.strip())
fp.close()

f = open(output_path, 'w')
for i in range(len(results)):
    f.write(video_id[i] + ',' + results[i] + '\n')
f.close()

dim_vid = 4096
dim_hidden = 512
dim_word = 512
dataset = VideoDataset('generate', folder_path)
vocab_size = dataset.get_vocab_size()
seq_length = 25

encoder = Encoder(dim_vid, dim_hidden)
decoder = Decoder(vocab_size, seq_length, dim_hidden, dim_word, rnn_dropout_p=0.2)
model = Model(encoder, decoder).cuda()
model = nn.DataParallel(model)
model.load_state_dict(torch.load('./good_model.pth'))
model.eval()
test(model, dataset, dataset.get_vocab())