def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]
    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    carlini = CarliniAttack(oracle=full_decoder, video_path=video_path, target=target_caption,
                            dataset=dataset)
    carlini.execute(video_path)
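# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): the main() variants in this
# listing assume module-level imports and constants roughly like the ones below.
# The import paths of the project-local modules and the constant values are
# assumptions, not taken from the source.
import math
import os
import pickle

import numpy as np
import PIL
import matplotlib.pyplot as plt
import skvideo.io
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import pretrainedmodels.utils as ptm_utils

# Project-local modules (import paths are guesses):
# from dataloader import VideoDataset
# from models import S2VTModel, S2VTAttModel, EncoderRNN, DecoderRNN, ConvS2VT
# import misc.utils as utils

BATCH_SIZE = 3          # attack window size used below (assumed value)
ATTACK_BATCH_SIZE = 3   # number of high-attention frames to attack (assumed value)
MAX_TARGET_LEN = 10     # cap on target caption length (assumed value)
# ---------------------------------------------------------------------------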
# Windowed Carlini-Wagner attack on a local YouTubeClips video, with a layer-freezing
# experiment and codec round-trip checks on the saved adversarial video.
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    # config: batch_size, c, learning rate, num iterations, input shape
    config = {
        # lr 0.005 and dimensions 224, c was 100.
        # Best was 0.06 lr, c = 1 for Show and Fool.
        # "batch_size": BATCH_SIZE,
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # "attack_algorithm": "showandfool"
        "attack_algorithm": "carliniwagner"
    }

    convnet = 'vgg16'
    # convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    full_decoder = ConvS2VT(convnet, model, opt)

    '''
    Layer freezing experiment.
    Top 10 contributing layers:
        conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
        conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
        conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
        conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
        conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
        conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
        conv.cell_13.comb_iter_4_left.bn_sep_1.weight
        conv.reduction_cell_0.conv_prev_1x1.bn.weight
        conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
        conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    '''
    top = open("top_layers.txt", "r")
    top_layers = top.readlines()
    top.close()
    print(top_layers)

    # Set the gradients on the layers you don't want to contribute to 0.
    top_layers = []  # overrides the list read from top_layers.txt, so every layer below is frozen
    for name, parameters in full_decoder.named_parameters():
        reset = True
        for f in top_layers:
            if name in f:
                reset = False
        if reset:
            parameters.requires_grad = False
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # for name, parameters in full_decoder.named_parameters():
    #     for f in top_layers:
    #         if name not in f:
    #             print(name)
    #             parameters.require_grad = False
    #             if parameters.grad is not None:
    #                 # parameters.data = 0
    #                 parameters.grad.data.zero_()
    #         else:
    #             # print(name)
    #             continue

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    # target_caption = np.random.choice(viable_target_captions)

    # 5 captions:
    '''
    <sos> A person is typing into a laptop computer <eos>
    <sos> A boy is kicking a soccer ball into the goal <eos>
    <sos> Someone is frying fish <eos>
    <sos> A dog is running with a ball <eos>
    <sos> The cat approaches on grass <eos>
    '''
    captions = {
        1: '<sos> A woman is talking <eos>',
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>'
    }

    # 1 doesn't work
    videos = {
        # 2 was too high-res or something; replaced X6uJyuD_Zso_3_17.avi with nc8hwLaOyZU_1_19.avi
        # 5, 'ceOXCFUmxzA_100_110.avi' ran out of memory; replaced with 'X7sQq-Iu1gQ_12_22'
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }

    # "D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\AJJ-iQkbRNE_97_109.avi"
    # video_path = ''
    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos[2]

    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # Just switch the number to get a target caption.
    target_caption = captions[1]

    # Should use the original caption function we use in the attack because the scaling is slightly different.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    # /96 gives 3 frames
    # length = math.ceil(len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) / 96)  # 12 frames
    length = 3

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    # model is full_decoder
    optimizer = ['Adam', (0.9, 0.999)]
    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(frame_counter, length))

            attack_package = S2VT_Attack(model=full_decoder, video_path=video_path, target=target_caption,
                                         dataset=dataset, config=config, optimizer=optimizer, crit=crit,
                                         seq_decoder=seq_decoder, window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(frame_counter, length))

            attack_package = S2VT_Attack(model=full_decoder, video_path=video_path, target=target_caption,
                                         dataset=dataset, config=config, optimizer=optimizer, crit=crit,
                                         seq_decoder=seq_decoder, window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))

    # adv_frames_1 = np.concatenate(adv_frames, axis=0)
    # # batches = create_batches(adv_frames[0].astype(np.uint8), load_img_fn, tf_img_fn)
    # batches = exp_create_batches(adv_frames_1.astype(np.uint8), 3)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)
    # print("Adversarial Frames 1: {}".format(sents[0]))

    adv_frames = np.concatenate(adv_frames, axis=0)

    # batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)
    # print("Adversarial Frames 2: {}".format(sents[0]))

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(outputfile, outputdict={
        # huffyuv is lossless; r10k is really good.
        # '-c:v': 'libx264',  # use the h.264 codec
        '-c:v': 'huffyuv',  # r210 huffyuv r10k
        # '-pix_fmt': 'rgb32',
        # '-crf': '0',  # set the constant rate factor to 0, which is lossless
        # '-preset': 'ultrafast'  # ultrafast/veryslow; the slower the preset, the better the compression, in principle
    })
    for f in adv_frames:
        writer.writeFrame(f)
    writer.close()

    # np_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW')
    # np.save(np_path, adv_frames)

    # Codec comparison (values recorded from earlier runs):
    # ffv1      0.215807946043995
    # huffyuv   0.21578424050191813
    # libx264   0.2341074901578537
    # r210     -0.7831487262059795, -0.7833399258537526
    # gif       0.6889478809555243
    # png       0.2158991440582696, 0.21616862708842177
    # qtrle     0.21581286337807626
    # flashsv   0.21610510459932186, 0.21600030673323545
    # ffvhuff   0.21620682250167533
    # r10k      similar to r210
    # rawvideo  0.21595001

    with torch.no_grad():
        # Get a new model to see how it actually works now.
        # full_decoder = ConvS2VT(convnet, model, opt)
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)
        frames = np.float32(frames)

        plt.imshow(frames[0] / 255.)
        plt.show()

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # loadtxt to load np array from txt
        exp = np.load('difference_tmp.npy')

        # numpy_frames = np.load(np_path + '.npy')
        # print("Are numpy frames == adv frames: ", np.array_equal(numpy_frames, adv_frames))
        # print("Is the saved array equal to loaded array for difference: ", np.array_equal(exp, difference))

        frames = frames + difference

        # batches = exp_create_batches(numpy_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption = sents[0]
        #
        # print("Numpy Frames exp: {}".format(numpy_caption))
        #
        # numpy_frames_tensor = torch.tensor(numpy_frames)
        # numpy_frames_tensor = numpy_frames_tensor.float()
        # batches = exp_create_batches(numpy_frames_tensor, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption_tensor = sents[0]
        #
        # print("Numpy Frames tensor: {}".format(numpy_caption_tensor))

        # numpy_frames = numpy_frames.astype(np.uint8)
        # batches = create_batches(numpy_frames, load_img_fn, tf_img_fn)
        #
        # # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        #
        # print("Numpy Frames originalscale: {}".format(sents[0]))

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)
        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print("\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".format(
        original_caption, target_caption, adv_caption))
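# ---------------------------------------------------------------------------
# Hedged sketch (assumption): create_batches(), used throughout this listing, is
# taken to convert raw uint8 frames to PIL images, run the pretrainedmodels
# TransformImage preprocessing, and stack the results into one float batch.
# The name create_batches_sketch and the .cuda() handling are illustrative only.
import torch

def create_batches_sketch(frames_to_do, load_img_fn, tf_img_fn):
    """frames_to_do: uint8 ndarray of shape (N, H, W, 3)."""
    tensors = []
    for frame in frames_to_do:
        img = load_img_fn(frame)        # ndarray -> PIL.Image
        tensors.append(tf_img_fn(img))  # PIL.Image -> normalized CHW tensor
    batch = torch.stack(tensors)        # (N, 3, H', W')
    return batch.cuda() if torch.cuda.is_available() else batch
# ---------------------------------------------------------------------------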
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    vocab = dataset.get_vocab()

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    video_path = opt['videos'][0]

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    # orig_captions = [' '.join(toks) for toks in dataset.vid_to_meta[vid_id]['final_captions']]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    target_caption = np.random.choice(viable_target_captions)

    interval = BATCH_SIZE
    num_seconds = 0.5
    numIt = 4  # int(24 * num_seconds)
    real_len = len(skvideo.io.vread(video_path))
    assert numIt <= real_len
    print("\t\t{} iterations to do.".format(numIt))

    counter = 0
    totalframes = []
    adv_batches = []
    while numIt > (interval - 1):
        window = range(counter, counter + interval)
        counter += interval

        carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                target=target_caption, dataset=dataset, window=window)
        frames = carlini.execute(video_path, window=window, functional=True)
        totalframes.append(frames.detach().cpu().numpy())
        adv_batches.append(create_batches(frames, batch_size=interval).detach().cpu().numpy())

        numIt -= interval
        print("\t\tWindow {}".format(numIt))

    if numIt > 0:
        window = range(counter, counter + numIt)
        carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                target=target_caption, dataset=dataset, window=window)
        frames = carlini.execute(video_path, window=window, functional=True)
        totalframes.append(frames.detach().cpu().numpy())
        adv_batches.append(create_batches(frames, batch_size=interval).detach().cpu().numpy())
        print("\t\tWindow {}".format(numIt))

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarial.avi')

    frames = np.concatenate(totalframes, axis=0)
    save_frames_to_video(frames, adv_path)

    batches = np.concatenate(adv_batches, axis=0)

    with torch.no_grad():
        print(frames.shape)
        # bp ---
        seq_prob, seq_preds = full_decoder(batches, mode='inference', single_batch=False)
        sents = vcp_utils.decode_sequence(vocab, seq_preds)
        print(sents[0])
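# ---------------------------------------------------------------------------
# Hedged sketch (assumption): save_frames_to_video(), called above, is assumed to
# wrap the same skvideo FFmpegWriter pattern used by the other variants in this
# listing, with a lossless codec so the adversarial perturbation survives encoding.
import skvideo.io

def save_frames_to_video_sketch(frames, path):
    writer = skvideo.io.FFmpegWriter(path, outputdict={'-c:v': 'huffyuv'})  # huffyuv is lossless
    for f in frames:
        writer.writeFrame(f)
    writer.close()
# ---------------------------------------------------------------------------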
# Batch attack: run the Carlini attack over every video in opt['from_dir'] and save
# the adversarial output plus per-video stats under a time-stamped directory.
def main(opt):
    dataset = VideoDataset(opt, 'inference')

    time_stamp = get_time_stamp()
    if not os.path.isdir(os.path.join(opt['adv_dir'], time_stamp)):
        os.makedirs(os.path.join(opt['adv_dir'], time_stamp))

    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    video_names = os.listdir(opt['from_dir'])
    viable_ids = dataset.splits['test'] + dataset.splits['val']

    for vn in video_names:
        video_path = os.path.join(opt['from_dir'], vn)

        vid_id = video_path.split('\\')[-1]
        vid_id = vid_id.split('.')[0]

        orig_captions = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[vid_id]['final_captions']
        ]
        original_caption = np.random.choice(orig_captions)

        viable_target_captions = []
        for v_id in viable_ids:
            if v_id == vid_id:
                continue
            plausible_caps = [
                ' '.join(toks)
                for toks in dataset.vid_to_meta[v_id]['final_captions']
                if len(toks) <= MAX_TARGET_LEN
            ]
            viable_target_captions.extend(plausible_caps)

        target_caption = np.random.choice(viable_target_captions)

        carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                target=target_caption, dataset=dataset)
        stats_obj = carlini.execute(video_path, functional=True, stats=True)
        stats_obj['original_caption'] = original_caption
        stats_obj['target_caption'] = target_caption

        base_name = ''.join(vn.split('.')[:-1])
        adv_path = os.path.join(opt['adv_dir'], time_stamp, base_name + '_adversarial.avi')
        adv_raw_path = os.path.join(opt['adv_dir'], time_stamp, base_name + '_adversarial.pkl')

        save_tensor_to_video(stats_obj['pass_in'], adv_path)
        # Note: this second call writes the delta to the same path and overwrites the video above.
        save_tensor_to_video(stats_obj['delta'], adv_path)

        pickle_write(adv_raw_path, stats_obj)
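# ---------------------------------------------------------------------------
# Hedged sketches (assumptions): get_time_stamp() and pickle_write(), used by the
# batch-attack variant above, are assumed to be small helpers along these lines;
# the timestamp format in particular is a guess.
import datetime
import pickle

def get_time_stamp_sketch():
    return datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

def pickle_write_sketch(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
# ---------------------------------------------------------------------------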
# Gradient comparison: backprop the target-caption loss through clean and adversarial
# frames and record the relative change of each layer's weight gradients.
def main(opt):
    def loss(seq_prob, crit):
        loss = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        return loss

    def produce_t_mask():
        mask = torch.zeros(dataset.max_len)
        captions = [target_caption.split(' ')]
        gts = torch.zeros(len(captions), dataset.max_len).long()
        for i, cap in enumerate(captions):
            if len(cap) > dataset.max_len:
                cap = cap[:dataset.max_len]
                cap[-1] = '<eos>'
            for j, w in enumerate(cap):
                gts[i, j] = dataset.word_to_ix[w]

        label = gts[0]
        non_zero = (label == 0).nonzero()
        mask[:int(non_zero[0]) + 1] = 1

        return label.unsqueeze(0), mask.unsqueeze(0)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    # config: batch_size, c, learning rate, num iterations, input shape
    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    # Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # Should use the original caption function we use in the attack because the scaling is slightly different.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    # /96 gives 3 frames
    length = math.ceil(len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) / 96)

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    # model is full_decoder
    optimizer = optim.Adam(full_decoder.parameters(), lr=0.005, betas=(0.9, 0.999))
    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames)
    original = (original.float()).cuda()

    batch = exp_create_batches(frames_to_do=original, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

    tlabel, tmask = produce_t_mask()
    cost = loss(seq_prob, crit)

    optimizer.zero_grad()
    cost.backward()

    original_grads = {}
    for name, parameter in full_decoder.named_parameters():
        original_grads[name] = parameter.grad

    print(len(original_grads.keys()))
    # for key, value in original_grads.items():
    #     print(key)

    # Adversarial
    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW.avi')

    adv_frames = skvideo.io.vread(adv_path)
    adv_frames = np.float32(adv_frames)
    adv_frames = torch.tensor(adv_frames)
    adv_frames = (adv_frames.float()).cuda()

    batch = exp_create_batches(frames_to_do=adv_frames, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

    tlabel, tmask = produce_t_mask()
    cost = loss(seq_prob, crit)

    optimizer = optim.Adam(full_decoder.parameters(), lr=0.005, betas=(0.9, 0.999))
    optimizer.zero_grad()
    cost.backward()

    adv_grads = {}
    for name, parameter in full_decoder.named_parameters():
        adv_grads[name] = parameter.grad

    # for key, value in adv_grads.items():
    #     print(key)

    print('\n\n\n------')
    for key, value in adv_grads.items():
        if 'weight' in key:
            print(key)

    output = open("s2vt_weightoutput.txt", "w")
    l2norm_layers = []
    for key, value in original_grads.items():
        if 'weight' in key:
            if value is not None:
                adv_weight = adv_grads[key]
                # print(value, adv_weight)
                diff = value - adv_weight
                # Relative L2 change of the gradient (torch.norm keeps this on the GPU).
                net_change = (torch.norm(diff) / torch.norm(value)).item()
                output.write("{}, {}\n".format(key, net_change))
                l2norm_layers.append([key, net_change])
    output.close()
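# ---------------------------------------------------------------------------
# Hedged follow-up sketch (assumption): the top_layers.txt consumed by the layer
# freezing experiment earlier in this listing could be produced from the
# l2norm_layers list computed above, e.g. by keeping the ten layers whose
# gradients changed the most between clean and adversarial inputs.
def write_top_layers_sketch(l2norm_layers, k=10, path="top_layers.txt"):
    ranked = sorted(l2norm_layers, key=lambda kv: kv[1], reverse=True)
    with open(path, "w") as f:
        for name, change in ranked[:k]:
            f.write("{}\n".format(name))
# ---------------------------------------------------------------------------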
# Windowed Carlini attack over the whole video, saved losslessly and re-captioned
# afterwards to verify that the perturbation survives the codec round trip.
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    # Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # /96 gives 3 frames
    length = len(skvideo.io.vread(video_path)) / 96

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(frame_counter, length))

            carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                    target=target_caption, dataset=dataset, window=window)
            finished_frames = carlini.execute(video_path, window=window, functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(frame_counter, length))

            carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                    target=target_caption, dataset=dataset, window=window)
            finished_frames = carlini.execute(video_path, window=window, functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))
    adv_frames = np.concatenate(adv_frames, axis=0)

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(outputfile, outputdict={
        # huffyuv is lossless; r10k is really good.
        # '-c:v': 'libx264',  # use the h.264 codec
        '-c:v': 'huffyuv',  # r210 huffyuv r10k
        # '-pix_fmt': 'rgb32',
        # '-crf': '0',  # set the constant rate factor to 0, which is lossless
        # '-preset': 'ultrafast'  # ultrafast/veryslow; the slower the preset, the better the compression, in principle
    })
    for f in adv_frames:
        writer.writeFrame(f)
    writer.close()

    # Codec comparison (values recorded from earlier runs):
    # ffv1      0.215807946043995
    # huffyuv   0.21578424050191813
    # libx264   0.2341074901578537
    # r210     -0.7831487262059795, -0.7833399258537526
    # gif       0.6889478809555243
    # png       0.2158991440582696, 0.21616862708842177
    # qtrle     0.21581286337807626
    # flashsv   0.21610510459932186, 0.21600030673323545
    # ffvhuff   0.21620682250167533
    # r10k      similar to r210
    # rawvideo  0.21595001

    with torch.no_grad():
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)
        frames = np.float32(frames)

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # loadtxt to load np array from txt
        exp = np.load('difference_tmp.npy')
        print("Is the saved array equal to loaded array for difference: ", np.array_equal(exp, difference))

        frames = frames + difference

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)
        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print("\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".format(
        original_caption, target_caption, adv_caption))
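# ---------------------------------------------------------------------------
# Hedged sketch (assumption, not the project's actual exp_create_batches): a
# tensor-only batching path that keeps the preprocessing differentiable, in
# contrast to the PIL-based create_batches. The input size and normalization
# constants are placeholders; the real values depend on the chosen convnet.
import torch
import torch.nn.functional as F

def exp_create_batches_sketch(frames_to_do, batch_size, input_size=331,
                              mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
    x = torch.as_tensor(frames_to_do, dtype=torch.float32)  # (N, H, W, 3) in [0, 255]
    x = x[:batch_size].permute(0, 3, 1, 2) / 255.0          # (N, 3, H, W) in [0, 1]
    x = F.interpolate(x, size=(input_size, input_size), mode='bilinear', align_corners=False)
    mean = torch.tensor(mean, device=x.device).view(1, 3, 1, 1)
    std = torch.tensor(std, device=x.device).view(1, 3, 1, 1)
    return (x - mean) / std
# ---------------------------------------------------------------------------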
# Attention-guided attack: find the frames with the highest attention weights and
# attack only those, then re-caption the saved adversarial video.
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    length = len(skvideo.io.vread(video_path)) / 8
    print("Total number of frames: {}".format(len(skvideo.io.vread(video_path))))
    print("Total number of frames to do: {}".format(length))

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---
        attn_weights = []
        total_iterations = np.ceil(length / BATCH_SIZE)
        iteration = 1
        frame_counter = 0
        while frame_counter < length:
            if length - frame_counter < BATCH_SIZE:
                batches = create_batches(frames[frame_counter:int(length)], load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + (length - frame_counter)
            else:
                batches = create_batches(frames[frame_counter:frame_counter + BATCH_SIZE - 1],
                                         load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + BATCH_SIZE

            # print(attn.shape, attn[0].shape, type(attn))
            attn = attn.cpu().detach().numpy().tolist()[0]
            print("Weights for batch {}: {}".format(iteration, attn))
            for f in attn:
                attn_weights.append(f)
            iteration = iteration + 1
            # attn_weights.append(attn.cpu().detach().numpy().tolist()[0])

        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference', get_attn=False)
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    print(attn_weights)
    att_window = np.sort(
        np.argpartition(attn_weights, -ATTACK_BATCH_SIZE)[-ATTACK_BATCH_SIZE:]).tolist()
    print("Indices of frames with highest attention weights: {}".format(att_window))

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    # Note: target_caption is only defined in the commented lines below; one must be
    # selected before the attack runs.
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    adv_frames = []

    carlini = CarliniAttack(oracle=full_decoder, video_path=video_path, target=target_caption,
                            dataset=dataset, att_window=att_window)
    finished_frames = carlini.execute(video_path, att_window=att_window, functional=True)
    adv_frames.append(finished_frames.detach().cpu().numpy())

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarial.avi')

    print("\nSaving to: {}".format(adv_path))
    adv_frames = np.concatenate(adv_frames, axis=0)

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(outputfile, outputdict={
        '-vcodec': 'libx264',  # use the h.264 codec
        '-crf': '0',  # set the constant rate factor to 0, which is lossless
        '-vb': '50M',
        '-r': '25',
        '-preset': 'ultrafast'  # the slower the preset, the better the compression, in principle
    })
    for f in adv_frames:
        writer.writeFrame(f)
    print(len(adv_frames))
    # skvideo.io.vwrite(adv_path, adv_frames)
    writer.close()

    with torch.no_grad():
        a_frames = skvideo.io.vread(adv_path)
        # frames = skvideo.io.vread(video_path)
        # for f in range(0, len(att_window)):
        #     frames[att_window[f]] = a_frames[f]

        # frames = frames[:50]
        # frames = adv_frames
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        # plt.imshow(frames[0])
        # plt.show()
        #
        # plt.imshow(adv_frames[0] / 255.)
        # plt.show()

        # bp ---
        batches = create_batches(a_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        adv_caption = sents[0]

    print("\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".format(
        original_caption, target_caption, adv_caption))
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    # convnet = 'vgg16'
    vocab = dataset.get_vocab()

    full_decoder = ConvS2VT(convnet, model, opt)

    # D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\CNN_SaYwh6chmiw_15_40.npy
    videos = {
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19_adversarialWINDOW.avi',
        3: 'O2qiPS2NCeY_2_18_adversarialWINDOW.avi',
        4: 'kI6MWZrl8v8_149_161_adversarialWINDOW.avi',
        5: 'X7sQq-Iu1gQ_12_22_adversarialWINDOW.avi',
        6: '77iDIp40m9E_159_181_adversarialWINDOW.avi',
        7: 'SaYwh6chmiw_15_40_adversarialWINDOW.avi',
        8: 'pFSoWsocv0g_8_17_adversarialWINDOW.avi',
        9: 'HmVPxs4ygMc_44_53_adversarialWINDOW.avi',
        10: 'glii-kazad8_21_29_adversarialWINDOW.avi',
        11: 'AJJ-iQkbRNE_97_109_adversarialWINDOW.avi'
    }

    videos_CNN = {
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # video_path = opt['videos'][0]
    modelname = 'nasnetalarge'
    o_video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos_CNN[2]
    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}Adversarial_'.format(modelname) + \
                 videos_CNN[2]
    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\vgg16Adversarial_SaYwh6chmiw_15_40.avi'
    numpy_path = "D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}CNN_{}.npy".format(
        modelname, videos_CNN[2].split('.')[0])

    adv_frames = np.load(numpy_path)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    print(video_path)

    with torch.no_grad():
        frames = skvideo.io.vread(o_video_path)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Original: ", sents[0])

        frames = skvideo.io.vread(video_path)
        print("Total frames: {}".format(len(frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial huffyuv: ", sents[0])

        np_frames = adv_frames.astype(np.uint8)
        print("Numpy CNN frames \nTotal frames: {}".format(len(np_frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(np_frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(np_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial numpy: ", sents[0])
def main(args, opt):
    testpath = 'D:\\College\\Research\\2019 Video Captioning Attack Conference Paper\\youtube2text_preprocessed_for_arctic_capgen_vid\\youtube2text_iccv15\\dict_movieID_caption.pkl'
    with open(testpath, 'rb') as f:
        data = pickle.load(f, encoding='latin1')
    print(data)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                          opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    # config: batch_size, c, learning rate, num iterations, input shape
    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # model = torch.nn.Sequential(torch.nn.Conv2d(in_channels=3, out_channels=96, kernel_size=3, padding=0, stride=2,
    #                                             bias=False), full_decoder)

    # loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], num_workers=16, shuffle=True)

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)