def __getitem__(self, index):
    """Build the clip for one dataset entry from a single GOP.

    The clip holds the GOP's I-frame (skipped when ``opt.residual_only`` is
    set, in which case the first residual is appended twice) followed by
    the accumulated residuals at GOP positions 1..11.

    Args:
        index (int): Index into ``self.data``.

    Returns:
        tuple: ``(clip, target)``, or ``(clip, target, video_file_name)``
        when ``opt.video_level_accuracy`` is set. ``target`` is the dataset
        entry, passed through ``self.target_transform`` when one is set.
    """
    sample = self.data[index]
    path = sample['video']
    frame_indices = sample['frame_indices']

    # The GOP is chosen from the first requested frame; 12 frames per GOP.
    gop_index = int(frame_indices[0] / 12)

    clip = []
    if not self.opt.residual_only:
        iframe = load(path, gop_index, 0, 0, True)
        clip.append(self.tensor_2_image(iframe))

    for frame_index in range(1, 12):
        residual = load(path, gop_index, frame_index, 2, True)
        # Shift the signed residual by 128 and saturate to the uint8 range.
        residual += 128
        residual = np.minimum(np.maximum(residual, 0), 255).astype(np.uint8)
        residual = self.tensor_2_image(residual)
        clip.append(residual)
        if self.opt.residual_only and frame_index == 1:
            # Double the first residual when the I-frame is skipped.
            clip.append(residual)

    if self.spatial_transform is not None:
        self.spatial_transform.randomize_parameters()
        clip = [self.spatial_transform(img) for img in clip]
    # Frames are stacked on dim 0 and then moved to dim 1.
    clip = torch.stack(clip, 0).permute(1, 0, 2, 3)

    if self.opt.add and not self.opt.mv:
        # Average every later frame with frame 0. The previous loop counted
        # its iterations over dim 0 (not the frame axis, dim 1), so only
        # the first ``clip.shape[0] - 1`` frames were blended.
        clip[:, 1:] = (clip[:, 1:] + clip[:, :1]) / 2

    target = sample
    if self.target_transform is not None:
        target = self.target_transform(target)

    if self.opt.video_level_accuracy:
        return clip, target, sample['video'].split('/')[-1]
    return clip, target
def __getitem__(self, index):
    """Return ``(input, target)`` for one pre-indexed GOP.

    Depending on ``self._representation`` the sample is the I-frame
    (``'iframe'``, GOP position 0), the motion vectors (``'mv'``) or the
    residual (``'r'``), the latter two read at GOP position 6.
    """
    video_path = self.data['video_path'][index]
    gop_index = self.data['gop_index'][index]
    target = self.data['targets'][index]
    kind = self._representation

    if kind == 'iframe':
        picture = load(video_path, gop_index, 0, 0, self._accumulate)
        picture = color_aug(picture)
        # Reverse the channel axis.
        picture = picture[..., ::-1]
        stacked = np.array(self._transform([picture]))
        sample = torch.from_numpy(
            np.transpose(stacked, (0, 3, 1, 2))).float() / 255.0
        sample = (sample - self._input_mean) / self._input_std
    elif kind == 'mv':
        motion = load(video_path, gop_index, 6, 1, self._accumulate)
        motion = clip_and_scale(motion, 20)
        # Shift by 128 and saturate to the uint8 range.
        motion += 128
        motion = np.minimum(np.maximum(motion, 0), 255).astype(np.uint8)
        stacked = np.array(self._transform([motion]))
        sample = torch.from_numpy(
            np.transpose(stacked, (0, 3, 1, 2))).float() / 255.0
        sample = sample - 0.5
    elif kind == 'r':
        resid = load(video_path, gop_index, 6, 2, self._accumulate)
        resid += 128
        resid = np.minimum(np.maximum(resid, 0), 255).astype(np.uint8)
        stacked = np.array(self._transform([resid]))
        sample = torch.from_numpy(
            np.transpose(stacked, (0, 3, 1, 2))).float() / 255.0
        sample = (sample - 0.5) / self._input_std

    return sample, target
def main():
    """Dump motion vectors and residuals of every listed video as PNG files.

    For each entry of the module-level ``video_names`` two folders are
    created below a folder named after the video, and one PNG per frame is
    written into each:

    * motion vectors: offset by 2048 and split into high/low bytes, stored
      as 4 channels ``(high_0, low_0, high_1, low_1)``. Decoding is
      ``((high << 8) + low) - 2048`` per component.
    * residuals: mapped with ``round((r + 256) / 2)`` and saturated to the
      uint8 range.
    """
    for video_name in video_names:
        fold_path = video_name.split('.avi')[0].split('/')[-1]
        path_mv = os.path.join(fold_path, PATH_MV_CONT)
        path_res = os.path.join(fold_path, PATH_RES_CONT)
        if not os.path.exists(path_mv):
            os.makedirs(path_mv)
        if not os.path.exists(path_res):
            os.makedirs(path_res)

        NUM_FRAMES = get_num_frames(video_name)
        print(NUM_FRAMES)

        # Always process at least one GOP, even for very short videos.
        for curGopIdx in range(max(NUM_FRAMES // GOP_FRAMES_NUM, 1)):
            for innerGopIdx in range(GOP_FRAMES_NUM):
                curFrameIdx = curGopIdx * GOP_FRAMES_NUM + innerGopIdx
                print(video_name, curGopIdx, innerGopIdx)
                mvCont_origin = load(video_name, curGopIdx, innerGopIdx, 1, False)
                resCont = load(video_name, curGopIdx, innerGopIdx, 2, False)

                if mvCont_origin is None:
                    # A signed, wide dtype is required: adding 2048 to a
                    # uint8 array raises OverflowError on NumPy >= 2.
                    mvCont_origin = np.zeros([720, 960, 2], dtype=np.int32)
                mvCont = mvCont_origin + 2048
                # (high_h, low_h, high_w, low_w)
                mvPng = np.array([
                    (mvCont[:, :, 0] >> 8) & 0xff,
                    mvCont[:, :, 0] & 0xff,
                    (mvCont[:, :, 1] >> 8) & 0xff,
                    mvCont[:, :, 1] & 0xff,
                ], dtype=np.uint8)
                mvPng = np.transpose(mvPng, [1, 2, 0])
                imsave(path_mv+'/frame'+str(curFrameIdx)+'.png', mvPng)

                if resCont is None:
                    # Same dtype concern as above for the ``+ 256`` below.
                    resCont = np.zeros([720, 960, 3], dtype=np.int32)
                # Saturate before the cast: a rounded value of 256 would
                # otherwise wrap around to 0 in uint8.
                resCont = np.clip(np.round((resCont + 256)/2), 0, 255).astype(np.uint8)
                imsave(path_res+'/frame'+str(curFrameIdx)+'.png', resCont)

        # NOTE(review): the indentation of this call was lost in SOURCE. The
        # file name depends only on the video, so it is written once per
        # video here and ends up holding the last frame's residual - confirm.
        cv2.imwrite(PATH_RES_CONT+fold_path+'.png', resCont)
def _load_video(self, video_name):
    """Decode one ROI-cropped frame per segment and queue 3-frame windows.

    Every window of three consecutive segment frames is appended to
    ``self._frames`` as a list ``[frame_k, frame_k+1, frame_k+2]``.
    """
    # Select which feature to extract (anything unknown falls back to 0).
    representation_idx = {'mv': 1, 'residual': 2}.get(self._representation, 0)

    # Number of whole segments in the video.
    total_frames = get_num_frames(video_name)
    total_segments = total_frames // SEG_SIZE

    # Keep one frame per segment (the original comment says it is the
    # middle frame; that depends on ``_get_frame_index`` - TODO confirm).
    row_span = slice(int(ROI_Y), int(ROI_Y+ROI_HEIGHT))
    col_span = slice(int(ROI_X), int(ROI_X+ROI_WIDTH))
    frames = []
    for segment in range(total_segments):
        gop_idx, gop_pos = self._get_frame_index(total_frames, segment)
        decoded = load(video_name, gop_idx, gop_pos, representation_idx,
                       self._accumulate)
        frames.append(decoded[row_span, col_span])

    # By default one action is recognised from every 3 consecutive segments.
    for window in zip(frames, frames[1:], frames[2:]):
        self._frames.append(list(window))
    frames.clear()
def frame_callback(vis, frame_idx):
    """Show one decoded frame and draw its confident boxes.

    Reads ``video_path``, ``frame_type``, ``accumulate``, ``seq_info`` and
    ``min_confidence`` from the enclosing scope.

    Args:
        vis: Visualizer providing ``set_image`` and ``draw_box``.
        frame_idx (int): 1-based frame number.
    """
    # Number of frames in one group; set to 12 for the raw mpeg4 video.
    GROUP_SIZE = 12
    # GOPs are 0-based while ``frame_idx`` starts from 1.
    gop_idx = int((frame_idx - 1) / GROUP_SIZE)
    in_group_idx = int((frame_idx - 1) % GROUP_SIZE)

    image = coviar.load(video_path, gop_idx, in_group_idx, frame_type, accumulate)
    image = compressed_frame_to_show(image, frame_type, tool_type='cv2')
    vis.set_image(image.copy(), frame_idx)

    raw_box = seq_info['boxes']
    # The condition previously tested ``raw_boxes``, a name that is never
    # assigned in this function.
    if raw_box is not None:
        # Column 0 holds the frame number, column 6 the value compared
        # against ``min_confidence``.
        box = raw_box[raw_box[:, 0] == frame_idx]
        box = box[box[:, 6] >= min_confidence]
        # Keeps columns 1..6. The original comment listed them as
        # [target_id, x, y, w, h]; NOTE(review): that names five fields for
        # six columns - confirm the layout.
        box = box[:, 1:7]
        box_list = [box[idx, :] for idx in range(box.shape[0])]
        vis.draw_box(box_list)
def __getitem__(self, index):
    """Return ``(input, label)`` for one video, one frame per segment.

    In training mode a random video is drawn and ``index`` is ignored;
    otherwise the video at ``index`` is used.
    """
    rep = self._representation
    # load() type code: 1 for 'mv', 2 for 'residual', 0 otherwise.
    representation_idx = {'mv': 1, 'residual': 2}.get(rep, 0)

    record = (random.choice(self._video_list) if self._is_train
              else self._video_list[index])
    video_path, label, num_frames = record
    pick_frame = (self._get_train_frame_index if self._is_train
                  else self._get_test_frame_index)

    frames = []
    for seg in range(self._num_segments):
        gop_index, gop_pos = pick_frame(num_frames, seg)
        img = load(video_path, gop_index, gop_pos, representation_idx,
                   self._accumulate)

        if img is None:
            print('Error: loading video %s failed.' % video_path)
            # Zero placeholder: 2 channels for 'mv', 3 otherwise.
            img = np.zeros((256, 256, 2 if rep == 'mv' else 3))
        elif rep in ('mv', 'residual'):
            if rep == 'mv':
                img = clip_and_scale(img, 20)
            # Shift by 128 and saturate to the uint8 range.
            img += 128
            img = np.minimum(np.maximum(img, 0), 255).astype(np.uint8)

        if rep == 'iframe':
            img = color_aug(img)
            # BGR to RGB. (PyTorch uses RGB according to doc.)
            img = img[..., ::-1]

        frames.append(img)

    frames = self._transform(frames)
    batch = np.transpose(np.array(frames), (0, 3, 1, 2))
    tensor = torch.from_numpy(batch).float() / 255.0

    if rep == 'iframe':
        tensor = (tensor - self._input_mean) / self._input_std
    elif rep == 'residual':
        tensor = (tensor - 0.5) / self._input_std
    elif rep == 'mv':
        tensor = tensor - 0.5

    return tensor, label
def process_segment_consecutive(self, frames, gop_index, gop_pos, video_path, representation_idx):
    """Decode one frame, preprocess it and append it to ``frames``.

    Returns the same ``frames`` list that was passed in.
    """
    rep = self._representation
    decoded = load(video_path, gop_index, gop_pos, representation_idx,
                   self._accumulate)

    if decoded is None:
        print('Error: loading video %s failed.' % video_path)
        # Zero placeholder: 2 channels for 'mv', 3 otherwise.
        decoded = np.zeros((256, 256, 2 if rep == 'mv' else 3))
    elif rep in ('mv', 'residual'):
        if rep == 'mv':
            decoded = clip_and_scale(decoded, 20)
        # Shift by 128 and saturate to the uint8 range.
        decoded += 128
        decoded = np.minimum(np.maximum(decoded, 0), 255).astype(np.uint8)

    if rep == 'iframe':
        decoded = color_aug(decoded)
        # BGR to RGB. (PyTorch uses RGB according to doc.)
        decoded = decoded[..., ::-1]

    frames.append(decoded)
    return frames
def load_segment(is_train, num_frames, seg, representation, num_segments, video_path, representation_idx, accumulate):
    """Pick a frame for segment ``seg``, decode it and preprocess it.

    The frame position comes from ``get_train_frame_index`` when
    ``is_train`` is true, otherwise from ``get_test_frame_index``.
    """
    choose = get_train_frame_index if is_train else get_test_frame_index
    gop_index, gop_pos = choose(num_frames, seg, representation, num_segments)
    decoded = load(video_path, gop_index, gop_pos, representation_idx, accumulate)

    if decoded is None:
        print('Error: loading video %s failed.' % video_path)
        # Zero placeholder: 2 channels for 'mv', 3 otherwise.
        decoded = np.zeros((256, 256, 2 if representation == 'mv' else 3))
    elif representation in ('mv', 'residual'):
        if representation == 'mv':
            decoded = clip_and_scale(decoded, 20)
        # Shift by 128 and saturate to the uint8 range.
        decoded += 128
        decoded = np.minimum(np.maximum(decoded, 0), 255).astype(np.uint8)

    if representation == 'iframe':
        if is_train:
            decoded = color_aug(decoded)
        # BGR to RGB. (PyTorch uses RGB according to doc.)
        # NOTE(review): SOURCE lost its indentation; the channel flip is
        # taken to apply in both train and test mode - confirm.
        decoded = decoded[..., ::-1]

    return decoded
def load_frame_from_compressed_video(video_path, frame_id, frame_type, accumulated, group_size=12):
    """Load one frame from a compressed raw video.

    :param video_path: the path to the mp4 raw video
    :param frame_id: int, starts from 1
    :param frame_type: int, 0 for I-frame (also the image), 1 for motion
        vector, 2 for residual
    :param accumulated: bool, determines whether to load the accumulated
        motion vector or residual
    :param group_size: GOP size, default is 12
    :return: ndarray, the loaded frame. For I-frame and residual it has
        format BGR; for motion vector the 0-th and 1-th channels are the x
        and y offsets respectively.
    """
    # GOPs are counted from 0 while frame ids start at 1.
    zero_based = frame_id - 1
    group_number = int(zero_based / group_size)
    position_in_group = int(zero_based % group_size)
    return coviar.load(video_path, group_number, position_in_group,
                       frame_type, accumulated)
def _parse_function_v2(filename, label, nSegments):
    """Decode all three representations of one video as float32 arrays.

    For each representation index (0, 1, 2) one frame per segment is
    decoded, normalised, and cropped to rows 16:240 and columns 52:276.

    Args:
        filename (bytes): Encoded path of the video.
        label: Passed through unchanged.
        nSegments (int): Number of segments to sample per representation.

    Returns:
        tuple: ``(rep0, rep1, rep2, label)``.
    """
    # The path and the frame count do not change between segments or
    # representations, so decode and query them once.
    video_path = filename.decode()
    nFrames = get_num_frames(video_path)

    reps_np = []
    for representation_idx in range(0, 3):
        frames = []
        for seg_idx in range(0, nSegments):
            gop_index, gop_pos = getTrainFrameIndex(nFrames, seg_idx, nSegments,
                                                    representation_idx)
            img = load(video_path, gop_index, gop_pos, representation_idx, True)

            if img is None:
                img = np.zeros((256, 256, 3))
            elif representation_idx == 1:
                # Scale by 127.5 / 20, shift by 128, saturate to uint8.
                img = (img * (127.5 / 20)).astype(np.int32)
                img += 128
                img = (np.minimum(np.maximum(img, 0), 255)).astype(np.uint8)
                # Append a zero channel so every representation has the
                # same number of channels.
                img = np.append(img, np.zeros_like(img[..., 0, None]), axis=-1)
            elif representation_idx == 2:
                img += 128
                img = (np.minimum(np.maximum(img, 0), 255)).astype(np.uint8)
            else:
                img = img[..., ::-1]  # flipping to RGB

            frames.append(img)

        np_frames = np.array(frames).astype(np.float32) / 255.0
        if representation_idx == 0:
            np_frames = (np_frames - DATA_MEAN) / DATA_STD
        elif representation_idx == 2:
            np_frames = (np_frames - 0.5) / DATA_STD
        elif representation_idx == 1:
            np_frames = (np_frames - 0.5)

        np_frames = np_frames[:, 16:240, 52:276, :].astype(np.float32)
        reps_np.append(np_frames)

    return reps_np[0], reps_np[1], reps_np[2], label
def video2mv(path_to_video, target_directory):
    """Pickle the accumulated motion vectors of every frame of a video.

    One ``<index>.pkl`` file (index zero-padded to 6 digits) is written to
    ``target_directory`` per frame, 12 frames per GOP, and the time spent
    on each frame is printed.

    Args:
        path_to_video: Path handed to ``coviar``.
        target_directory: Existing directory that receives the pickles.
    """
    num_group = coviar.get_num_gops(path_to_video)
    for i in range(0, num_group):
        for j in range(0, 12):
            idx = i * 12 + j
            start = timer()
            mv = coviar.load(path_to_video, i, j, 1, True)
            mv_path = target_directory + '/' + '%06d' % idx + '.pkl'
            # Close the file deterministically instead of leaking the handle.
            with open(mv_path, 'wb') as handle:
                pickle.dump(mv, handle)
            end = timer()
            print(timedelta(seconds=end - start))
def video2mv_collection(path_to_video, target_directory, collection):
    """Pickle the int8 motion vectors of selected frames and verify them.

    On the first failure the target path is appended to the module-level
    ``error_recorder`` and the function returns without processing the
    remaining indices.

    Args:
        path_to_video: Path handed to ``coviar.load``.
        target_directory: Existing directory that receives the pickles.
        collection: Iterable of frame indices (anything ``int()`` accepts).
    """
    for idx in collection:
        idx_int = int(idx)
        group_idx = idx_int // 12
        # NOTE(review): the +5 offset can produce positions above 11 -
        # confirm this is intended.
        frame_idx = idx_int % 12 + 5
        # Built before the try block so the error record always names the
        # file of the frame that actually failed.
        mv_path = target_directory + '/' + '%06d' % idx_int + '.pkl'
        try:
            mv = coviar.load(path_to_video, group_idx, frame_idx, 1, True)
            mv = mv.astype('int8')
            with open(mv_path, 'wb') as handle:
                pickle.dump(mv, handle, protocol=2)
        except Exception:
            print("Error\n\n\n\n\n")
            error_recorder.append(mv_path)
            return
        # Round-trip check of the file that was just written.
        loaded_mv = load_mv(mv_path)
        assert (loaded_mv == mv).all()
def video2mv_collection(path_to_video, target_directory, startIdx, seg_len):
    """Pickle the int8 motion vectors of the first ``seg_len`` frames.

    Files are named by the frame's position in the video (zero-padded to 6
    digits). On the first failure the target path is appended to the
    module-level ``error_recorder`` and the function returns.

    Args:
        path_to_video: Path handed to ``coviar.load``.
        target_directory: Existing directory that receives the pickles.
        startIdx: Currently unused; file names do not include this offset.
        seg_len (int): Number of frames to export.
    """
    for idx in range(seg_len):
        group_idx = idx // 12
        frame_idx = idx % 12
        # Built before the try block so the error record always names the
        # file of the frame that actually failed.
        mv_path = target_directory + '/' + '%06d' % (idx) + '.pkl'
        try:
            mv = coviar.load(path_to_video, group_idx, frame_idx, 1, True)
            mv = mv.astype('int8')
            with open(mv_path, 'wb') as handle:
                pickle.dump(mv, handle, protocol=2)
        except Exception:
            print("Error\n\n\n\n\n")
            error_recorder.append(mv_path)
            return
        # Round-trip check of the file that was just written.
        loaded_mv = load_mv(mv_path)
        assert (loaded_mv == mv).all()
def __getitem__(self, index):
    """Return ``(input, label)`` for one video, one frame per segment.

    In training mode a random video is drawn and ``index`` is ignored.
    The returned tensor has its first two dimensions swapped relative to
    the per-frame batch layout, and its shape is printed.
    """
    rep = self._representation
    # NOTE(review): every representation other than 'mv'/'residual' maps
    # to type code 3 here - confirm that load() accepts it.
    representation_idx = {'mv': 1, 'residual': 2}.get(rep, 3)

    # Training: pick a random video from the list.
    record = (random.choice(self._video_list) if self._is_train
              else self._video_list[index])
    video_path, label, num_frames = record
    pick_frame = (self._get_train_frame_index if self._is_train
                  else self._get_test_frame_index)

    frames = []
    for seg in range(self._num_segments):
        gop_index, gop_position = pick_frame(num_frames, seg)
        img = load(video_path, gop_index, gop_position, representation_idx,
                   self._accumulate)

        if img is None:
            print('Error: loading compressed video {} failed.'.format(
                video_path))
            # Zero placeholder: 2 channels for 'mv', 3 otherwise.
            img = np.zeros((256, 256, 2 if rep == 'mv' else 3))
        elif rep in ('mv', 'residual'):
            if rep == 'mv':
                img = clip_and_scale(img, 20)
            # Shift by 128 and saturate to the uint8 range.
            img += 128
            img = np.minimum(np.maximum(img, 0), 255).astype(np.uint8)

        if rep == 'iframe':
            img = color_aug(img)
            # BGR to RGB
            img = img[..., ::-1]

        frames.append(img)

    frames = self._transform(frames)
    batch = np.transpose(np.array(frames), (0, 3, 1, 2))
    tensor = torch.from_numpy(batch).float() / 255.0

    if rep == 'iframe':
        tensor = (tensor - self._input_mean) / self._input_std
    elif rep == 'residual':
        tensor = (tensor - 0.5) / self._input_std
    elif rep == 'mv':
        tensor = tensor - 0.5

    # Keep the dimension order consistent with raw_data: transpose reorders
    # ndarray axes, permute reorders tensor axes.
    tensor = tensor.permute(1, 0, 2, 3)
    print(tensor.shape)
    return tensor, label
def _perturbation_image(model, original_image, ori_label, video_path,
                        save_path, transform_post, args, config, device):
    """Iteratively perturb ``original_image`` until the predicted label changes.

    Each iteration warps random noise with a grid derived from a motion
    vector of ``video_path``, queries ``model`` three times (two probes
    around ``prior`` plus the current image), updates ``prior`` from the
    finite difference of the two probe losses, and steps the image against
    ``prior.sign()``. The image is kept within +-0.03 of the input and
    inside [0, 1].

    NOTE(review): SOURCE had lost its indentation; the nesting below was
    reconstructed from the statement order - confirm against the upstream
    file. ``save_path`` and ``config`` are not used by the active code.

    Returns:
        tuple: ``(pred_adv_label, num_query, success)``; ``success`` is
        True when the loop ended because the prediction differs from
        ``ori_label``.
    """
    original_image = original_image.to(device)
    total_frames = get_num_frames(video_path)
    # Untouched copy used as the centre of the +-0.03 box.
    # NOTE(review): the original comment gave the shape as
    # torch.Size([1, 3, 72, 84, 84]), but the next line unpacks 4 dims.
    original_image_ = original_image.clone()
    num_frame, channel, height, width = original_image.shape
    dim = height * width * channel
    loop = 0
    inner_loop = 0
    success = False
    num_query = 0
    num_pframe = 0
    max_query = 60000
    exploration = 0.1
    fd_eta = 0.1
    online_lr = 0.1
    flow_lr = 0.025
    target_label = (ori_label + 1) % args.num_classes
    '''
    while target_label == ori_label:
        target_label = torch.tensor([random.sample(range(174), 1)[0]]).cuda()
    '''
    motion_vector = list()
    prior = torch.zeros(num_frame, channel, height, width).to(device)
    delta = torch.zeros(num_frame, channel, height, width).to(device)
    est_grad = torch.zeros(num_frame, channel, height, width).to(device)
    # Slots 0 and 1 hold the two probes, slot 2 the current image.
    adv_img = torch.zeros(3, num_frame, channel, height, width).to(device)
    iframe = torch.zeros(num_frame, height, width, channel).to(device)
    noise_frames = torch.zeros(num_frame, channel, height, width).to(device)
    index_visual = torch.zeros(num_frame, 2, height, width).to(device)
    index_motion = torch.zeros(num_frame, height, width, 2).to(device)
    while not (num_query > max_query):
        pred_adv_logit = list()
        start1 = time.time()
        end_index = total_frames // GOP_SIZE
        # Refresh the sampling grid every ``args.interval`` iterations.
        if loop % args.interval == 0:  # can also try 8 for tsn2d
            # Cycle through the GOPs; read the motion vector at position 11.
            mv_index = inner_loop % end_index
            mv = load(video_path, mv_index, 11, 1, True)
            # Shift so the minimum is 0, then pad with a zero third channel.
            mv = mv - mv.min()
            mv = np.dstack((mv, np.zeros((mv.shape[:2] + (1, )))))
            mv = [mv.astype(np.uint8)] * num_frame
            inner_loop += 1
            motion_vector = transform_post(mv)
            motion_vector = np.stack(motion_vector, axis=0) * 255
            motion_vector = torch.from_numpy(motion_vector).permute(
                0, 2, 3, 1).float().to(device)
            # Rescale both components before they are used as a sampling
            # grid (maps [0, size - 1] onto [-1, 1]).
            motion_vector[:, :, :, 0] = (2 * motion_vector[:, :, :, 0] -
                                         height + 1.) / (height - 1.)
            motion_vector[:, :, :, 1] = (2 * motion_vector[:, :, :, 1] -
                                         width + 1.) / (width - 1.)
        # One noise image repeated for every frame, then warped by the grid.
        noise_frames = torch.randn(1, 3, height, width).repeat(num_frame, 1, 1, 1).to(device)
        noise_frames = F.grid_sample(noise_frames, motion_vector[:, :, :, :2])
        exp_noise = exploration * noise_frames
        q1 = prior + exp_noise
        q2 = prior - exp_noise
        adv_img[0] = original_image + fd_eta * q1 / norm2(q1)
        adv_img[1] = original_image + fd_eta * q2 / norm2(q2)
        adv_img[2] = original_image
        for i in range(3):
            img_group = normalization(adv_img[i].clone().cpu().numpy(), args)
            tmp_result = model(img_group.astype('float32', copy=False))
            tmp_result = FF.mean(tmp_result, axis=0, keepdims=True)
            tmp_result = torch.from_numpy(tmp_result.asnumpy()).to(device)
            pred_adv_logit.append(tmp_result)
        l1, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[0], ori_label, target_label, delta)
        l2, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[1], ori_label, target_label, delta)
        loss, target, real, other, other_class, second_logit, second_class = _pert_loss(
            pred_adv_logit[2], ori_label, target_label, delta)
        num_query += 3
        # Finite-difference estimate along the noise direction.
        est_deriv = (l1 - l2) / (fd_eta * exploration * exploration)
        est_grad = est_deriv.item() * exp_noise
        prior += online_lr * est_grad
        original_image = original_image - flow_lr * prior.sign()
        delta = original_image_ - original_image
        tmp_norm = norm2(delta)
        # Project back into the +-0.03 box around the input, then into [0, 1].
        original_image = torch.max(
            torch.min(original_image, original_image_ + 0.03),
            original_image_ - 0.03)
        original_image = torch.clamp(original_image, 0, 1)
        pred_adv_label = pred_adv_logit[2].argmax()
        if (loop % 1000 == 0) or (loop == max_query) or pred_adv_label != ori_label:
            print('[T2]{:.3f}s for [{}]-th loop\t'
                  'Queries {:03d}\t'
                  'Overall loss {:.3f}\t'
                  'est_deriv {:.3f}\t'
                  'Target {}\t'
                  'Target logit {:.3f}\t'
                  'ori logit {:.3f}\t'
                  'ori class {}\t'
                  'second logit {:.3f}\t'
                  'second class {}\t'.format(time.time() - start1, loop,
                                             num_query, loss, est_deriv.item(),
                                             target, real, other, other_class,
                                             second_logit, second_class))
        loop += 1
        # Stop as soon as the prediction differs from the original label.
        if pred_adv_label != ori_label:
            diff = adv_img[2] - original_image_
            print('diff max {:.3f}, diff min {:.3f}'.format(
                diff.max(), diff.min()))
            success = True
            break
        # Query budget exhausted.
        if num_query >= max_query:
            break
    return pred_adv_label, num_query, success
def __getitem__(self, index):
    """Return ``(input, label)`` holding every GOP of one video.

    For each GOP the I-frame (position 0), the motion vectors and the
    residual (both at position 6) are preprocessed and concatenated along
    dim 1; the per-GOP tensors are then concatenated along dim 0.

    In training mode a random video is drawn and ``index`` is ignored.

    Raises:
        ValueError: If the video holds fewer than ``GOP_SIZE`` frames, so
            there is no GOP to load (previously this surfaced as an
            unbound-variable error at the return statement).
    """
    if self._is_train:
        video_path, label, num_frames = random.choice(self._video_list)
    else:
        video_path, label, num_frames = self._video_list[index]

    num_gop = num_frames // GOP_SIZE
    if num_gop < 1:
        raise ValueError(
            'video %s has %d frames, fewer than one GOP of %d frames'
            % (video_path, num_frames, GOP_SIZE))

    per_gop = []
    for gop in range(num_gop):
        img_i = load(video_path, gop, 0, 0, self._accumulate)
        img_i = color_aug(img_i)
        # Reverse the channel axis.
        img_i = img_i[..., ::-1]

        img_m = load(video_path, gop, 6, 1, self._accumulate)
        img_m = clip_and_scale(img_m, 20)
        # Shift by 128 and saturate to the uint8 range.
        img_m += 128
        img_m = (np.minimum(np.maximum(img_m, 0), 255)).astype(np.uint8)

        img_r = load(video_path, gop, 6, 2, self._accumulate)
        img_r += 128
        img_r = (np.minimum(np.maximum(img_r, 0), 255)).astype(np.uint8)

        # Transforms run in the same i, m, r order as before.
        frames_i = np.array(self._transform_i([img_i]))
        frames_m = np.array(self._transform_m([img_m]))
        frames_r = np.array(self._transform_r([img_r]))

        frames_i = np.transpose(frames_i, (0, 3, 1, 2))
        frames_m = np.transpose(frames_m, (0, 3, 1, 2))
        frames_r = np.transpose(frames_r, (0, 3, 1, 2))

        input_i = torch.from_numpy(frames_i).float() / 255.0
        input_m = torch.from_numpy(frames_m).float() / 255.0
        input_r = torch.from_numpy(frames_r).float() / 255.0

        input_i = (input_i - self._input_mean) / self._input_std
        input_m = (input_m - 0.5)
        input_r = (input_r - 0.5) / self._input_std

        per_gop.append(torch.cat((input_i, input_m, input_r), 1))

    # One concatenation at the end instead of re-copying the growing
    # tensor on every iteration.
    return torch.cat(per_gop, 0), label
def mploader(video_prefix, vid_subpath, sampled_idxs, modality, accumulate,
             ds_factor, _mv_minmaxnorm, mv_loadimg):
    """Load the sampled frames of one video for the requested modality.

    Args:
        video_prefix: Root of the frame images; also selects the dataset
            specific video and pre-extracted-image locations.
        vid_subpath: Video path relative to the dataset roots, no extension.
        sampled_idxs: 0-based frame indices to load.
        modality: ``'mv'``, ``'res'``, ``'I'`` or ``'flow+mp4'``.
        accumulate: Forwarded to ``coviar.load``.
        ds_factor: Unused in this function.
        _mv_minmaxnorm: When 1, motion vectors go through
            ``clip_and_scale(img, 20)`` before the +128 shift.
        mv_loadimg: When true, pre-extracted JPEGs are preferred and the
            compressed video is only read for frames whose JPEG is missing.

    Returns:
        list: One uint8 array per sampled frame; ``None`` for an
        unrecognised ``modality``.

    NOTE(review): SOURCE had lost its indentation; nesting was rebuilt from
    the data flow (stored MV images are re-centred with -128 and then share
    the +128 path; stored residual images skip the +128 shift) - confirm.
    """

    def _load_compressed(gop_index, gop_pos, representation_idx):
        # Imported on first use. The import used to run only when
        # ``mv_loadimg`` was false, which left ``load`` unbound for the
        # missing-JPEG fallback when ``mv_loadimg`` was true.
        from coviar import load
        return load(video_path, gop_index, gop_pos, representation_idx,
                    accumulate)

    def _frame_file(prefix, stem, frame_idx):
        # Image files are numbered from 1.
        return os.path.join(prefix, vid_subpath,
                            '{}_{:05d}.jpg'.format(stem, frame_idx + 1))

    image_loader = get_default_image_loader()
    mv = []
    res = []
    video = []

    # change this part according your relative path between frame image and
    # videos and stored MV and R
    if video_prefix == '/HMDB51/fb/TSN_input/':
        video_path = os.path.join('/HMDB51/fb/videos_mpeg4/',
                                  vid_subpath + '.mp4')
        video_prefix2 = '/' if not mv_loadimg else '/HMDB51/fb/MC_input/'
    elif video_prefix == '/UCF101/TSN_input/':
        video_path = os.path.join('/UCF101/fb/mpeg4_videos/',
                                  vid_subpath + '.mp4')
        video_prefix2 = '/' if not mv_loadimg else '/UCF101_MC_input/'

    index_pos = [get_gop_pos(frame_idx, 'mv') for frame_idx in sampled_idxs]

    if modality == 'mv':
        for j, item in enumerate(index_pos):
            image_path = _frame_file(video_prefix2, 'mv_x', sampled_idxs[j])
            if mv_loadimg and os.path.exists(image_path):
                # ``np.float`` was removed in NumPy 1.24; it was an alias
                # of the builtin float, i.e. float64.
                img = np.asarray(
                    image_loader(
                        image_path, 'flow',
                        _frame_file(video_prefix2, 'mv_y',
                                    sampled_idxs[j]))).astype(np.float64)
                img += -128
            else:
                gop_index, gop_pos = item
                img = _load_compressed(gop_index, gop_pos, 1)
            if _mv_minmaxnorm == 1:
                img = clip_and_scale(img, 20)  # scale values from +-20 to +-127.5
            img += 128
            img = (np.minimum(np.maximum(img, 0), 255))
            mv.append(img.astype(np.uint8))
        return mv

    elif modality == 'res':
        for j, item in enumerate(index_pos):
            image_path = _frame_file(video_prefix2, 'res', sampled_idxs[j])
            if mv_loadimg and os.path.exists(image_path):
                img = np.asarray(image_loader(image_path, 'rgb'))
            else:
                gop_index, gop_pos = item
                img = _load_compressed(gop_index, gop_pos, 2)
                img += 128
                img = (np.minimum(np.maximum(img, 0), 255))
            res.append(img.astype(np.uint8))
        return res

    elif modality == 'I':
        for item in index_pos:
            gop_index, gop_pos = item
            # Always reads GOP position 0, type 0, of the frame's GOP.
            img = _load_compressed(gop_index, 0, 0)
            img = img[..., ::-1]
            res.append(img.astype(np.uint8))
        return res

    elif modality == 'flow+mp4':
        for i in range(len(sampled_idxs)):
            image_path = _frame_file(video_prefix, 'flow_x', sampled_idxs[i])
            # NOTE(review): when the flow image is missing, ``flow`` keeps
            # the previous iteration's value (or is unbound on the first
            # frame) - confirm the intended handling.
            if os.path.exists(image_path):
                flow = image_loader(
                    image_path, 'flow',
                    _frame_file(video_prefix, 'flow_y', sampled_idxs[i]))

            image_path = _frame_file(video_prefix2, 'mv_x', sampled_idxs[i])
            if mv_loadimg and os.path.exists(image_path):
                img = image_loader(
                    image_path, 'flow',
                    _frame_file(video_prefix2, 'mv_y',
                                sampled_idxs[i])).astype(np.float64)
                img += -128
            else:
                gop_index, gop_pos = index_pos[i]
                img = _load_compressed(gop_index, gop_pos, 1)
            if _mv_minmaxnorm == 1:
                img = clip_and_scale(img, 20)  # scale values from +-20 to +-127.5
            img += 128
            img = (np.minimum(np.maximum(img, 0), 255))
            flow = np.concatenate((flow, img), axis=2)

            image_path = _frame_file(video_prefix2, 'res', sampled_idxs[i])
            if mv_loadimg and os.path.exists(image_path):
                img = image_loader(image_path, 'rgb')
            else:
                gop_index, gop_pos = index_pos[i]
                img = _load_compressed(gop_index, gop_pos, 2)
                img += 128
                img = (np.minimum(np.maximum(img, 0), 255))
            flow = np.concatenate((flow, img), axis=2)
            video.append(flow.astype(np.uint8))
        return video
return img file_path = "output.mp4" totalTime = timedelta(seconds=0) count = 0 for i in range(0, 10): for j in range(0, 12): # Added count += 1 start = timer() image = coviar.load(file_path, i, j, 0, True) #plt.imshow(image) #plt.savefig(str(i)+'_' +str(j) +'_base.jpg') # Added end = timer() totalTime += timedelta(seconds=end - start) print(timedelta(seconds=end - start)) image = coviar.load(file_path, i, j, 1, True) #image = flow_to_img(image) #plt.imshow(image) #plt.savefig(str(i)+'_' +str(j) +'_mv.jpg') image = coviar.load(file_path, i, j, 2, True) image = np.asarray(image)
import argparse

import numpy as np
import skimage.io as io
from coviar import load


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` would turn every non-empty string, including "False",
    into True. The empty string still maps to False as it did before.
    """
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % value)


if __name__ == '__main__':
    # Decode one frame with coviar and print basic statistics about it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--load_dir", type=str)
    parser.add_argument("--gop_idx", type=int, default=0)
    parser.add_argument("--frame_idx", type=int, default=0)
    parser.add_argument("--repr", type=int, default=0)
    parser.add_argument("--accumulate", type=_str2bool, default=True)
    parser.add_argument("--save_dir", type=str, default="./frame.png")
    args = parser.parse_args()

    frame = load(args.load_dir, args.gop_idx, args.frame_idx, args.repr,
                 args.accumulate)
    # Maximum value, all negative entries, dtype and shape of the frame.
    print(np.max(frame), frame[frame < 0], frame.dtype, frame.shape)
    # io.imsave(args.save_dir, frame)
def _get_image_blob(roidb, target_size, im_type=('im', 'residual', 'mv')):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Every roidb entry is decoded from the compressed video that belongs to
    its frame path. The per-image array has 8 channels: 0:3 image, 3:5
    motion vector, 5:8 residual; channels of representations not listed in
    ``im_type`` stay zero.

    Args:
        roidb: Sequence of dicts with at least the keys ``'image'``,
            ``'dataset_name'`` and ``'flipped'``.
        target_size: Per-image target size, indexed like ``roidb``.
        im_type: Representations to load; any container of ``'im'``,
            ``'mv'`` and ``'residual'`` (the default is now an immutable
            tuple instead of a shared list).

    Returns:
        tuple: ``(blob, im_scales)``.

    Raises:
        RuntimeError: If the derived video file does not exist.
        ValueError: If ``dataset_name`` matches no known dataset.
            Previously the video path of the preceding image was silently
            reused in that case.
    """
    # Function-scope imports, executed once instead of once per image.
    import os
    import coviar

    processed_ims = []
    im_scales = []
    for i, entry in enumerate(roidb):
        frame_path = entry['image']
        # e.g. '/data0/liuqk/MOTChallenge/2DMOT2015/train/ETH-Bahnhof/img1/000229.jpg'
        # Hard-coded substitution of one specific KITTI-17 frame.
        if frame_path == '/data0/liuqk/MOTChallenge/2DMOT2015/train/KITTI-17/img1/000145.jpg':
            frame_path = '/data0/liuqk/MOTChallenge/2DMOT2015/train/KITTI-17/img1/000144.jpg'
        frame_path_info = frame_path.split('/')

        if 'motchallenge' in entry['dataset_name']:
            # ['', 'data0', 'liuqk', 'MOTChallenge', '2DMOT2015', 'train',
            #  'ETH-Bahnhof', 'img1', '000229.jpg']
            # The video sits next to the image folder, named after the
            # sequence folder.
            seq_path = os.path.join('/', *frame_path_info[:-2])
            video_path = os.path.join(seq_path, frame_path_info[-3] + '.mp4')
        elif 'citypersons' in entry['dataset_name']:
            # ['', 'data0', 'liuqk', 'Cityscapes', 'citysacpesdataset',
            #  'leftImg8bit', 'train', 'tubingen',
            #  'tubingen_000082_000019_leftImg8bit', '000001.png']
            # The video sits next to the frame, named after its folder.
            seq_path = os.path.join('/', *frame_path_info[:-1])
            video_path = os.path.join(seq_path, frame_path_info[-2] + '.mp4')
        else:
            raise ValueError('unsupported dataset_name: %r'
                             % (entry['dataset_name'],))

        if not os.path.exists(video_path):
            raise RuntimeError(video_path + ' does not exists')

        # The first six characters of the file name are the frame number.
        frame_id = int(frame_path_info[-1][0:6])
        # GOPs are 0-based while frame ids start from 1; 12 frames per GOP.
        gop_idx = int((frame_id - 1) / 12)
        in_group_idx = int((frame_id - 1) % 12)

        if 'im' in im_type:
            im = coviar.load(video_path, gop_idx, in_group_idx, 0, True)
            if len(im.shape) == 2:
                # Gray image: replicate to three channels.
                im = im[:, :, np.newaxis]
                im = np.concatenate((im, im, im), axis=2)
            im, im_scale = prep_im_for_blob(im=im,
                                            pixel_normal_scale=cfg.PIXEL_NORMAL_SCALE,
                                            pixel_means=cfg.PIXEL_MEANS,
                                            pixel_stds=cfg.PIXEL_STDS,
                                            target_size=target_size[i],
                                            channel=cfg.PIXEL_CHANNEL)
            if entry['flipped']:
                im = im[:, ::-1, :]
            im_shape = im.shape
        else:
            im = None

        if 'mv' in im_type:
            mv = coviar.load(video_path, gop_idx, in_group_idx, 1, True)
            mv, im_scale = prep_mv_for_blob(im=mv,
                                            mv_normal_scale=cfg.MV_NORMAL_SCALE,
                                            mv_means=cfg.MV_MEANS,
                                            mv_stds=cfg.MV_STDS,
                                            target_size=target_size[i],
                                            channel=cfg.MV_CHANNEL)
            if entry['flipped']:
                mv = mv[:, ::-1, :]
                # Mirroring along axis 1 also negates channel 0.
                mv[:, :, 0] = - mv[:, :, 0]
            im_shape = mv.shape
        else:
            mv = None

        if 'residual' in im_type:
            residual = coviar.load(video_path, gop_idx, in_group_idx, 2, True)
            # check whether it is a gray image
            if len(residual.shape) == 2:
                residual = residual[:, :, np.newaxis]
                residual = np.concatenate((residual, residual, residual), axis=2)
            residual, im_scale = prep_residual_for_blob(im=residual,
                                                        pixel_normal_scale=cfg.RESIDUAL_NORMAL_SCALE,
                                                        pixel_means=cfg.RESIDUAL_MEANS,
                                                        pixel_stds=cfg.RESIDUAL_STDS,
                                                        target_size=target_size[i],
                                                        channel=cfg.RESIDUAL_CHANNEL)
            if entry['flipped']:
                residual = residual[:, ::-1, :]
            im_shape = residual.shape
        else:
            residual = None

        im_data = np.zeros((im_shape[0], im_shape[1], 3+2+3))
        if im is not None:
            im_data[:, :, 0:3] = im
        if mv is not None:
            im_data[:, :, 3:5] = mv
        if residual is not None:
            im_data[:, :, 5:8] = residual

        im_scales.append(im_scale)
        processed_ims.append(im_data)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims)
    return blob, im_scales
mv = mv.astype('int8') mv_path = '%06d' % idx_int mv_path = target_directory + '/' + mv_path + '.pkl' pickle.dump(mv, open(mv_path, 'wb'), protocol=2) except: print("Error\n\n\n\n\n") error_recorder.append(mv_path) return loaded_mv = load_mv(mv_path) assert (loaded_mv == mv).all() for idx in range(num_group): for j in range(12): mv = coviar.load(path_to_video, idx, j, 1, True) mv = mv.astype('int8') mv_path = '%06d' % (idx * 12 + j) mv_path = mv_path + '.pkl' pickle.dump(mv, open(mv_path, 'wb'), protocol=2) for idx in collection: idx_int = int(idx) group_idx = idx_int // 12 frame_idx = idx_int % 12 + 5 try: mv = coviar.load(path_to_video, group_idx, frame_idx, 1, True) mv = mv.astype('int8') mv_path = '%06d' % idx_int mv_path = target_directory + '/' + mv_path + '.pkl' pickle.dump(mv, open(mv_path, 'wb'), protocol=2)
def __getitem__(self, index):
    """Return ``(input, label)`` for one video, one frame per segment.

    In training mode a random video is drawn and ``index`` is ignored;
    otherwise the video at ``index`` is used.
    """
    rep = self._representation
    # load() type code: 1 for 'mv', 2 for 'residual', 0 otherwise.
    representation_idx = {'mv': 1, 'residual': 2}.get(rep, 0)

    record = (random.choice(self._video_list) if self._is_train
              else self._video_list[index])
    video_path, label, num_frames = record
    pick_frame = (self._get_train_frame_index if self._is_train
                  else self._get_test_frame_index)

    frames = []
    for seg in range(self._num_segments):
        gop_index, gop_pos = pick_frame(num_frames, seg)
        img = load(video_path, gop_index, gop_pos, representation_idx,
                   self._accumulate)

        if img is None:
            print('Error: loading video %s failed.' % video_path)
            # Zero placeholder: 2 channels for 'mv', 3 otherwise.
            img = np.zeros((256, 256, 2 if rep == 'mv' else 3))
        elif rep in ('mv', 'residual'):
            if rep == 'mv':
                img = clip_and_scale(img, 20)
            # Shift by 128 and saturate to the uint8 range.
            img += 128
            img = np.minimum(np.maximum(img, 0), 255).astype(np.uint8)

        if rep == 'iframe':
            img = color_aug(img)
            # BGR to RGB. (PyTorch uses RGB according to doc.)
            img = img[..., ::-1]

        frames.append(img)

    frames = self._transform(frames)
    batch = np.transpose(np.array(frames), (0, 3, 1, 2))
    tensor = torch.from_numpy(batch).float() / 255.0

    if rep == 'iframe':
        tensor = (tensor - self._input_mean) / self._input_std
    elif rep == 'residual':
        tensor = (tensor - 0.5) / self._input_std
    elif rep == 'mv':
        tensor = tensor - 0.5

    return tensor, label
def __getitem__(self, index):
    """Return optical-flow, motion-vector and residual tensors for a video.

    In training mode ``index`` is ignored and a video is drawn at random
    from ``self._video_list``; otherwise the entry at ``index`` is used.
    For every segment the pre-computed flow images, the compressed-domain
    representation selected by ``self._representation`` and the residual
    are loaded, concatenated along the channel axis, transformed together
    and split again.

    Args:
        index (int): Position in ``self._video_list`` (evaluation only).

    Returns:
        tuple: ``(input_flow, input_mv, input_residual, label)``. When
        ``self._viz`` is set and the dataset is not in training mode the
        tuple is extended with the x-flow path and y-flow path of the last
        segment, the RGB image path of the first segment and the class
        name taken from the flow path.

    Raises:
        ValueError: If ``self._flow_folder`` is not ``'tvl1'``, the only
            flow layout this method knows how to read.
    """
    if self._representation == 'mv':
        representation_idx = 1
    elif self._representation == 'residual':
        representation_idx = 2
    else:
        representation_idx = 0

    if self._is_train:
        video_path, label, num_frames = random.choice(self._video_list)
    else:
        video_path, label, num_frames = self._video_list[index]

    # The flow directory depends only on the video, not on the segment.
    flow_path = video_path_to_flow_path(self._flow_root, video_path)

    frames = []
    idx_first = -99999  # sentinel: frame index of the first segment not set yet
    for seg in range(self._num_segments):
        if self._is_train:
            gop_index, gop_pos = self._get_train_frame_index(
                num_frames, seg)
        else:
            gop_index, gop_pos = self._get_test_frame_index(
                num_frames, seg)

        if self._flow_folder != 'tvl1':
            # Previously this fell through to an unbound-variable error.
            raise ValueError(
                'Unsupported flow folder %r: only \'tvl1\' is handled.'
                % (self._flow_folder,))

        flow_tmpl = 'flow_{0}_{1:05d}.jpg'
        # Flow files are numbered from 1, GOP positions from 0.
        idx = gop_index * GOP_SIZE + gop_pos + 1
        if idx_first == -99999:
            idx_first = idx
        # Read the pre-computed optical flow along the x and y dimensions.
        x_img = np.array(
            Image.open(os.path.join(flow_path,
                                    flow_tmpl.format('x', idx))).convert('L'))
        y_img = np.array(
            Image.open(os.path.join(flow_path,
                                    flow_tmpl.format('y', idx))).convert('L'))
        flow = np.stack([x_img, y_img], axis=-1)

        # Load the compressed-domain representation and pre-process it.
        mv = load(video_path, gop_index, gop_pos, representation_idx,
                  self._accumulate)
        if mv is None:
            print('Error: loading video %s failed.' % video_path)
            mv = np.zeros(
                (256, 256, 2)) if self._representation == 'mv' else np.zeros(
                    (256, 256, 3))
        else:
            if self._representation == 'mv':
                if self._mv_minmaxnorm == 1:
                    # scale values from +-20 to +-127.5
                    mv = clip_and_scale(mv, 20)
                mv += 128
                mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8)
            elif self._representation == 'residual':
                mv += 128
                mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8)

        if self._representation == 'iframe':
            mv = color_aug(mv)
            # BGR to RGB. (PyTorch uses RGB according to doc.)
            mv = mv[..., ::-1]

        # Load the residual and shift it into the uint8 range.
        residual = load(video_path, gop_index, gop_pos, 2, self._accumulate)
        residual += 128
        residual = (np.minimum(np.maximum(residual, 0),
                               255)).astype(np.uint8)

        frames.append(np.concatenate((flow, mv, residual), axis=2))

    frames = self._transform(frames)
    frames = np.array(frames)
    frames = np.transpose(frames, (0, 3, 1, 2))

    # Split the stacked channels back into the three streams.
    # NOTE(review): the 2:4 slice assumes a 2-channel motion vector, i.e.
    # self._representation == 'mv' -- confirm for other representations.
    input_flow = frames[:, 0:2, :, :]
    input_mv = frames[:, 2:4, :, :]
    input_residual = frames[:, 4:, :, :]

    # Value comparison: ``is not 0`` is an identity test that misfires for
    # 0.0 / numpy zeros and triggers a SyntaxWarning on Python 3.8+.
    if self._flow_ds_factor != 0:
        # Downsample so the optical flow becomes blocky.
        factor = self._flow_ds_factor
        w_max = input_flow.shape[2]
        h_max = input_flow.shape[3]
        input_flow = block_reduce(input_flow,
                                  block_size=(1, 1, factor, factor),
                                  func=np.mean)
        # Resize back to the original size by repetition or interpolation.
        if self._upsample_interp is False:
            input_flow = input_flow.repeat(factor,
                                           axis=2).repeat(factor, axis=3)
        else:
            # interp1d is used because it interpolates along a chosen axis.
            w_max_ds = input_flow.shape[2]
            h_max_ds = input_flow.shape[3]
            f_out = interpolate.interp1d(np.linspace(0, 1, w_max_ds),
                                         input_flow,
                                         kind='linear',
                                         axis=2)
            input_flow = f_out(np.linspace(0, 1, w_max_ds * factor))
            f_out = interpolate.interp1d(np.linspace(0, 1, h_max_ds),
                                         input_flow,
                                         kind='linear',
                                         axis=3)
            input_flow = f_out(np.linspace(0, 1, h_max_ds * factor))
        # Crop whatever the up-sampling added beyond the original size.
        input_flow = input_flow[:, :, :w_max, :h_max]

    # Convert from numpy to torch and scale to [0, 1].
    input_flow = torch.from_numpy(input_flow).float() / 255.0
    input_mv = torch.from_numpy(input_mv).float() / 255.0
    input_residual = torch.from_numpy(input_residual).float() / 255.0

    if self._representation == 'iframe':
        input_mv = (input_mv - self._input_mean) / self._input_std
    elif self._representation == 'mv':
        input_mv = (input_mv - 0.5) / torch.mean(self._input_std)
    input_flow = (input_flow - 0.5) / torch.mean(self._input_std)
    input_residual = (input_residual - 0.5) / self._input_std

    if (self._viz == True) and (self._is_train == False):
        classname = flow_path.split('/')[-2]
        img_tmpl = 'img_{:05d}.jpg'
        # ``idx`` is the frame index of the last segment processed above;
        # ``idx_first`` is the frame index of the first segment.
        return (input_flow, input_mv, input_residual, label,
                os.path.join(flow_path, flow_tmpl.format('x', idx)),
                os.path.join(flow_path, flow_tmpl.format('y', idx)),
                os.path.join(flow_path, img_tmpl.format(idx_first)),
                classname)
    else:
        return input_flow, input_mv, input_residual, label
def __getitem__(self, index):
    """
    Build a 12-frame clip (I-frame followed by residuals) for one sample.

    Frames are read from pre-extracted PNG files when the first frame index
    of the sample is a multiple of 12, and decoded from the video with
    ``load`` otherwise. A residual PNG that is missing for the current GOP
    is taken from the previous GOP instead.

    Args:
        index (int): Index into ``self.data``; replaced by
            ``self.opt.video_index`` when that option is not -1.

    Returns:
        tuple: ``(clip, target)``, or ``(clip, target, video_name)`` when
        ``self.opt.video_level_accuracy`` is set. ``clip`` is the stacked
        frames with the first two axes swapped (``permute(1, 0, 2, 3)``).
    """
    if self.opt.video_index != -1:
        index = self.opt.video_index

    sample = self.data[index]
    first_frame = sample['frame_indices'][0]
    starts_on_gop = first_frame % 12 == 0

    def iframe_png(gop):
        # Path of the cached I-frame image of GOP ``gop``.
        return (os.path.join(sample['iframe_path'], 'iframe_') + str(gop) +
                '_' + str(0) + '.png')

    def residual_png(gop, pos):
        # Path of the cached residual image at position ``pos`` of ``gop``.
        return (os.path.join(sample['residual_path'], 'residuals_') +
                str(gop) + '_' + str(pos) + '.png')

    gop_index = int(first_frame / 12)
    frame_start = gop_index % 12
    # Fall back to the previous GOP when this one has no cached I-frame.
    if not os.path.exists(iframe_png(gop_index)):
        gop_index -= 1

    clip = []
    path = sample['video']

    if not self.opt.residual_only:
        if starts_on_gop:
            iframe = pil_loader(iframe_png(gop_index))
        else:
            iframe = load(path, gop_index, frame_start, 0, True)
            iframe = self.tensor_2_image(iframe)
        clip.append(iframe)

    for frame_index in range(1, 12):
        residual_path_img = residual_png(gop_index, frame_index)
        if os.path.exists(residual_path_img):
            if starts_on_gop:
                residual = pil_loader(residual_path_img)
            else:
                residual = load(path, gop_index, frame_index, 2, True)
                # Shift the signed residual into the uint8 range.
                residual += 128
                residual = (np.minimum(np.maximum(residual, 0),
                                       255)).astype(np.uint8)
                residual = self.tensor_2_image(residual)
            if self.opt.residual_only and frame_index == 1:
                # Duplicate the first residual to fill the I-frame slot.
                clip.append(residual)
            clip.append(residual)
        else:
            # Missing file: step back one GOP (this also affects all later
            # positions) and read the cached image from there.
            gop_index -= 1
            residual = pil_loader(residual_png(gop_index, frame_index))
            clip.append(residual)

    if self.spatial_transform is not None:
        self.spatial_transform.randomize_parameters()
        clip = [self.spatial_transform(img) for img in clip]

    # Pad short clips to 12 frames by repeating the last frame.
    if len(clip) < 12:
        clip.extend([clip[-1]] * (12 - len(clip)))

    clip = torch.stack(clip, 0).permute(1, 0, 2, 3)

    if self.opt.add:
        # Average every residual frame with the I-frame (frame 0). The
        # previous loop iterated over axis 0 of the permuted tensor while
        # indexing axis 1, so it only touched as many frames as axis 0 has
        # entries instead of all residual frames.
        clip[:, 1:] = (clip[:, 1:] + clip[:, :1]) / 2

    target = sample
    if self.target_transform is not None:
        target = self.target_transform(target)

    if self.opt.video_level_accuracy:
        return clip, target, sample['video'].split('/')[-1]
    else:
        return clip, target
def __getitem__(self, index):
    """Return the I-frame stream and motion-vector stream of one video.

    In training mode ``index`` is ignored and a video is drawn at random
    from ``self._video_list``; otherwise the entry at ``index`` is used.
    ``self._num_segments`` I-frames and ``self._num_segments * self.alpha``
    motion-vector frames are sampled.

    Args:
        index (int): Position in ``self._video_list`` (evaluation only).

    Returns:
        tuple: ``((iframes, mvs), label)`` where both arrays are float
        numpy arrays with the last input axis moved to the front
        (``transpose(3, 0, 1, 2)``).

    Raises:
        AssertionError: If the number of frames along axis 1 of either
            stream does not match the requested segment count.
    """
    if self._is_train:
        video_path, label, num_frames = random.choice(self._video_list)
        pick_frame = self._get_train_frame_index
    else:
        video_path, label, num_frames = self._video_list[index]
        pick_frame = self._get_test_frame_index

    num_iframes = self._num_segments
    num_mvs = self._num_segments * self.alpha

    # I-frame stream.
    iframes = []
    for seg in range(num_iframes):
        gop_index, gop_pos = pick_frame(num_frames, seg, num_iframes,
                                        'iframe')
        img = load(video_path, gop_index, gop_pos, IFRAME, self._accumulate)
        if img is None:
            print('Error: loading video %s failed.' % video_path)
            img = np.zeros((224, 224, 3))
        # BGR to RGB. (PyTorch uses RGB according to doc.)
        img = img[..., ::-1]
        iframes.append(img)

    # Motion-vector stream, sampled ``alpha`` times more densely.
    mvs = []
    for seg in range(num_mvs):
        gop_index, gop_pos = pick_frame(num_frames, seg, num_mvs, 'mv')
        mv = load(video_path, gop_index, gop_pos, MV, self._accumulate)
        if mv is None:
            print('Error: loading video %s failed.' % video_path)
            mv = np.zeros((224, 224, 2))
        mv = clip_and_scale(mv, 20)  # scale up the value
        mv += 128
        mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8)
        mvs.append(mv)

    # Pre-process the I-frames.
    iframes = self._iframe_transform(
        iframes) if self._is_train else self._infer_transform(iframes)
    iframes = np.asarray(iframes, dtype=np.float32) / 255.0
    iframes = np.transpose(iframes, (3, 0, 1, 2))
    iframes = (iframes - self._input_mean) / self._input_std

    # Pre-process the motion vectors.
    mvs = self._mv_transform(
        mvs) if self._is_train else self._infer_transform(mvs)
    mvs = np.asarray(mvs, dtype=np.float32) / 255.0
    mvs = np.transpose(mvs, (3, 0, 1, 2))
    mvs = (mvs - 0.5)

    # Sanity-check the frame counts. The message is passed as a string:
    # ``assert cond, print(...)`` attached ``None`` to the AssertionError.
    assert iframes.shape[1] == num_iframes, "iframe shape wrong"
    assert mvs.shape[1] == num_mvs, "timesacle shape wrong"
    return (iframes, mvs), label
num_frames = video[4] # 195 print(video_name) pickle_folder = os.path.join(pickle_root, folder, video_name) iframe_folder = os.path.join(pickle_folder, "iframe") mv_folder = os.path.join(pickle_folder, "mv") if os.path.exists(iframe_folder) is False: os.makedirs(iframe_folder) if os.path.exists(mv_folder) is False: os.makedirs(mv_folder) for i in range(num_frames): gop_index, gop_pos = get_gop_pos(i, 'iframe') iframe_img = load(video_path, gop_index, gop_pos, 0, True) # I-frame iframe_path = os.path.join(iframe_folder, str(gop_index) + "_" + str(gop_pos) + ".p") with open(iframe_path, 'wb') as iframe_pickle: pickle.dump(iframe_img, iframe_pickle) if i > 0: gop_index, gop_pos = get_gop_pos(i, 'mv') mv_img = load(video_path, gop_index, gop_pos, 1, True) # mv mv_path = os.path.join(mv_folder, str(gop_index) + "_" + str(gop_pos) + ".p") with open(mv_path, 'wb') as mv_pickle: pickle.dump(mv_img, mv_pickle) print("Done!")
def show_boxes_in_compressed_video(video_path,
                                   update_ms=10,
                                   min_confidence=0.0,
                                   box_file_path=None,
                                   min_frame_idx=None,
                                   max_frame_idx=None,
                                   frame_interval=1,
                                   frame_type=0,
                                   accumulate=False):
    """
    Play a compressed video frame by frame and draw boxes on top of it.

    :param video_path: string, the path of the compressed video.
    :param update_ms: scalar, 1000 / update_ms is the fps, default 10
    :param min_confidence: float, the confidence threshold of detection,
        boxes with smaller confidence will not be displayed. Default 0.0
    :param box_file_path: string, the path of the boxes. The format of this
        file should be the same as MOTChallenge det.txt or gt.txt. ``None``
        (or an empty file) shows the frames without boxes.
    :param min_frame_idx: integer, the first frame to display (1-based),
        default the first frame of this sequence
    :param max_frame_idx: integer, the last frame to display, default the
        last frame of this sequence
    :param frame_interval: the interval to show frames
    :param frame_type: int, can be 0, 1 or 2 (denotes I frame, motion
        vector, residual, respectively)
    :param accumulate: used for the motion vector and residual. If it is
        true, the motion vector and residual are accumulated.
    :return: None
    :raises RuntimeError: if the first frame index ends up larger than the
        last frame index.
    """

    def frame_callback(vis, frame_idx):
        # Number of frames in one group; 12 for the raw mpeg4 video.
        GROUP_SIZE = 12
        # GOPs are numbered from 0 while frame_idx starts from 1.
        gop_idx = int((frame_idx - 1) / GROUP_SIZE)
        in_group_idx = int((frame_idx - 1) % GROUP_SIZE)

        image = coviar.load(video_path, gop_idx, in_group_idx, frame_type,
                            accumulate)
        image = compressed_frame_to_show(image, frame_type, tool_type='cv2')
        vis.set_image(image.copy(), frame_idx)

        boxes = seq_info['boxes']
        if boxes is None:
            return
        # Column 0 is the frame number, column 6 the confidence.
        boxes = boxes[boxes[:, 0] == frame_idx]
        boxes = boxes[boxes[:, 6] >= min_confidence]
        # Columns 1..6; under the MOTChallenge layout these are
        # [target_id, x, y, w, h, confidence].
        vis.draw_box(list(boxes[:, 1:7]))

    total_frames = coviar.get_num_frames(video_path) + 1

    # Resolve the first and last frame index, falling back to the full
    # sequence for missing or out-of-range values.
    if min_frame_idx is None:
        min_frame_idx = 1
    if min_frame_idx < 0 or min_frame_idx > total_frames:
        min_frame_idx = 1
    if max_frame_idx is None:
        max_frame_idx = total_frames
    if max_frame_idx < 0 or max_frame_idx > total_frames:
        max_frame_idx = total_frames
    if min_frame_idx > max_frame_idx:
        raise RuntimeError(
            'The first frame index %s is larger than the last frame index %s'
            % (min_frame_idx, max_frame_idx))

    # Gather the sequence information; the first I-frame gives the size.
    im = coviar.load(video_path, 0, 0, 0, False)
    im_size = im.shape

    raw_boxes = None
    if box_file_path is not None:
        # ndmin=2 keeps a single-row file two-dimensional so the column
        # indexing in the callback works.
        raw_boxes = np.loadtxt(box_file_path,
                               dtype=float,
                               delimiter=',',
                               ndmin=2)
        if raw_boxes.size == 0:
            # An empty file has no columns to index; treat it as no boxes.
            raw_boxes = None

    seq_info = {
        'image_size': [im_size[0], im_size[1]],
        'min_frame_idx': min_frame_idx,
        'max_frame_idx': max_frame_idx,
        'frame_interval': frame_interval,
        'boxes': raw_boxes,
        'sequence_name': ''
    }

    visualizer = Visualization(seq_info, update_ms)
    visualizer.run(frame_callback)