def _loadvid_test_vanilla(filename, width, height): """Tests the usual loadvid call, with a default FPS cap. The input file, an encoded video corresponding to `filename`, is repeatedly decoded (with a random seek). The first and last of the returned frames are plotted using `matplotlib.pyplot`. """ with open(filename, 'rb') as f: encoded_video = f.read() num_frames = 32 for _ in range(10): start = time.perf_counter() decoded_frames, _ = lintel.loadvid(encoded_video, should_random_seek=True, width=width, height=height, num_frames=num_frames) decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8) decoded_frames = np.reshape(decoded_frames, newshape=(num_frames, height, width, 3)) end = time.perf_counter() print('time: {}'.format(end - start)) plt.imshow(decoded_frames[0, ...]) plt.show() plt.imshow(decoded_frames[-1, ...]) plt.show()
def __read_video_with_lintel(self, sample_name, indices=None):
    file = self.rgb_directory + '/' + sample_name + '_rgb.avi'
    fin = open(file, 'rb')
    video = fin.read()

    Dataset = namedtuple('Dataset', 'width height num_frames')
    dataset = Dataset(1920, 1080, None)

    if indices:
        video = lintel.loadvid_frame_nums(video,
                                          frame_nums=indices,
                                          width=dataset.width,
                                          height=dataset.height)
    else:
        video, seek_distance = lintel.loadvid(video,
                                              should_random_seek=True,
                                              width=dataset.width,
                                              height=dataset.height)

    video = np.frombuffer(video, dtype=np.uint8)
    video = np.reshape(video,
                       newshape=(-1, dataset.height, dataset.width, 3))
    fin.close()

    result = []
    if self.image_transforms:
        for i in range(len(video)):
            result.append(self.image_transforms(video[i]))
    return torch.stack(result)
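# A small, self-contained sketch (not from the original source) of the
# loadvid_frame_nums call used above, wrapped in a hypothetical helper named
# `decode_frame_indices`; the 1920x1080 default size mirrors the snippet
# above and is an assumption.
import numpy as np
import lintel


def decode_frame_indices(encoded_video, indices, width=1920, height=1080):
    # Decode only the requested frame numbers (lintel expects them in
    # non-decreasing order) and reshape to (len(indices), H, W, 3).
    frames = lintel.loadvid_frame_nums(encoded_video,
                                       frame_nums=indices,
                                       width=width,
                                       height=height)
    frames = np.frombuffer(frames, dtype=np.uint8)
    return np.reshape(frames, newshape=(len(indices), height, width, 3))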
def __getitem__(self, index):
    vid, cls = self.data[index]
    with open(vid, 'rb') as f:
        enc_vid = f.read()

    for i in range(10):
        df, w, h, temp = lintel.loadvid(enc_vid,
                                        should_random_seek=self.random,
                                        width=0,
                                        height=0,
                                        num_frames=self.length * 2)
        df = np.frombuffer(df, dtype=np.uint8)
        xp = np.mean(df)
        if xp > 1:
            break
    #print('[lintel]', xp)

    w = w // 2
    h = h // 2

    if not self.random:
        # center crop
        i = int(round((h - self.size) / 2.))
        j = int(round((w - self.size) / 2.))
        df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:-i, j:-j, :]
    else:
        # random crop
        th = self.size
        tw = self.size
        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:i + th, j:j + tw, :]

    if self.mode == 'flow':
        #print(df[:,:,:,1:].mean())
        #exit()
        # only take the 2 channels corresponding to flow (x, y)
        df = df[:, :, :, 1:]
        if self.model == '2d':
            # this should be redone...
            # stack 10 along channel axis
            df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
            df = df.transpose(0, 1, 4, 2, 3).reshape(3, 20, self.size, self.size).transpose(0, 2, 3, 1)

    df = 1 - 2 * (df.astype(np.float32) / 255)
    #print('[lintel test]', df.shape)

    if self.model == '2d':
        # 2d -> return TxCxHxW
        df = df.transpose([0, 3, 1, 2])
        #print('[lintel test]', df.shape)
        return df, cls

    # 3d -> return CxTxHxW
    return df.transpose([3, 0, 1, 2]), cls
def _loadvid_test_vanilla(filename, width, height): """Tests the usual loadvid call. The input file, is repeatedly decoded (with a random seek). The first and last of the returned frames are plotted using `matplotlib.pyplot`. """ num_frames = 32 for _ in range(10): start = time.perf_counter() result = lintel.loadvid(filename, should_random_seek=True, width=width, height=height, num_frames=num_frames) # NOTE(brendan): dynamic size returns (frames, width, height, # seek_distance). if (width == 0) and (height == 0): decoded_frames, width, height, _ = result else: decoded_frames, _ = result decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8) decoded_frames = np.reshape(decoded_frames, newshape=(num_frames, height, width, 3)) end = time.perf_counter() print('time: {}'.format(end - start)) plt.imshow(decoded_frames[0, ...]) plt.show() plt.imshow(decoded_frames[-1, ...]) plt.show()
def __getitem__(self, index):
    vid = self.vids[index]
    cls = self.data[vid]

    if not os.path.exists(os.path.join(self.root, vid)):
        if self.mode == 'flow' and self.model == '2d':
            return np.zeros((3, 20, self.size, self.size), dtype=np.float32), 0
        elif self.mode == 'flow' and self.model == '3d':
            return np.zeros((2, self.length, self.size, self.size), dtype=np.float32), 0

    with open(os.path.join(self.root, vid), 'rb') as f:
        enc_vid = f.read()

    df, w, h, _ = lintel.loadvid(enc_vid,
                                 should_random_seek=self.random,
                                 num_frames=self.length * 2)
    df = np.frombuffer(df, dtype=np.uint8)

    if w < 128 or h < 128 or h > 512 or w > 512:
        df = np.zeros((self.length * 2, 128, 128, 3), dtype=np.uint8)
        w = h = 128
        cls = 0

    if not self.random:
        # center crop
        i = int(round((h - self.size) / 2.))
        j = int(round((w - self.size) / 2.))
        df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, i:-i, j:-j, :]
    else:
        # random crop
        th = self.size
        tw = self.size
        #print(h, th, h-th)
        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, i:i + th, j:j + tw, :]

    # random horizontal flip
    if random.random() < 0.5:
        df = np.flip(df, axis=2).copy()

    if self.mode == 'flow':
        #print(df[:,:,:,1:].mean())
        #exit()
        # only take the 2 channels corresponding to flow (x, y)
        df = df[:, :, :, 1:]
        if self.model == '2d':
            # this should be redone...
            # stack 10 along channel axis
            df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
            df = df.transpose(0, 1, 4, 2, 3).reshape(3, 20, self.size, self.size).transpose(0, 2, 3, 1)

    df = 1 - 2 * (df.astype(np.float32) / 255)

    if self.model == '2d':
        # 2d -> return TxCxHxW
        return df.transpose([0, 3, 1, 2]), cls
    # 3d -> return CxTxHxW
    return df.transpose([3, 0, 1, 2]), cls
def __getitem__(self, index):
    vid, cls = self.data[index]
    with open(vid, 'rb') as f:
        enc_vid = f.read()

    # randomly sample 32 frames: lintel decodes the video directly into frames
    df, w, h, _ = lintel.loadvid(enc_vid,
                                 should_random_seek=self.random,
                                 num_frames=self.length * 2)
    df = np.frombuffer(df, dtype=np.uint8)

    # halve the original frame height and width
    w = w // 2
    h = h // 2

    if not self.random:
        # center crop
        i = int(round((h - self.size) / 2.))
        j = int(round((w - self.size) / 2.))
        df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:-i, j:-j, :]
    else:
        # random crop
        th = self.size
        tw = self.size
        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:i + th, j:j + tw, :]

    if self.mode == 'flow':
        #print(df[:,:,:,1:].mean())
        #exit()
        # if mode == 'flow', only take the 2 channels corresponding to flow (x, y)
        df = df[:, :, :, 1:]
        if self.model == '2d':
            # this should be redone...
            # stack 10 along channel axis
            df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
            df = df.transpose(0, 1, 4, 2, 3).reshape(
                3, 20, self.size, self.size).transpose(0, 2, 3, 1)  # (3, H, W, 20)

    # normalize the pixel values
    df = 1 - 2 * (df.astype(np.float32) / 255)

    if self.model == '2d':
        # 2d -> return TxCxHxW (reorder the axes to match PyTorch's layout)
        return df.transpose([0, 3, 1, 2]), cls
    # 3d -> return CxTxHxW
    return df.transpose([3, 0, 1, 2]), cls
def _sample_frame_sequence_to_4darray(video, dataset, should_random_seek, fps_cap):
    """Extracts a frame sequence `dataset.num_frames` long, sampled uniformly
    from inside `video`, to a 4D numpy array.

    Args:
        video: Encoded video.
        dataset: Dataset meta-info, e.g., width and height.
        should_random_seek: If set to `True`, then `lintel.loadvid` will start
            decoding from a uniformly random seek point in the video (with
            enough space to decode the requested number of frames). The seek
            distance will be returned, so that if the label of the data
            depends on the timestamp, then the label can be dynamically set.
        fps_cap: The _maximum_ framerate that will be captured from the video.
            Excess frames will be dropped, i.e., if `fps_cap` is 30 for a
            video with a 60 fps framerate, every other frame will be dropped.

    Returns:
        A tuple (frames, seek_distance) where `frames` is a 4-D numpy array
        loaded from the byte array returned by `lintel.loadvid`, and
        `seek_distance` is the number of seconds into `video` that decoding
        started from.

    Note that the random seeking can be turned off.

    Use `_sample_frame_sequence_to_4darray` in your PyTorch Dataset object,
    which subclasses `torch.utils.data.Dataset`, by calling it in
    `__getitem__`. This means that for every minibatch, for each example, a
    random keyframe in the video is seeked to and `num_frames` frames are
    decoded from there. `num_frames` would normally tend to be small (if you
    were going to use the frames as input to a 3D ConvNet or an optical flow
    algorithm), e.g., 32 frames.
    """
    video, seek_distance = lintel.loadvid(
        video,
        should_random_seek=should_random_seek,
        width=dataset.width,
        height=dataset.height,
        num_frames=dataset.num_frames,
        fps_cap=fps_cap)
    video = np.frombuffer(video, dtype=np.uint8)
    video = np.reshape(video,
                       newshape=(dataset.num_frames,
                                 dataset.height,
                                 dataset.width,
                                 3))

    return video, seek_distance
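# A minimal sketch (not from the original source) of the usage pattern the
# docstring above describes: a torch.utils.data.Dataset whose __getitem__
# calls _sample_frame_sequence_to_4darray, so that for every minibatch each
# example is decoded starting from a random keyframe. The class name
# `VideoDataset`, the `VideoMeta` namedtuple, the 1920x1080x32 clip shape,
# and fps_cap=30 are illustrative assumptions, not part of the original code.
import collections

import numpy as np
import torch
import torch.utils.data

VideoMeta = collections.namedtuple('VideoMeta', 'width height num_frames')


class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, filenames, labels):
        # `filenames` are paths to encoded videos; `labels` are their targets.
        self.filenames = filenames
        self.labels = labels
        self.meta = VideoMeta(width=1920, height=1080, num_frames=32)

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        with open(self.filenames[index], 'rb') as f:
            encoded_video = f.read()

        frames, _ = _sample_frame_sequence_to_4darray(encoded_video,
                                                      self.meta,
                                                      should_random_seek=True,
                                                      fps_cap=30)
        # (num_frames, H, W, 3) uint8 -> (3, num_frames, H, W) float in [0, 1].
        frames = torch.from_numpy(frames.copy())
        frames = frames.permute(3, 0, 1, 2).float().div_(255)
        return frames, self.labels[index]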
def __getitem__(self, index):
    index = index % self.total_frames
    # we want bisect_right here so that the first frame in a file gets the
    # file, not the previous file
    file_index = bisect.bisect_right(self.start_index, index)
    frame = index - self.start_index[file_index - 1] if file_index > 0 else index

    filename = self.files[file_index]
    video = self.videos[file_index]

    frames, seek_distance = lintel.loadvid(
        video,
        should_random_seek=True,
        width=self.dataset.width,
        height=self.dataset.height,
        num_frames=self.dataset.num_frames,
        fps_cap=60)
    frames = np.frombuffer(frames, dtype=np.uint8)
    frames = np.reshape(frames,
                        newshape=(self.dataset.num_frames,
                                  self.dataset.height,
                                  self.dataset.width,
                                  3))

    for i in range(self.frames):
        # TODO(jbarker): Tidy this up and remove redundant computation
        if i == 0 and self.is_cropped:
            crop_x = random.randint(0, self.image_shape[1] - self.frame_size[1])
            crop_y = random.randint(0, self.image_shape[0] - self.frame_size[0])
        elif self.is_cropped == False:
            crop_x = math.floor((self.image_shape[1] - self.frame_size[1]) / 2)
            crop_y = math.floor((self.image_shape[0] - self.frame_size[0]) / 2)
            self.crop_size = self.frame_size

        image = frames[i,
                       crop_y:crop_y + self.crop_size[0],
                       crop_x:crop_x + self.crop_size[1],
                       :]
        self.frame_buffer[:, i, :, :] = np.rollaxis(image, 2, 0)

    return torch.from_numpy(self.frame_buffer)
def __getitem__(self, index):
    vid = self.vids[index]  # get video at correct index
    classification = self.data[vid]  # get label for that vid

    # if the vid cannot be found (e.g., because it's flow?)
    if not os.path.exists(os.path.join(self.root, vid)):
        if self.mode == 'flow' and self.model == '2d':
            # return zeros if not found
            return np.zeros((3, 20, self.size, self.size), dtype=np.float32), 0
        elif self.mode == 'flow' and self.model == '3d':
            # return zeros if not found
            return np.zeros((2, self.length, self.size, self.size), dtype=np.float32), 0

    # open the path; the video must exist now
    with open(os.path.join(self.root, vid), 'rb') as f:
        enc_vid = f.read()  # read the file's raw binary data

    # load the video with lintel, obtaining the decoded frames plus the
    # width and height of the video
    # df, w, h, _ = lintel.loadvid(enc_vid, should_random_seek=self.random, num_frames=self.length*2)
    df, w, h, _ = lintel.loadvid(enc_vid, should_random_seek=self.random)

    # interpret the buffer as a 1-D array of unsigned 8-bit integers
    df = np.frombuffer(df, dtype=np.uint8)

    # if w < 128 or h < 128 or h > 512 or w > 512:  # if video too big or too small
    #     # replace df with a 128x128 blank clip
    #     df = np.zeros(
    #         (self.length*2,  # number of frames
    #          128,            # height
    #          128,            # width
    #          3),             # colour channels
    #         dtype=np.uint8
    #     )
    #     w = h = 128
    #     classification = 0  # set classification to 0 if the video is replaced

    # cropping: random crops differ each time the video is loaded
    print(df.shape[0] / (h * w * 3))
    print("l", self.length * 2)
    if not self.random:
        # center crop
        i = int(round((h - self.size) / 2.))
        j = int(round((w - self.size) / 2.))
        df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, i:-i, j:-j, :]
    else:
        # random crop
        th = self.size
        tw = self.size
        #print(h, th, h-th)
        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, i:i + th, j:j + tw, :]

    # randomly flip
    if random.random() < 0.5:
        df = np.flip(df, axis=2).copy()

    if self.mode == 'flow':
        #print(df[:,:,:,1:].mean())
        #exit()
        # only take the 2 channels corresponding to flow (x, y): t+1, ..., t
        df = df[:, :, :, 1:]
        if self.model == '2d':
            # this should be redone...
            # stack 10 along channel axis
            df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
            df = df.transpose(0, 1, 4, 2, 3).reshape(3, 20, self.size, self.size).transpose(0, 2, 3, 1)

    # normalise data
    df = 1 - 2 * (df.astype(np.float32) / 255)

    if self.model == '2d':
        # 2d -> return TxCxHxW
        return df.transpose([0, 3, 1, 2]), classification
    # 3d -> return CxTxHxW
    return df.transpose([3, 0, 1, 2]), classification
def __getitem__(self, index):
    vid, cls = self.data[index]
    #print(vid)
    with open(vid, 'rb') as f:
        enc_vid = f.read()

    # test = r"/home/qinz/representation-flow-cvpr19/HMDB/pour/Spectacular_tea_pouring_pour_u_cm_np1_ri_med_0.avi"
    # with open(test, 'rb') as f2:
    #     enc_vid = f2.read()

    df, w, h, _ = lintel.loadvid(enc_vid,
                                 should_random_seek=self.random,
                                 num_frames=self.length * 2)
    # print("w, h: ", vid, w, h)
    df = np.frombuffer(df, dtype=np.uint8)
    # print("df_pre: ", df.shape)

    # only halve the spatial resolution if the frame is large enough
    reduce_flag = False
    if w >= 2 * self.size and h >= 2 * self.size:
        w = w // 2
        h = h // 2
        reduce_flag = True

    if not self.random:
        # center crop
        # print(w, h)
        i = int(round((h - self.size) / 2.))
        j = int(round((w - self.size) / 2.))
        # print("i,j", i, j)
        if reduce_flag:
            df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:-i, j:-j, :]
        else:
            df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, :, :, :][:, i:-i, j:-j, :]
    else:
        # random crop
        th = self.size
        tw = self.size
        # print(vid, h - th)
        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        if reduce_flag:
            df = np.reshape(df, newshape=(self.length * 2, h * 2, w * 2, 3))[::2, ::2, ::2, :][:, i:i + th, j:j + tw, :]
        else:
            df = np.reshape(df, newshape=(self.length * 2, h, w, 3))[::2, :, :, :][:, i:i + th, j:j + tw, :]
    # print("df_size: ", df.shape)

    if self.mode == 'flow':
        #print(df[:,:,:,1:].mean())
        #exit()
        # only take the 2 channels corresponding to flow (x, y)
        df = df[:, :, :, 1:]
        if self.model == '2d':
            # this should be redone...
            # stack 10 along channel axis
            df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
            df = df.transpose(0, 1, 4, 2, 3).reshape(3, 20, self.size, self.size).transpose(0, 2, 3, 1)

    df = 1 - 2 * (df.astype(np.float32) / 255)

    if self.model == '2d':
        # 2d -> return TxCxHxW
        return df.transpose([0, 3, 1, 2]), cls
    # 3d -> return CxTxHxW
    return df.transpose([3, 0, 1, 2]), cls