Exemplo n.º 1
0
def _loadvid_test_vanilla(filename, width, height):
    """Exercise the standard `lintel.loadvid` path (default FPS cap).

    The encoded video stored at `filename` is read into memory once and then
    decoded ten times, each time starting from a random seek point. Every
    pass prints how long decoding plus array conversion took, and shows the
    first and the last decoded frame with `matplotlib.pyplot`.

    Args:
        filename: Path of the encoded video file to read.
        width: Frame width passed to `lintel.loadvid` and used to reshape the
            decoded buffer.
        height: Frame height passed to `lintel.loadvid` and used to reshape
            the decoded buffer.
    """
    num_frames = 32

    with open(filename, 'rb') as video_file:
        encoded_video = video_file.read()

    for _ in range(10):
        tic = time.perf_counter()
        raw_bytes, _ = lintel.loadvid(encoded_video,
                                      should_random_seek=True,
                                      width=width,
                                      height=height,
                                      num_frames=num_frames)
        # Interpret the returned byte buffer as (frames, rows, cols, RGB).
        clip = np.frombuffer(raw_bytes, dtype=np.uint8).reshape(
            (num_frames, height, width, 3))
        toc = time.perf_counter()

        print('time: {}'.format(toc - tic))
        # Display the first frame, then the last one.
        for frame_index in (0, -1):
            plt.imshow(clip[frame_index, ...])
            plt.show()
Exemplo n.º 2
0
 def __read_video_with_lintel(self, sample_name, indices=None):
     """Decode the RGB video of a sample with lintel and transform its frames.

     Reads ``self.rgb_directory + '/' + sample_name + '_rgb.avi'`` and decodes
     it at a fixed 1920x1080 resolution.

     Args:
         sample_name: Name of the sample; used to build the video file path.
         indices: Optional sequence of frame numbers. When it is non-empty,
             exactly those frames are decoded with
             `lintel.loadvid_frame_nums`. Otherwise `lintel.loadvid` is
             called with a random seek and its default frame count.

     Returns:
         A tensor stacking `self.image_transforms(frame)` for every decoded
         frame. When `self.image_transforms` is not set, the raw frames are
         returned instead as a uint8 tensor of shape (N, 1080, 1920, 3).
     """
     width, height = 1920, 1080
     path = self.rgb_directory + '/' + sample_name + '_rgb.avi'
     # The context manager closes the file even if reading raises.
     with open(path, 'rb') as fin:
         encoded = fin.read()

     # Use len() instead of bare truthiness so that numpy arrays of frame
     # numbers are accepted too (their truth value is ambiguous).
     if indices is not None and len(indices) > 0:
         raw = lintel.loadvid_frame_nums(encoded,
                                         frame_nums=indices,
                                         width=width,
                                         height=height)
     else:
         # The seek distance returned alongside the frames is not needed.
         raw, _ = lintel.loadvid(encoded,
                                 should_random_seek=True,
                                 width=width,
                                 height=height)

     # -1 lets numpy infer the number of decoded frames.
     frames = np.frombuffer(raw, dtype=np.uint8).reshape(
         -1, height, width, 3)

     if self.image_transforms:
         return torch.stack(
             [self.image_transforms(frame) for frame in frames])

     # Without transforms there is nothing to stack (torch.stack([]) raises),
     # so hand back the decoded frames. The copy makes the read-only
     # frombuffer view writable, as torch.from_numpy expects.
     return torch.from_numpy(frames.copy())
Exemplo n.º 3
0
    def __getitem__(self, index):
        """Return the preprocessed clip and label stored at `index`.

        The video is decoded with lintel at its own resolution
        (`width=0, height=0`), `2 * self.length` frames long. Decoding is
        retried up to ten times until the mean byte value of the clip
        exceeds 1. Every second frame, row and column is kept, a
        `self.size` x `self.size` window is cropped (centred when
        `self.random` is false, uniformly random otherwise) and the values
        are mapped linearly from [0, 255] to [1, -1].

        Args:
            index: Position of the `(path, label)` pair in `self.data`.

        Returns:
            Tuple `(clip, cls)`. `clip` is a float32 array laid out as
            TxCxHxW when `self.model == '2d'` and as CxTxHxW otherwise. In
            'flow' mode only the last two channels are kept; for the 2d
            model three overlapping 10-frame windows are additionally
            stacked into 20 channels (needs at least 14 sampled frames).
        """
        vid, cls = self.data[index]
        with open(vid, 'rb') as f:
            enc_vid = f.read()

        num_frames = self.length * 2
        for _ in range(10):
            df, w, h, _ = lintel.loadvid(enc_vid,
                                         should_random_seek=self.random,
                                         width=0,
                                         height=0,
                                         num_frames=num_frames)
            df = np.frombuffer(df, dtype=np.uint8)
            # A mean of at most 1 means the clip is essentially all zeros;
            # decode again in that case.
            if np.mean(df) > 1:
                break

        # Reshape with the true decoded size (the old `(h // 2) * 2` failed
        # for odd dimensions), then keep every second frame, row and column.
        df = df.reshape(num_frames, h, w, 3)[::2, ::2, ::2, :]
        h, w = df.shape[1:3]

        th = tw = self.size
        if not self.random:
            # center crop
            i = int(round((h - th) / 2.))
            j = int(round((w - tw) / 2.))
        else:
            i = random.randint(0, h - th) if h != th else 0
            j = random.randint(0, w - tw) if w != tw else 0
        # Slice by explicit length: `i:-i` is empty when i == 0 and has the
        # wrong size when the margin is odd.
        df = df[:, i:i + th, j:j + tw, :]

        if self.mode == 'flow':
            # only take the 2 channels corresponding to flow (x,y)
            df = df[:, :, :, 1:]
            if self.model == '2d':
                # this should be redone...
                # stack 10 along channel axis
                df = np.asarray([df[:10], df[2:12],
                                 df[4:14]])  # gives 3x10xHxWx2
                df = df.transpose(0, 1, 4, 2,
                                  3).reshape(3, 20, self.size,
                                             self.size).transpose(0, 2, 3, 1)

        # Map [0, 255] to [1, -1].
        df = 1 - 2 * (df.astype(np.float32) / 255)
        if self.model == '2d':
            # 2d -> return TxCxHxW
            return df.transpose([0, 3, 1, 2]), cls
        # 3d -> return CxTxHxW
        return df.transpose([3, 0, 1, 2]), cls
Exemplo n.º 4
0
def _loadvid_test_vanilla(filename, width, height):
    """Tests the usual loadvid call.

    The input `filename` is repeatedly decoded (with a random seek). The
    first and last of the returned frames are plotted using
    `matplotlib.pyplot`, and the time taken by each decode is printed.

    Args:
        filename: Video input handed to `lintel.loadvid` unchanged.
        width: Requested frame width. Together with `height == 0`, a value
            of 0 requests dynamic sizing, in which case lintel reports the
            decoded size itself.
        height: Requested frame height (see `width`).
    """
    num_frames = 32
    # Decide once from the *requested* size. The parameters themselves are
    # never overwritten, so every iteration exercises the same code path.
    dynamic_size = (width == 0) and (height == 0)

    for _ in range(10):
        start = time.perf_counter()
        result = lintel.loadvid(filename,
                                should_random_seek=True,
                                width=width,
                                height=height,
                                num_frames=num_frames)

        # NOTE(brendan): dynamic size returns (frames, width, height,
        # seek_distance).
        if dynamic_size:
            decoded_frames, frame_width, frame_height, _ = result
        else:
            decoded_frames, _ = result
            frame_width, frame_height = width, height

        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        decoded_frames = decoded_frames.reshape(
            (num_frames, frame_height, frame_width, 3))
        end = time.perf_counter()

        print('time: {}'.format(end - start))
        plt.imshow(decoded_frames[0, ...])
        plt.show()
        plt.imshow(decoded_frames[-1, ...])
        plt.show()
Exemplo n.º 5
0
    def __getitem__(self, index):
        """Return the preprocessed clip and label of the video at `index`.

        The file `self.root/<vid>` is decoded with lintel, `2 * self.length`
        frames long, and every second frame is kept. A `self.size` x
        `self.size` window is cropped (centred when `self.random` is false;
        uniformly random plus a 50% horizontal flip otherwise) and values are
        mapped linearly from [0, 255] to [1, -1].

        Fallbacks:
            * Missing file in 'flow' mode: an all-zero float32 array of the
              output shape is returned with label 0. In other modes the
              subsequent `open` raises.
            * Decoded width or height outside [128, 512]: the clip is
              replaced by zeros of size 128x128 and the label by 0.

        Args:
            index: Position of the video name in `self.vids`.

        Returns:
            Tuple `(clip, cls)`. `clip` is float32, laid out as TxCxHxW when
            `self.model == '2d'` and as CxTxHxW otherwise. In 'flow' mode
            only the last two channels are kept; for the 2d model three
            overlapping 10-frame windows are stacked into 20 channels.
        """
        vid = self.vids[index]
        cls = self.data[vid]
        path = os.path.join(self.root, vid)
        if not os.path.exists(path):
            if self.mode == 'flow' and self.model == '2d':
                return np.zeros((3, 20, self.size, self.size), dtype=np.float32), 0
            elif self.mode == 'flow' and self.model == '3d':
                return np.zeros((2, self.length, self.size, self.size), dtype=np.float32), 0

        with open(path, 'rb') as f:
            enc_vid = f.read()

        num_frames = self.length * 2
        df, w, h, _ = lintel.loadvid(enc_vid, should_random_seek=self.random, num_frames=num_frames)
        df = np.frombuffer(df, dtype=np.uint8)

        if w < 128 or h < 128 or h > 512 or w > 512:
            # Unsupported frame size: substitute a blank clip with label 0.
            df = np.zeros((num_frames, 128, 128, 3), dtype=np.uint8)
            w = h = 128
            cls = 0

        # Keep every second frame.
        df = df.reshape(num_frames, h, w, 3)[::2]

        th = tw = self.size
        if not self.random:
            # center crop
            i = int(round((h - th) / 2.))
            j = int(round((w - tw) / 2.))
            # Slice by explicit length: `i:-i` is empty when i == 0 and has
            # the wrong size when the margin is odd.
            df = df[:, i:i + th, j:j + tw, :]
        else:
            i = random.randint(0, h - th) if h != th else 0
            j = random.randint(0, w - tw) if w != tw else 0
            df = df[:, i:i + th, j:j + tw, :]

            # Random horizontal flip (axis 2 is the width axis).
            if random.random() < 0.5:
                df = np.flip(df, axis=2).copy()

        if self.mode == 'flow':
            # only take the 2 channels corresponding to flow (x,y)
            df = df[:, :, :, 1:]
            if self.model == '2d':
                # this should be redone...
                # stack 10 along channel axis
                df = np.asarray([df[:10], df[2:12], df[4:14]])  # gives 3x10xHxWx2
                df = df.transpose(0, 1, 4, 2, 3).reshape(3, 20, self.size, self.size).transpose(0, 2, 3, 1)

        # Map [0, 255] to [1, -1].
        df = 1 - 2 * (df.astype(np.float32) / 255)

        if self.model == '2d':
            # 2d -> return TxCxHxW
            return df.transpose([0, 3, 1, 2]), cls
        # 3d -> return CxTxHxW
        return df.transpose([3, 0, 1, 2]), cls
Exemplo n.º 6
0
    def __getitem__(self, index):
        """Return the preprocessed clip and label stored at `index`.

        The video is decoded with lintel, `2 * self.length` frames long.
        Every second frame, row and column is kept, a `self.size` x
        `self.size` window is cropped (centred when `self.random` is false,
        uniformly random otherwise) and the values are mapped linearly from
        [0, 255] to [1, -1].

        Args:
            index: Position of the `(path, label)` pair in `self.data`.

        Returns:
            Tuple `(clip, cls)`. `clip` is a float32 array laid out as
            TxCxHxW when `self.model == '2d'` and as CxTxHxW otherwise. In
            'flow' mode only the last two channels are kept; for the 2d
            model three overlapping 10-frame windows are additionally
            stacked into 20 channels (needs at least 14 sampled frames).
        """
        vid, cls = self.data[index]

        with open(vid, 'rb') as f:
            enc_vid = f.read()

        # Decode 2 * self.length frames straight from the encoded bytes
        # with lintel (random start point when self.random is set).
        num_frames = self.length * 2
        df, w, h, _ = lintel.loadvid(enc_vid,
                                     should_random_seek=self.random,
                                     num_frames=num_frames)
        df = np.frombuffer(df, dtype=np.uint8)

        # Halve the temporal and spatial resolution. Reshape with the true
        # decoded size (the old `(h // 2) * 2` failed for odd dimensions).
        df = df.reshape(num_frames, h, w, 3)[::2, ::2, ::2, :]
        h, w = df.shape[1:3]

        th = tw = self.size
        if not self.random:
            # center crop
            i = int(round((h - th) / 2.))
            j = int(round((w - tw) / 2.))
        else:
            # random crop
            i = random.randint(0, h - th) if h != th else 0
            j = random.randint(0, w - tw) if w != tw else 0
        # Slice by explicit length: `i:-i` is empty when i == 0 and has the
        # wrong size when the margin is odd.
        df = df[:, i:i + th, j:j + tw, :]

        if self.mode == 'flow':
            # only take the 2 channels corresponding to flow (x,y)
            df = df[:, :, :, 1:]
            if self.model == '2d':
                # this should be redone...
                # stack 10 along channel axis
                df = np.asarray([df[:10], df[2:12],
                                 df[4:14]])  # gives 3x10xHxWx2
                df = df.transpose(0, 1, 4, 2, 3).reshape(
                    3, 20, self.size, self.size).transpose(0, 2, 3,
                                                           1)  # (3,H,W,20)

        # Normalise pixel values: [0, 255] -> [1, -1].
        df = 1 - 2 * (df.astype(np.float32) / 255)

        if self.model == '2d':
            # 2d -> return TxCxHxW (channel-first layout expected by PyTorch)
            return df.transpose([0, 3, 1, 2]), cls
        # 3d -> return CxTxHxW
        return df.transpose([3, 0, 1, 2]), cls
def _sample_frame_sequence_to_4darray(video, dataset, should_random_seek,
                                      fps_cap):
    """Decode `dataset.num_frames` consecutive frames of `video` into a 4-D
    numpy array.

    Intended to be called from the `__getitem__` of a
    `torch.utils.data.Dataset` subclass: for every example of every
    minibatch a random keyframe of the video is seeked to and `num_frames`
    frames are decoded from there. `num_frames` is normally small (e.g., 32
    frames fed to a 3D ConvNet or an optical flow algorithm).

    Args:
        video: Encoded video.
        dataset: Dataset meta-info providing `width`, `height` and
            `num_frames`.
        should_random_seek: If `True`, `lintel.loadvid` starts decoding from
            a uniformly random seek point in the video (leaving enough room
            to decode the requested number of frames). The seek distance is
            returned so that timestamp-dependent labels can be set
            dynamically. Pass `False` to turn random seeking off.
        fps_cap: The _maximum_ framerate captured from the video. Excess
            frames are dropped, i.e., with an `fps_cap` of 30 every other
            frame of a 60 fps video is dropped.

    Returns:
        A tuple (frames, seek_distance). `frames` is a uint8 array of shape
        (num_frames, height, width, 3) viewing the byte array returned by
        `lintel.loadvid`; `seek_distance` is the number of seconds into
        `video` that decoding started from.
    """
    raw_frames, seek_distance = lintel.loadvid(
        video,
        should_random_seek=should_random_seek,
        width=dataset.width,
        height=dataset.height,
        num_frames=dataset.num_frames,
        fps_cap=fps_cap)

    clip_shape = (dataset.num_frames, dataset.height, dataset.width, 3)
    frames = np.frombuffer(raw_frames, dtype=np.uint8).reshape(clip_shape)

    return frames, seek_distance
Exemplo n.º 8
0
    def __getitem__(self, index):
        """Return a cropped clip from the video file that `index` falls in.

        `index` (taken modulo `self.total_frames`) only selects the file via
        `self.start_index`; the position of the clip inside that file is
        chosen by lintel's random seek. The first `self.frames` decoded
        frames are cropped and written channel-first into
        `self.frame_buffer`.

        Cropping:
            * `self.is_cropped` set: one random offset is drawn per call and
              a window of `self.crop_size` is taken from every frame.
            * otherwise: the window is centred and `self.crop_size` is set
              to `self.frame_size`.

        Args:
            index: Global frame index.

        Returns:
            A tensor holding a copy of `self.frame_buffer`, indexed as
            (channel, frame, row, column).
        """
        index = index % self.total_frames
        # we want bisect_right here so that the first frame in a file gets
        # the file, not the previous file
        file_index = bisect.bisect_right(self.start_index, index)

        # The seek distance returned alongside the frames is not needed.
        frames, _ = lintel.loadvid(
            self.videos[file_index],
            should_random_seek=True,
            width=self.dataset.width,
            height=self.dataset.height,
            num_frames=self.dataset.num_frames,
            fps_cap=60)
        frames = np.frombuffer(frames, dtype=np.uint8).reshape(
            self.dataset.num_frames, self.dataset.height,
            self.dataset.width, 3)

        # The crop window is the same for every frame of the clip, so it is
        # computed once up front instead of inside the frame loop.
        if self.is_cropped:
            crop_x = random.randint(
                0, self.image_shape[1] - self.frame_size[1])
            crop_y = random.randint(
                0, self.image_shape[0] - self.frame_size[0])
        else:
            crop_x = math.floor(
                (self.image_shape[1] - self.frame_size[1]) / 2)
            crop_y = math.floor(
                (self.image_shape[0] - self.frame_size[0]) / 2)
            self.crop_size = self.frame_size
        crop_h, crop_w = self.crop_size[0], self.crop_size[1]

        for i in range(self.frames):
            image = frames[i, crop_y:crop_y + crop_h,
                           crop_x:crop_x + crop_w, :]
            # HxWxC -> CxHxW
            self.frame_buffer[:, i, :, :] = np.rollaxis(image, 2, 0)

        # torch.from_numpy shares memory with its argument. Return a copy so
        # that the next __getitem__ call, which reuses self.frame_buffer,
        # cannot overwrite samples that were already handed out.
        return torch.from_numpy(self.frame_buffer.copy())
Exemplo n.º 9
0
    def __getitem__(self, index):
        """Return the preprocessed clip and label of the video at `index`.

        The file `self.root/<vid>` is decoded with lintel, `2 * self.length`
        frames long, and every second frame is kept. A `self.size` x
        `self.size` window is cropped (centred when `self.random` is false;
        uniformly random plus a 50% horizontal flip otherwise) and values are
        mapped linearly from [0, 255] to [1, -1].

        If the file is missing in 'flow' mode, an all-zero float32 array of
        the output shape is returned with label 0. In other modes the
        subsequent `open` raises.

        Args:
            index: Position of the video name in `self.vids`.

        Returns:
            Tuple `(clip, classification)`. `clip` is float32, laid out as
            TxCxHxW when `self.model == '2d'` and as CxTxHxW otherwise. In
            'flow' mode only the last two channels are kept; for the 2d
            model three overlapping 10-frame windows are stacked into 20
            channels.
        """
        vid = self.vids[index]  # video name at this index
        classification = self.data[vid]  # label of that video

        path = os.path.join(self.root, vid)
        if not os.path.exists(path):
            if self.mode == 'flow' and self.model == '2d':
                return np.zeros(
                    (3, 20, self.size, self.size),
                    dtype=np.float32), 0  # return zeros if not found
            elif self.mode == 'flow' and self.model == '3d':
                return np.zeros(
                    (2, self.length, self.size, self.size),
                    dtype=np.float32), 0  # return zeros if not found

        with open(path, 'rb') as f:
            enc_vid = f.read()  # raw encoded bytes

        # Request exactly as many frames as the reshape below assumes.
        # Relying on lintel's default frame count only works when it happens
        # to equal 2 * self.length.
        num_frames = self.length * 2
        df, w, h, _ = lintel.loadvid(enc_vid,
                                     should_random_seek=self.random,
                                     num_frames=num_frames)

        # interpret buffer as a 1-dimensional uint8 array
        df = np.frombuffer(df, dtype=np.uint8)

        # Keep every second frame.
        df = df.reshape(num_frames, h, w, 3)[::2]

        th = tw = self.size
        if not self.random:
            # center crop
            i = int(round((h - th) / 2.))
            j = int(round((w - tw) / 2.))
            # Slice by explicit length: `i:-i` is empty when i == 0 and has
            # the wrong size when the margin is odd.
            df = df[:, i:i + th, j:j + tw, :]
        else:
            # random crop -> different each time the video is loaded
            i = random.randint(0, h - th) if h != th else 0
            j = random.randint(0, w - tw) if w != tw else 0
            df = df[:, i:i + th, j:j + tw, :]

            # randomly flip (axis 2 is the width axis)
            if random.random() < 0.5:
                df = np.flip(df, axis=2).copy()

        if self.mode == 'flow':
            # only take the 2 channels corresponding to flow (x,y)
            df = df[:, :, :, 1:]
            if self.model == '2d':
                # this should be redone...
                # stack 10 along channel axis
                df = np.asarray([df[:10], df[2:12],
                                 df[4:14]])  # gives 3x10xHxWx2
                df = df.transpose(0, 1, 4, 2,
                                  3).reshape(3, 20, self.size,
                                             self.size).transpose(0, 2, 3, 1)

        # normalise data: [0, 255] -> [1, -1]
        df = 1 - 2 * (df.astype(np.float32) / 255)

        if self.model == '2d':
            # 2d -> return TxCxHxW
            return df.transpose([0, 3, 1, 2]), classification

        # 3d -> return CxTxHxW
        return df.transpose([3, 0, 1, 2]), classification
Exemplo n.º 10
0
    def __getitem__(self, index):
        """Return the preprocessed clip and label stored at `index`.

        The video is decoded with lintel, `2 * self.length` frames long, and
        every second frame is kept. When both decoded dimensions are at
        least `2 * self.size`, every second row and column is kept as well.
        Smaller videos keep their resolution. A `self.size` x `self.size`
        window is then cropped (centred when `self.random` is false,
        uniformly random otherwise) and the values are mapped linearly from
        [0, 255] to [1, -1].

        Args:
            index: Position of the `(path, label)` pair in `self.data`.

        Returns:
            Tuple `(clip, cls)`. `clip` is a float32 array laid out as
            TxCxHxW when `self.model == '2d'` and as CxTxHxW otherwise. In
            'flow' mode only the last two channels are kept; for the 2d
            model three overlapping 10-frame windows are additionally
            stacked into 20 channels (needs at least 14 sampled frames).
        """
        vid, cls = self.data[index]

        with open(vid, 'rb') as f:
            enc_vid = f.read()

        num_frames = self.length * 2
        df, w, h, _ = lintel.loadvid(enc_vid,
                                     should_random_seek=self.random,
                                     num_frames=num_frames)
        # Reshape with the true decoded size and keep every second frame.
        df = np.frombuffer(df, dtype=np.uint8).reshape(num_frames, h, w,
                                                       3)[::2]

        # Halve the spatial resolution only if the result can still hold a
        # full crop. Doing this once, before branching on self.random, keeps
        # the centre-crop path consistent with the random-crop path.
        if w >= 2 * self.size and h >= 2 * self.size:
            df = df[:, ::2, ::2, :]
            h, w = df.shape[1:3]

        th = tw = self.size
        if not self.random:
            # center crop
            i = int(round((h - th) / 2.))
            j = int(round((w - tw) / 2.))
        else:
            i = random.randint(0, h - th) if h != th else 0
            j = random.randint(0, w - tw) if w != tw else 0
        # Slice by explicit length: `i:-i` is empty when i == 0 and has the
        # wrong size when the margin is odd.
        df = df[:, i:i + th, j:j + tw, :]

        if self.mode == 'flow':
            # only take the 2 channels corresponding to flow (x,y)
            df = df[:, :, :, 1:]
            if self.model == '2d':
                # this should be redone...
                # stack 10 along channel axis
                df = np.asarray([df[:10], df[2:12],
                                 df[4:14]])  # gives 3x10xHxWx2
                df = df.transpose(0, 1, 4, 2,
                                  3).reshape(3, 20, self.size,
                                             self.size).transpose(0, 2, 3, 1)

        # Map [0, 255] to [1, -1].
        df = 1 - 2 * (df.astype(np.float32) / 255)

        if self.model == '2d':
            # 2d -> return TxCxHxW
            return df.transpose([0, 3, 1, 2]), cls
        # 3d -> return CxTxHxW
        return df.transpose([3, 0, 1, 2]), cls