Exemplo n.º 1
0
    def __getitem__(self, indices):
        """Collate a batch of RGB + optical-flow feature stacks.

        For each index in *indices*, loads the per-video features, replaces
        fully-missing stacks with a single zero feature, pads the batch, and
        returns everything on ``self.device``.
        """
        video_ids, captions, starts, ends = [], [], [], []
        rgb_stacks, flow_stacks = [], []

        for index in indices:
            index = index.item()
            video_id, caption, start, end, duration, *_ = self.dataset.iloc[index]

            feats = load_features_from_npy(self.cfg, self.feature_names_list,
                                           video_id, start, end, duration,
                                           self.pad_idx, self.get_full_feat)
            rgb, flow = feats['rgb'], feats['flow']

            # invariant: rgb and flow are missing together or present together
            assert (rgb is None) == (flow is None)

            # a stack can be empty after filtering — substitute a zero feature
            if rgb is None:
                rgb = fill_missing_features('zero', self.feature_size)
                flow = fill_missing_features('zero', self.feature_size)

            video_ids.append(video_id)
            captions.append(caption)
            starts.append(start)
            ends.append(end)
            rgb_stacks.append(rgb)
            flow_stacks.append(flow)

        # rgb is padded with pad_idx; flow is padded with 0s
        rgb_batch = pad_sequence(rgb_stacks,
                                 batch_first=True,
                                 padding_value=self.pad_idx)
        flow_batch = pad_sequence(flow_stacks,
                                  batch_first=True,
                                  padding_value=0)

        start_tensor = torch.tensor(starts).unsqueeze(1)
        end_tensor = torch.tensor(ends).unsqueeze(1)

        return {
            'video_ids': video_ids,
            'captions': captions,
            'starts': start_tensor.to(self.device),
            'ends': end_tensor.to(self.device),
            'feature_stacks': {
                'rgb': rgb_batch.to(self.device),
                'flow': flow_batch.to(self.device),
            },
        }
Exemplo n.º 2
0
    def __getitem__(self, indices):
        """Collate a batch of audio (VGGish) feature stacks.

        For each index in *indices*, loads the per-video audio features,
        replaces a missing stack with a single zero feature, pads the batch,
        and returns everything on ``self.device``.
        """
        video_ids, captions, starts, ends, audio_stacks = [], [], [], [], []

        # [3]
        for index in indices:
            index = index.item()
            video_id, caption, start, end, duration, *_ = self.dataset.iloc[index]

            feats = load_features_from_npy(self.cfg, self.feature_names_list,
                                           video_id, start, end, duration,
                                           self.pad_idx, self.get_full_feat)
            audio = feats['audio']

            # the stack can be empty after filtering — substitute a zero feature
            if audio is None:
                audio = fill_missing_features('zero', self.feature_size)

            video_ids.append(video_id)
            captions.append(caption)
            starts.append(start)
            ends.append(end)
            audio_stacks.append(audio)

        # [4] see ActivityNetCaptionsDataset.__getitem__ documentation
        audio_batch = pad_sequence(audio_stacks,
                                   batch_first=True,
                                   padding_value=self.pad_idx)

        start_tensor = torch.tensor(starts).unsqueeze(1)
        end_tensor = torch.tensor(ends).unsqueeze(1)

        return {
            'video_ids': video_ids,
            'captions': captions,
            'starts': start_tensor.to(self.device),
            'ends': end_tensor.to(self.device),
            'feature_stacks': {
                'audio': audio_batch.to(self.device),
            },
        }
Exemplo n.º 3
0
    def __getitem__(self, indices):
        """Collate a batch of RGB, optical-flow and audio feature stacks.

        For each index in *indices*, loads the per-video features, replaces
        fully-missing stacks with a single zero feature, pads each modality,
        and returns everything on ``self.device``.
        """
        video_ids, captions, starts, ends = [], [], [], []
        rgb_stacks, flow_stacks, audio_stacks = [], [], []

        # [3]
        for index in indices:
            index = index.item()
            video_id, caption, start, end, duration, *_ = self.dataset.iloc[index]

            feats = load_features_from_npy(self.cfg, self.feature_names_list,
                                           video_id, start, end, duration,
                                           self.pad_idx, self.get_full_feat)
            rgb = feats['rgb']
            flow = feats['flow']
            audio = feats['audio']

            # invariant: rgb and flow are missing together or present together
            assert (rgb is None) == (flow is None)

            # stacks can be empty after filtering — substitute zero features
            # (tied to the assertion above: rgb is None implies flow is None)
            if rgb is None:
                rgb = fill_missing_features('zero', self.video_feature_size)
                flow = fill_missing_features('zero', self.video_feature_size)
            if audio is None:
                audio = fill_missing_features('zero', self.audio_feature_size)

            video_ids.append(video_id)
            captions.append(caption)
            starts.append(start)
            ends.append(end)
            rgb_stacks.append(rgb)
            flow_stacks.append(flow)
            audio_stacks.append(audio)

        # [4] see ActivityNetCaptionsDataset.__getitem__ documentation
        # rgb is padded with pad_idx; flow is padded with 0s: expected to be summed later
        rgb_batch = pad_sequence(rgb_stacks,
                                 batch_first=True,
                                 padding_value=self.pad_idx)
        flow_batch = pad_sequence(flow_stacks,
                                  batch_first=True,
                                  padding_value=0)
        audio_batch = pad_sequence(audio_stacks,
                                   batch_first=True,
                                   padding_value=self.pad_idx)

        start_tensor = torch.tensor(starts).unsqueeze(1)
        end_tensor = torch.tensor(ends).unsqueeze(1)

        return {
            'video_ids': video_ids,
            'captions': captions,
            'starts': start_tensor.to(self.device),
            'ends': end_tensor.to(self.device),
            'feature_stacks': {
                'rgb': rgb_batch.to(self.device),
                'flow': flow_batch.to(self.device),
                'audio': audio_batch.to(self.device),
            },
        }
Exemplo n.º 4
0
 def get_feature_stacks(self, video_id):
     """Load the full (unclipped) feature stacks for *video_id*.

     Passes ``start=end=duration=None`` with ``get_full_feat=True`` so the
     loader returns features covering the whole video.
     """
     return load_features_from_npy(
         self.cfg, self.feature_names_list, video_id,
         start=None, end=None, duration=None,
         pad_idx=self.pad_idx, get_full_feat=True,
     )