Example #1
def retrieve_action_pair_clips_dict(mode='train'):
    '''
    Similar to retrieve_action_pair_clips(), but does not condition on actions,
    and returns every available pair in the dataset organized by dictionary keys instead.
    '''

    # Format with participant_id, video_id, frame_number
    video_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
    gulp_subfolder = ('rgb_train'
                      if mode == 'train' or mode == 'val' else 'rgb_' + mode)
    gulp_path = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp/' + gulp_subfolder
    epic_inst = list(EpicVideoDataset(gulp_path, 'verb+noun'))
    epic_inst.sort(
        key=(lambda k: k.video_id + '_{:010d}'.format(k.start_frame)))
    # maps (verb_A, noun_A, verb_B, noun_B) to list of clip metadata
    result = defaultdict(list)

    # Loop over all pairs of action segments
    for i in range(len(epic_inst) - 1):
        segment_A = epic_inst[i]
        segment_B = epic_inst[i + 1]
        # Proceed only if same video
        if segment_A.video_id != segment_B.video_id:
            continue
        # Set action pair as key
        cur_key = (segment_A.verb_class, segment_A.noun_class,
                   segment_B.verb_class, segment_B.noun_class)
        # Append clip
        cur_item = (segment_A.video_id, segment_A.start_frame,
                    segment_A.num_frames,
                    segment_B.start_frame - (segment_A.start_frame + segment_A.num_frames),
                    segment_B.num_frames)
        result[cur_key].append(cur_item)

    return result
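
A minimal usage sketch (assuming the EPIC gulp paths above exist and that defaultdict / EpicVideoDataset are imported as the function requires); the pair inspected below is arbitrary, purely for illustration:

pair_clips = retrieve_action_pair_clips_dict(mode='train')
print('distinct action pairs:', len(pair_clips))
# Keys are (verb_class_A, noun_class_A, verb_class_B, noun_class_B);
# values are lists of (video_id, start_frame, frames_A, frames_gap, frames_B).
some_key, clips = next(iter(pair_clips.items()))
for video_id, start_frame, frames_A, frames_gap, frames_B in clips[:3]:
    print(video_id, start_frame, frames_A + frames_gap + frames_B)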
Example #2
    def __init__(self,
                 mode='train',
                 transform=None,
                 seq_len=5,
                 num_seq=8,
                 downsample=6,
                 class_type='verb+noun',
                 drive='ssd'):
        print(
            '-- WARNING! -- using obsolete dataset class, see utils/dataset_epic.py and dataset_other.py instead'
        )
        self.mode = mode
        self.transform = transform
        self.seq_len = seq_len
        self.num_seq = num_seq
        self.downsample = downsample
        self.class_type = class_type
        if drive == 'ssd':
            gulp_root = '/local/vondrick/epic-kitchens/gulp'
        else:
            gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'

        print(os.path.join(gulp_root, 'rgb_train', self.class_type))
        self.EpicDataset = EpicVideoDataset(
            os.path.join(gulp_root, 'rgb_train'), self.class_type)
        dataset = list(self.EpicDataset)
        rgb = []
        for i in range(len(dataset)):
            # remove segments that are too short
            if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
                rgb.append(dataset[i])
        del dataset
        train_idx = random.sample(range(1, len(rgb)),
                                  int(float(len(rgb)) * 0.8))
        rgb_train = []
        rgb_val = []
        for i in range(len(rgb)):
            if i in train_idx:
                rgb_train.append(rgb[i])
            else:
                rgb_val.append(rgb[i])
        if self.mode == 'train':
            self.video_info = rgb_train
        elif self.mode in ['val']:
            self.video_info = rgb_val
Example #3
def gen_label(gulp_dir, interim_dir, out, split_path):
    with open(split_path, 'r') as f:
        trainval = json.load(f)
    idxsplit = (len(trainval['train']) + len(trainval['val']))*[None]
    for i in trainval['train']:
        idxsplit[i] = 'train'
    for i in trainval['val']:
        idxsplit[i] = 'val'
    assert None not in idxsplit

    action_classes = {}
    class_counts = {}
    next_action_class = 0
    rgbviddata = EpicVideoDataset(f'{gulp_dir}/rgb_train', 'verb+noun')
    outputs = {'train': [], 'val': []}
    categories = []
    for i, seg in enumerate(rgbviddata.video_segments):
        parid = seg['participant_id']
        vidid = seg['video_id']
        nar = seg['narration'].replace(' ', '-')
        uid = seg['uid']
        reldir = f'{parid}/{vidid}/{vidid}_{uid}_{nar}'
        assert os.path.exists(f'{interim_dir}/{reldir}'), f'{interim_dir}/{reldir}'

        verb = seg['verb_class']
        noun = seg['noun_class']
        action = f'{verb},{noun}'
        if action in action_classes:
            classidx = action_classes[action]
            class_counts[action] += 1
        else:
            categories.append(f'{seg["verb"]} {seg["noun"]}')
            classidx = next_action_class
            action_classes[action] = classidx
            class_counts[action] = 1
            next_action_class += 1

        nframes = seg['num_frames']
        outputs[idxsplit[i]].append(f'{reldir} {nframes} {classidx}')

    assert len(set(categories)) == len(categories)

    with open(f'{out}/category.txt', 'w') as f:
        f.write('\n'.join(categories))

    with open(f'{out}/train_videofolder.txt', 'w') as f:
        f.write('\n'.join(outputs['train']))

    with open(f'{out}/val_videofolder.txt', 'w') as f:
        f.write('\n'.join(outputs['val']))

    class_counts = list(class_counts.values())
    class_counts.sort()
    plt.bar(range(0, len(class_counts)), class_counts)
    plt.savefig('action_class_histogram.png')
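
A hedged invocation sketch for gen_label; every path below is a placeholder (not taken from the original), and split_path is assumed to point at a JSON file with integer index lists under 'train' and 'val' that index rgbviddata.video_segments:

# All paths are hypothetical, for illustration only.
gen_label(gulp_dir='/path/to/gulp',
          interim_dir='/path/to/interim/frames',
          out='/path/to/label_output',
          split_path='/path/to/trainval_split.json')
# Writes category.txt, train_videofolder.txt and val_videofolder.txt into the output
# directory, plus action_class_histogram.png into the working directory.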
Example #4
    def __init__(self,
                 mode='train',
                 transform=None,
                 seq_len=6,
                 num_seq=5,
                 downsample=3,
                 class_type='verb+noun'):
        self.mode = mode
        self.transform = transform
        self.seq_len = seq_len
        self.num_seq = num_seq
        self.downsample = downsample
        self.class_type = class_type
        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'

        print(os.path.join(gulp_root, 'rgb_train', self.class_type))
        self.EpicDataset = EpicVideoDataset(
            os.path.join(gulp_root, 'rgb_train'), self.class_type)
        dataset = list(self.EpicDataset)
        rgb = []
        for i in range(len(dataset)):
            # remove segments that are too short
            if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
                rgb.append(dataset[i])
        del dataset
        train_idx = random.sample(range(1, len(rgb)),
                                  int(float(len(rgb)) * 0.8))
        rgb_train = []
        rgb_val = []
        for i in range(len(rgb)):
            if i in train_idx:
                rgb_train.append(rgb[i])
            else:
                rgb_val.append(rgb[i])
        if self.mode == 'train':
            self.video_info = rgb_train
        elif self.mode in ['val']:
            self.video_info = rgb_val
Example #5
def retrieve_action_pair_clips(verb_A, noun_A, verb_B, noun_B, mode='train'):
    '''
    Returns a list of all identifying metadata for Epic video clip sequences
    that depict an ordered sequence of actions "verb_A noun_A" -> "verb_B noun_B".
    Arguments can be either class indices or strings.
    Here, every item is: (video_id, start_frame, frames_A, frames_gap, frames_B),
    for example: ('P12_08', 1234, 200, 100, 200) where total video clip frames is 500.
    mode: train / val / test.
    WARNING: same gulp folder is used for train & val, i.e. split is NOT made here!
    '''

    # Format with participant_id, video_id, frame_number
    video_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
    gulp_subfolder = ('rgb_train'
                      if mode == 'train' or mode == 'val' else 'rgb_' + mode)
    gulp_path = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp/' + gulp_subfolder
    epic_inst = list(EpicVideoDataset(gulp_path, 'verb+noun'))
    epic_inst.sort(
        key=(lambda k: k.video_id + '_{:010d}'.format(k.start_frame)))
    result = []

    # Loop over all pairs of action segments
    for i in range(len(epic_inst) - 1):
        segment_A = epic_inst[i]
        segment_B = epic_inst[i + 1]
        # Proceed only if same video
        if segment_A.video_id != segment_B.video_id:
            continue
        # Condition on first action
        if not(verb_A in [segment_A.verb, segment_A.verb_class]) or \
                not(noun_A in [segment_A.noun, segment_A.noun_class]):
            continue
        # Condition on second action
        if not(verb_B in [segment_B.verb, segment_B.verb_class]) or \
                not(noun_B in [segment_B.noun, segment_B.noun_class]):
            continue
        # Append clip
        cur_item = (segment_A.video_id, segment_A.start_frame,
                    segment_A.num_frames,
                    segment_B.start_frame - (segment_A.start_frame + segment_A.num_frames),
                    segment_B.num_frames)
        result.append(cur_item)

    return result
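
A short usage sketch; the verb and noun strings are illustrative (class indices work equally, per the docstring) and assume those labels exist in the EPIC annotations:

# Clips where an (illustrative) "open fridge" is immediately followed by "take milk".
clips = retrieve_action_pair_clips('open', 'fridge', 'take', 'milk', mode='train')
for video_id, start_frame, frames_A, frames_gap, frames_B in clips:
    end_frame = start_frame + frames_A + frames_gap + frames_B
    print('{}: frames {}..{}'.format(video_id, start_frame, end_frame))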
Example #6
import numpy as np
import torch

from detectron2 import model_zoo
from detectron2.engine.defaults import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog
from detectron2.structures import BoxMode

from gulpio import GulpDirectory
from epic_kitchens.dataset.epic_dataset import EpicVideoDataset
from gulpio.transforms import Scale, CenterCrop, Compose, UnitNorm

from read_gulpio import EpicDataset

class_type = 'noun'
rgb_train = EpicVideoDataset('../../epic/data/processed/gulp/rgb_train',
                             class_type)
transforms = Compose([])
dataset = EpicDataset(transforms)
segment_uids = list(rgb_train.gulp_dir.merged_meta_dict.keys())
example_segment = rgb_train.video_segments[10]
example_frames = rgb_train.load_frames(example_segment)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

for batch_num, (data, label) in enumerate(dataloader):
    frame = data[0].to('cpu').detach().numpy().copy()
    frame = frame.transpose(1, 2, 3, 0)
    frame = np.squeeze(frame)
    break
im = frame
cfg = get_cfg()
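
The snippet ends right after get_cfg(); a typical continuation is sketched below under the assumption that a standard COCO Mask R-CNN model-zoo config is intended (the config name and score threshold are assumptions, not taken from the original):

# Assumed model-zoo config; any detectron2 config file could be substituted.
cfg.merge_from_file(model_zoo.get_config_file(
    'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml')
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
predictor = DefaultPredictor(cfg)  # expects a BGR image by default (cfg.INPUT.FORMAT)
outputs = predictor(im)            # im is the frame pulled from the dataloader above
v = Visualizer(im[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]))
vis = v.draw_instance_predictions(outputs['instances'].to('cpu'))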
Example #7
class epic_gulp(data.Dataset):
    def __init__(self,
                 mode='train',
                 transform=None,
                 seq_len=6,
                 num_seq=5,
                 downsample=3,
                 class_type='verb+noun'):
        self.mode = mode
        self.transform = transform
        self.seq_len = seq_len
        self.num_seq = num_seq
        self.downsample = downsample
        self.class_type = class_type
        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'

        print(os.path.join(gulp_root, 'rgb_train', self.class_type))
        self.EpicDataset = EpicVideoDataset(
            os.path.join(gulp_root, 'rgb_train'), self.class_type)
        dataset = list(self.EpicDataset)
        rgb = []
        for i in range(len(dataset)):
            # remove segments that are too short
            if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
                rgb.append(dataset[i])
        del dataset
        train_idx = random.sample(range(1, len(rgb)),
                                  int(float(len(rgb)) * 0.8))
        rgb_train = []
        rgb_val = []
        for i in range(len(rgb)):
            if i in train_idx:
                rgb_train.append(rgb[i])
            else:
                rgb_val.append(rgb[i])
        if self.mode == 'train':
            self.video_info = rgb_train
        elif self.mode in ['val']:
            self.video_info = rgb_val

    def idx_sampler(self, index):
        vlen = self.video_info[index].num_frames
        if vlen - self.num_seq * self.seq_len * self.downsample <= 0:
            return None
        n = 1
        start_idx = np.random.choice(
            range(vlen - self.num_seq * self.seq_len * self.downsample), n)
        #             print ("start_idx:", start_idx)
        seq_idx = np.expand_dims(np.arange(
            self.num_seq), -1) * self.downsample * self.seq_len + start_idx
        #             print ("seq_idx:", seq_idx)
        seq_idx_block = seq_idx + \
            np.expand_dims(np.arange(self.seq_len), 0) * self.downsample
        #             print ("seq_idx_block:", seq_idx_block)
        return seq_idx_block

    def __getitem__(self, index):

        idx_block = self.idx_sampler(index)
        #             print(idx_block)
        #             print(index)
        #             print(len(self.video_info))
        assert idx_block.shape == (self.num_seq, self.seq_len)
        idx_block = idx_block.reshape(self.num_seq * self.seq_len)

        #print ("idx_block, ", idx_block)
        segment = self.EpicDataset.load_frames(self.video_info[index])
        seq = [segment[i] for i in idx_block]

        # do we need it here
        t_seq = self.transform(seq)  # apply same transform
        num_crop = None
        try:
            (C, H, W) = t_seq[0].size()
            t_seq = torch.stack(t_seq, 0)
        except:
            (C, H, W) = t_seq[0][0].size()
            tmp = [torch.stack(i, 0) for i in t_seq]
            assert len(tmp) == 5
            num_crop = 5
            t_seq = torch.stack(tmp, 1)
        t_seq = t_seq.view(self.num_seq, self.seq_len, C, H, W).transpose(1, 2)

        action = torch.LongTensor([self.video_info[index].verb_class])
        noun = torch.LongTensor([self.video_info[index].noun_class])

        # OLD: return sequence only
        # return t_seq, action, noun
        # NEW: return all useful information in a dictionary
        result = {
            't_seq': t_seq,
            'idx_block': idx_block,
            'vpath': 'TODO, idk how to retrieve',
            'action': action,
            'noun': noun
        }
        return result

    def __len__(self):
        return len(self.video_info)
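
A minimal construction sketch for epic_gulp; the transform below is an assumption chosen only to satisfy the contract __getitem__ expects (a list of frames in, a list of CxHxW tensors out), standing in for whatever augmentation pipeline the surrounding training code actually uses:

import torch
from torchvision.transforms import functional as TF

def list_to_tensor(frames):
    # Per-frame conversion only; a real pipeline would add resizing / augmentation.
    return [TF.to_tensor(f) for f in frames]

dataset = epic_gulp(mode='train', transform=list_to_tensor,
                    seq_len=6, num_seq=5, downsample=3, class_type='verb+noun')
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
batch = next(iter(loader))
print(batch['t_seq'].shape)     # expected: (4, num_seq, C, seq_len, H, W)
print(batch['action'].shape, batch['noun'].shape)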
Example #8
def main(conf, test_set, test_part=-1):
    gulp_path = os.path.join(conf.gulp_test_dir, conf.modality.lower(), 'test',
                             test_set)
    gulp_path = os.path.realpath(gulp_path)
    gulp_path = Path(gulp_path)

    classes_map = pickle.load(open(conf.classes_map, "rb"))
    conf.num_classes = count_num_classes(classes_map)

    net = TSN(conf.num_classes,
              1,
              conf.modality,
              base_model=conf.arch,
              consensus_type=conf.crop_fusion_type,
              dropout=conf.dropout)

    checkpoint = torch.load(conf.weights)
    print("Model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    if conf.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif conf.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                conf.test_crops))

    class_type = 'verb+noun' if conf.class_type == 'action' else conf.class_type
    if conf.modality == 'Flow':
        dataset = EpicVideoFlowDataset(gulp_path=gulp_path,
                                       class_type=class_type)
    else:
        dataset = EpicVideoDataset(gulp_path=gulp_path, class_type=class_type)

    data_loader = torch.utils.data.DataLoader(
        EpicTSNTestDataset(dataset,
                           classes_map,
                           num_segments=conf.test_segments,
                           new_length=1 if conf.modality == "RGB" else 5,
                           modality=conf.modality,
                           transform=torchvision.transforms.Compose([
                               cropping,
                               Stack(roll=conf.arch == 'BNInception'),
                               ToTorchFormatTensor(div=conf.arch != 'BNInception'),
                               GroupNormalize(net.input_mean, net.input_std),
                           ]),
                           part=test_part),
        batch_size=1,
        shuffle=False,
        num_workers=conf.workers * 2,
        pin_memory=True)

    net = torch.nn.DataParallel(net, device_ids=conf.gpus).cuda()
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    proc_start_time = time.time()
    for i, (keys, input_) in enumerate(data_loader):
        rst = eval_video(conf, (i, keys, input_), net)
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num,
            float(cnt_time) / (i + 1)))

    video_index = [x[0] for x in output]
    scores = [x[1] for x in output]

    save_scores = './{}/tsn_{}_{}_testset_{}_{}_lr_{}_model_{:03d}.npz'.format(
        conf.checkpoint, conf.class_type, conf.modality.lower(), test_set,
        conf.arch, conf.lr, checkpoint['epoch'])
    if test_part > 0:
        save_scores = save_scores.replace('.npz',
                                          '_part-{}.npz'.format(test_part))
    np.savez(save_scores, segment_indices=video_index, scores=scores)
Example #9
    def __init__(self,
                 mode='train',
                 transform=None,
                 seq_len=5,
                 num_seq=8,
                 downsample=6,
                 class_type='both',
                 train_val_split=0.2,
                 verb_subset=None,
                 noun_subset=None,
                 participant_subset=None,
                 drive='ssd',
                 sample_method='within',
                 sample_offset=0,
                 label_fraction=1.0):
        '''
        mode: train / val / test_seen / test_unseen.
        seq_len: Number of frames in a video block.
        num_seq: Number of video blocks in a clip / sequence.
        downsample: Temporal sampling rate of frames. The effective new FPS becomes old FPS / downsample.
        class_type: verb / noun / both.
        label_fraction: < 1.0 means use fewer labels for training.

        verb_subset: List of verb strings to condition on, if not None.
        noun_subset: List of noun strings to condition on, if not None.
        participant_subset: List of participants to condition on, if not None.
        Examples: verb_subset = ['take', 'open', 'close'], participant_subset = ['P16', 'P11'].

        sample_method: within / match_start / match_end / before
          - within = uniformly sample a sequence that lies fully within an action label segment (e.g. for action classification)
          - match_start = START of sequence matches START of action label segment
          - match_end = END of sequence matches END of action label segment (e.g. for future uncertainty ranking)
          - before = END of sequence matches START of action label segment (e.g. for action anticipation)
        (NOTE: 'within' discards segments that are too short; the other methods do not.)

        sample_offset: Number of video blocks to shift the sequence sampling by.
        Example 1: if (sample_method == 'match_start', sample_offset == -2),
        then the video sequence starts 2 blocks before the current action starts.
        Example 2: if (sample_method == 'match_end', sample_offset == 3, pred_step == 3),
        then all warmup video blocks represent the current action in progress, but all predicted blocks represent another, unknown action.
        Example 3: if (sample_method == 'before', sample_offset == 1, pred_step == 3),
        then only the LAST predicted block represents the current action; all preceding blocks represent something else.
        '''
        if class_type not in ['verb', 'noun']:
            class_type = 'verb+noun'
            # print('=> class_type is now set to both a.k.a. verb+noun')

        self.mode = mode
        self.transform = transform
        self.seq_len = seq_len
        self.num_seq = num_seq
        self.downsample = downsample
        self.block_frames = seq_len * downsample  # number of frames in one video block
        # number of frames in one complete sequence
        self.seq_frames = seq_len * num_seq * downsample
        self.class_type = class_type
        self.train_val_split = train_val_split
        self.verb_subset = verb_subset
        self.noun_subset = noun_subset
        self.participant_subset = participant_subset
        self.drive = drive
        self.sample_method = sample_method
        self.sample_offset = sample_offset
        self.label_fraction = label_fraction

        # Verify arguments
        if not (mode in ['train', 'val', 'test_seen', 'test_unseen']):
            raise ValueError('Unknown dataset mode: ' + mode)

        # Specify paths, both gulp (= main) and jpg (= backup)
        # JPG path format arguments:
        if drive == 'ssd':
            gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
            self.jpg_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
        else:
            print('== WARNING! == using HDD instead of SSD')
            gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
            self.jpg_path = '/proj/vondrick/datasets/epic-kitchens/data/raw/rgb/{}/{}/frame_{:010d}.jpg'

        # Load video info (RGB frames)
        subfolder = ('rgb_train'
                     if mode == 'train' or mode == 'val' else 'rgb_' + mode)
        full_path = os.path.join(gulp_root, subfolder)
        print('Selected dataset:', full_path, self.class_type)
        self.epic_dataset_inst = EpicVideoDataset(full_path, self.class_type)

        # Split dataset randomly into train & validation with fixed seed
        # NOTE: this split will be different for other values of train_val_split
        dataset = list(self.epic_dataset_inst)
        if mode in ['train', 'val']:
            rand_state = random.getstate()
            random.seed(8888)
            train_list = random.sample(
                dataset,
                int(len(dataset) *
                    (1 - self.train_val_split)))  # without replacement
            random.setstate(rand_state)  # retain & restore random state

            if label_fraction < 1.0:
                print(
                    '== WARNING! == using just a fraction of available labels for training: '
                    + str(label_fraction * 100) + '%')
                used_train_len = int(label_fraction * len(train_list))
                train_list = train_list[:used_train_len]  # deterministic operation because of fixed seed above

            if mode == 'train':
                dataset = train_list
            elif mode == 'val':
                train_set = set(train_list)
                val_list = []
                for item in dataset:
                    if item not in train_set:
                        val_list.append(item)
                dataset = val_list

        # Loop over segments in epic dataset and filter out videos
        rgb = []
        for i in range(len(dataset)):
            # If within, retain only sufficiently long video clips
            if sample_method == 'within' and dataset[i].num_frames <= self.seq_frames:
                continue
            # Condition on verbs
            if verb_subset is not None and dataset[i].verb not in verb_subset:
                continue
            # Condition on nouns
            if noun_subset is not None and dataset[i].noun not in noun_subset:
                continue
            # Condition on participants
            if participant_subset is not None and dataset[i].participant_id not in participant_subset:
                continue
            rgb.append(dataset[i])

        self.video_info = rgb
        del dataset
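
A hedged construction sketch for this dataset class; the name EpicActionDataset is a placeholder (the excerpt shows only the __init__, not the class statement), and the subset values simply reuse the examples given in the docstring:

# Placeholder class name; arguments mirror the docstring above.
train_set = EpicActionDataset(mode='train',
                              class_type='verb',
                              verb_subset=['take', 'open', 'close'],
                              participant_subset=['P16', 'P11'],
                              sample_method='match_end',
                              label_fraction=0.5)
print('segments after filtering:', len(train_set.video_info))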