def _make_serialized_tf_sequence_example(self):
    num_frames = 4
    image_height = 20
    image_width = 30
    image_source_ids = [str(i) for i in range(num_frames)]
    encoded_images = self._make_random_serialized_jpeg_images(
        num_frames, image_height, image_width)
    sequence_example_serialized = seq_example_util.make_sequence_example(
        dataset_name='video_dataset',
        video_id='video',
        encoded_images=encoded_images,
        image_height=image_height,
        image_width=image_width,
        image_source_ids=image_source_ids,
        image_format='JPEG',
        is_annotated=[[1], [1], [1], [1]],
        bboxes=[
            [[]],  # Frame 0.
            [[0., 0., 1., 1.]],  # Frame 1.
            [[0., 0., 1., 1.], [0.1, 0.1, 0.2, 0.2]],  # Frame 2.
            [[]],  # Frame 3.
        ],
        label_strings=[
            [],  # Frame 0.
            ['Abyssinian'],  # Frame 1.
            ['Abyssinian', 'american_bulldog'],  # Frame 2.
            [],  # Frame 3.
        ]).SerializeToString()
    return sequence_example_serialized
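
# Hedged sketch (not from the original tests): one way the serialized proto
# returned above could be inspected. The feature keys mirror the ones asserted
# elsewhere in this listing; the helper name is a hypothetical addition.
def _inspect_serialized_sequence_example(self):
    serialized = self._make_serialized_tf_sequence_example()
    seq_example = tf.train.SequenceExample.FromString(serialized)
    # Per-clip metadata lives in the context features.
    dataset_name = seq_example.context.feature[
        'example/dataset_name'].bytes_list.value[0]
    # Per-frame data lives in the feature lists, one entry per frame.
    num_encoded_frames = len(
        seq_example.feature_lists.feature_list['image/encoded'].feature)
    return dataset_name, num_encoded_frames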
Example #2
    def create_tf_record_sequence_example(self):
        path = os.path.join(self.get_temp_dir(), 'seq_tfrecord')
        writer = tf.python_io.TFRecordWriter(path)

        num_frames = 4
        image_height = 4
        image_width = 5
        image_source_ids = [str(i) for i in range(num_frames)]
        with self.test_session():
            encoded_images = self._make_random_serialized_jpeg_images(
                num_frames, image_height, image_width)
            sequence_example_serialized = seq_example_util.make_sequence_example(
                dataset_name='video_dataset',
                video_id='video',
                encoded_images=encoded_images,
                image_height=image_height,
                image_width=image_width,
                image_source_ids=image_source_ids,
                image_format='JPEG',
                is_annotated=[[1], [1], [1], [1]],
                bboxes=[
                    [[]],  # Frame 0.
                    [[0., 0., 1., 1.]],  # Frame 1.
                    [[0., 0., 1., 1.], [0.1, 0.1, 0.2, 0.2]],  # Frame 2.
                    [[]],  # Frame 3.
                ],
                label_strings=[
                    [],  # Frame 0.
                    ['Abyssinian'],  # Frame 1.
                    ['Abyssinian', 'american_bulldog'],  # Frame 2.
                    [],  # Frame 3
                ]).SerializeToString()
            writer.write(sequence_example_serialized)
            writer.close()
        return path
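
    # Hedged sketch (not part of the original test): reading back the record
    # written by create_tf_record_sequence_example with tf.data, assuming TF2
    # eager execution. The method name below is hypothetical.
    def _read_back_sequence_example_record(self):
        path = self.create_tf_record_sequence_example()
        dataset = tf.data.TFRecordDataset([path])
        # Each element is one serialized tf.train.SequenceExample string.
        return [tf.train.SequenceExample.FromString(record.numpy())
                for record in dataset]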
    def graph_fn():
      label_map_proto_file = os.path.join(self.get_temp_dir(), 'labelmap.pbtxt')
      self._create_label_map(label_map_proto_file)
      decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(
          label_map_proto_file=label_map_proto_file)
      sequence_example_serialized = seq_example_util.make_sequence_example(
          dataset_name='video_dataset',
          video_id='video',
          encoded_images=encoded_images,
          image_height=image_height,
          image_width=image_width,
          image_format='JPEG',
          image_source_ids=[str(i) for i in range(num_frames)],
          is_annotated=[[1], [1], [1], [1]],
          bboxes=[
              [[0., 0., 1., 1.]],  # Frame 0.
              [[0.2, 0.2, 1., 1.],
               [0., 0., 1., 1.]],  # Frame 1.
              [[0., 0., 1., 1.],  # Frame 2.
               [0.1, 0.1, 0.2, 0.2]],
              [[]],  # Frame 3.
          ],
          label_strings=[
              ['fox'],  # Frame 0. Fox will be filtered out.
              ['fox', 'dog'],  # Frame 1. Fox will be filtered out.
              ['dog', 'cat'],  # Frame 2.
              [],  # Frame 3
          ]).SerializeToString()

      example_string_tensor = tf.convert_to_tensor(sequence_example_serialized)
      return decoder.decode(example_string_tensor)
    def graph_fn():
      sequence_example_serialized = seq_example_util.make_sequence_example(
          dataset_name='video_dataset',
          video_id='video',
          encoded_images=encoded_images,
          image_height=image_height,
          image_width=image_width,
          image_format='JPEG',
          image_source_ids=[str(i) for i in range(num_frames)],
          bboxes=[
              [[]],
              [[]],
              [[]],
              [[]]
          ],
          label_strings=[
              [],
              [],
              [],
              []
          ]).SerializeToString()
      example_string_tensor = tf.convert_to_tensor(sequence_example_serialized)

      label_map_proto_file = os.path.join(self.get_temp_dir(), 'labelmap.pbtxt')
      self._create_label_map(label_map_proto_file)
      decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(
          label_map_proto_file=label_map_proto_file)
      return decoder.decode(example_string_tensor)
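
  # Hedged sketch (not from the original tests): a graph_fn like the ones above
  # returns the decoder's tensor dict; under eager execution it could be
  # materialized directly, assuming decode() yields a dict of dense tensors.
  # The helper name is hypothetical.
  def _materialize_decoded_tensors(self, graph_fn):
    tensor_dict = graph_fn()
    return {key: value.numpy() for key, value in tensor_dict.items()}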
  def test_make_unlabeled_example(self):
    num_frames = 5
    image_height = 100
    image_width = 200
    dataset_name = b'unlabeled_dataset'
    video_id = b'video_000'
    images = tf.cast(tf.random.uniform(
        [num_frames, image_height, image_width, 3],
        maxval=256,
        dtype=tf.int32), dtype=tf.uint8)
    image_source_ids = [str(idx) for idx in range(num_frames)]
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
    encoded_images = self.materialize_tensors(encoded_images_list)
    seq_example = seq_example_util.make_sequence_example(
        dataset_name=dataset_name,
        video_id=video_id,
        encoded_images=encoded_images,
        image_height=image_height,
        image_width=image_width,
        image_format='JPEG',
        image_source_ids=image_source_ids)

    context_feature_dict = seq_example.context.feature
    self.assertEqual(
        dataset_name,
        context_feature_dict['example/dataset_name'].bytes_list.value[0])
    self.assertEqual(
        0,
        context_feature_dict['clip/start/timestamp'].int64_list.value[0])
    self.assertEqual(
        num_frames - 1,
        context_feature_dict['clip/end/timestamp'].int64_list.value[0])
    self.assertEqual(
        num_frames,
        context_feature_dict['clip/frames'].int64_list.value[0])
    self.assertEqual(
        3,
        context_feature_dict['image/channels'].int64_list.value[0])
    self.assertEqual(
        b'JPEG',
        context_feature_dict['image/format'].bytes_list.value[0])
    self.assertEqual(
        image_height,
        context_feature_dict['image/height'].int64_list.value[0])
    self.assertEqual(
        image_width,
        context_feature_dict['image/width'].int64_list.value[0])
    self.assertEqual(
        video_id,
        context_feature_dict['clip/media_id'].bytes_list.value[0])

    seq_feature_dict = seq_example.feature_lists.feature_list
    self.assertLen(
        seq_feature_dict['image/encoded'].feature[:],
        num_frames)
    timestamps = [
        feature.int64_list.value[0] for feature
        in seq_feature_dict['image/timestamp'].feature]
    self.assertAllEqual(list(range(num_frames)), timestamps)
    source_ids = [
        feature.bytes_list.value[0] for feature
        in seq_feature_dict['image/source_id'].feature]
    self.assertAllEqual(
        [six.ensure_binary(str(idx)) for idx in range(num_frames)],
        source_ids)
  def test_make_labeled_example_with_predictions(self):
    num_frames = 2
    image_height = 100
    image_width = 200
    dataset_name = b'unlabeled_dataset'
    video_id = b'video_000'
    images = tf.cast(tf.random.uniform(
        [num_frames, image_height, image_width, 3],
        maxval=256,
        dtype=tf.int32), dtype=tf.uint8)
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
    encoded_images = self.materialize_tensors(encoded_images_list)
    bboxes = [
        np.array([[0., 0., 0.75, 0.75],
                  [0., 0., 1., 1.]], dtype=np.float32),
        np.array([[0., 0.25, 0.5, 0.75]], dtype=np.float32)
    ]
    label_strings = [
        np.array(['cat', 'frog']),
        np.array(['cat'])
    ]
    detection_bboxes = [
        np.array([[0., 0., 0.75, 0.75]], dtype=np.float32),
        np.zeros([0, 4], dtype=np.float32)
    ]
    detection_classes = [
        np.array([5], dtype=np.int64),
        np.array([], dtype=np.int64)
    ]
    detection_scores = [
        np.array([0.9], dtype=np.float32),
        np.array([], dtype=np.float32)
    ]

    seq_example = seq_example_util.make_sequence_example(
        dataset_name=dataset_name,
        video_id=video_id,
        encoded_images=encoded_images,
        image_height=image_height,
        image_width=image_width,
        bboxes=bboxes,
        label_strings=label_strings,
        detection_bboxes=detection_bboxes,
        detection_classes=detection_classes,
        detection_scores=detection_scores)

    context_feature_dict = seq_example.context.feature
    self.assertEqual(
        dataset_name,
        context_feature_dict['example/dataset_name'].bytes_list.value[0])
    self.assertEqual(
        0,
        context_feature_dict['clip/start/timestamp'].int64_list.value[0])
    self.assertEqual(
        1,
        context_feature_dict['clip/end/timestamp'].int64_list.value[0])
    self.assertEqual(
        num_frames,
        context_feature_dict['clip/frames'].int64_list.value[0])

    seq_feature_dict = seq_example.feature_lists.feature_list
    self.assertLen(
        seq_feature_dict['image/encoded'].feature[:],
        num_frames)
    actual_timestamps = [
        feature.int64_list.value[0] for feature
        in seq_feature_dict['image/timestamp'].feature]
    self.assertAllEqual([0, 1], actual_timestamps)
    # Frame 0.
    self.assertAllEqual(
        1,
        seq_feature_dict['region/is_annotated'].feature[0].int64_list.value[0])
    self.assertAllClose(
        [0., 0.],
        seq_feature_dict['region/bbox/ymin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0., 0.],
        seq_feature_dict['region/bbox/xmin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0.75, 1.],
        seq_feature_dict['region/bbox/ymax'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0.75, 1.],
        seq_feature_dict['region/bbox/xmax'].feature[0].float_list.value[:])
    self.assertAllEqual(
        [b'cat', b'frog'],
        seq_feature_dict['region/label/string'].feature[0].bytes_list.value[:])
    self.assertAllClose(
        [0.],
        seq_feature_dict[
            'predicted/region/bbox/ymin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0.],
        seq_feature_dict[
            'predicted/region/bbox/xmin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0.75],
        seq_feature_dict[
            'predicted/region/bbox/ymax'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0.75],
        seq_feature_dict[
            'predicted/region/bbox/xmax'].feature[0].float_list.value[:])
    self.assertAllEqual(
        [5],
        seq_feature_dict[
            'predicted/region/label/index'].feature[0].int64_list.value[:])
    self.assertAllClose(
        [0.9],
        seq_feature_dict[
            'predicted/region/label/confidence'].feature[0].float_list.value[:])

    # Frame 1.
    self.assertAllEqual(
        1,
        seq_feature_dict['region/is_annotated'].feature[1].int64_list.value[0])
    self.assertAllClose(
        [0.0],
        seq_feature_dict['region/bbox/ymin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [0.25],
        seq_feature_dict['region/bbox/xmin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [0.5],
        seq_feature_dict['region/bbox/ymax'].feature[1].float_list.value[:])
    self.assertAllClose(
        [0.75],
        seq_feature_dict['region/bbox/xmax'].feature[1].float_list.value[:])
    self.assertAllEqual(
        [b'cat'],
        seq_feature_dict['region/label/string'].feature[1].bytes_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict[
            'predicted/region/bbox/ymin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict[
            'predicted/region/bbox/xmin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict[
            'predicted/region/bbox/ymax'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict[
            'predicted/region/bbox/xmax'].feature[1].float_list.value[:])
    self.assertAllEqual(
        [],
        seq_feature_dict[
            'predicted/region/label/index'].feature[1].int64_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict[
            'predicted/region/label/confidence'].feature[1].float_list.value[:])
  def test_make_labeled_example_with_context_features(self):
    num_frames = 2
    image_height = 100
    image_width = 200
    dataset_name = b'unlabeled_dataset'
    video_id = b'video_000'
    labels = [b'dog', b'cat']
    images = tf.cast(tf.random.uniform(
        [num_frames, image_height, image_width, 3],
        maxval=256,
        dtype=tf.int32), dtype=tf.uint8)
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
    encoded_images = self.materialize_tensors(encoded_images_list)
    timestamps = [100000, 110000]
    is_annotated = [1, 0]
    bboxes = [
        np.array([[0., 0., 0., 0.],
                  [0., 0., 1., 1.]], dtype=np.float32),
        np.zeros([0, 4], dtype=np.float32)
    ]
    label_strings = [
        np.array(labels),
        np.array([])
    ]
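    # Six context feature values with context_feature_length 3, read here as
    # one 3-vector per entry in context_features_image_id_list (an
    # interpretation based on the shapes used in this test).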
    context_features = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    context_feature_length = [3]
    context_features_image_id_list = [b'im_1', b'im_2']

    seq_example = seq_example_util.make_sequence_example(
        dataset_name=dataset_name,
        video_id=video_id,
        encoded_images=encoded_images,
        image_height=image_height,
        image_width=image_width,
        timestamps=timestamps,
        is_annotated=is_annotated,
        bboxes=bboxes,
        label_strings=label_strings,
        context_features=context_features,
        context_feature_length=context_feature_length,
        context_features_image_id_list=context_features_image_id_list)

    context_feature_dict = seq_example.context.feature
    self.assertEqual(
        dataset_name,
        context_feature_dict['example/dataset_name'].bytes_list.value[0])
    self.assertEqual(
        timestamps[0],
        context_feature_dict['clip/start/timestamp'].int64_list.value[0])
    self.assertEqual(
        timestamps[-1],
        context_feature_dict['clip/end/timestamp'].int64_list.value[0])
    self.assertEqual(
        num_frames,
        context_feature_dict['clip/frames'].int64_list.value[0])

    self.assertAllClose(
        context_features,
        context_feature_dict['image/context_features'].float_list.value[:])
    self.assertEqual(
        context_feature_length[0],
        context_feature_dict[
            'image/context_feature_length'].int64_list.value[0])
    self.assertEqual(
        context_features_image_id_list,
        context_feature_dict[
            'image/context_features_image_id_list'].bytes_list.value[:])

    seq_feature_dict = seq_example.feature_lists.feature_list
    self.assertLen(
        seq_feature_dict['image/encoded'].feature[:],
        num_frames)
    actual_timestamps = [
        feature.int64_list.value[0] for feature
        in seq_feature_dict['image/timestamp'].feature]
    self.assertAllEqual(timestamps, actual_timestamps)
    # Frame 0.
    self.assertAllEqual(
        is_annotated[0],
        seq_feature_dict['region/is_annotated'].feature[0].int64_list.value[0])
    self.assertAllClose(
        [0., 0.],
        seq_feature_dict['region/bbox/ymin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0., 0.],
        seq_feature_dict['region/bbox/xmin'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0., 1.],
        seq_feature_dict['region/bbox/ymax'].feature[0].float_list.value[:])
    self.assertAllClose(
        [0., 1.],
        seq_feature_dict['region/bbox/xmax'].feature[0].float_list.value[:])
    self.assertAllEqual(
        labels,
        seq_feature_dict['region/label/string'].feature[0].bytes_list.value[:])

    # Frame 1.
    self.assertAllEqual(
        is_annotated[1],
        seq_feature_dict['region/is_annotated'].feature[1].int64_list.value[0])
    self.assertAllClose(
        [],
        seq_feature_dict['region/bbox/ymin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict['region/bbox/xmin'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict['region/bbox/ymax'].feature[1].float_list.value[:])
    self.assertAllClose(
        [],
        seq_feature_dict['region/bbox/xmax'].feature[1].float_list.value[:])
    self.assertAllEqual(
        [],
        seq_feature_dict['region/label/string'].feature[1].bytes_list.value[:])
Example #8
    def _generate_sequence_examples(self, annotation_file, excluded_file,
                                    label_map, seconds_per_sequence,
                                    hop_between_sequences,
                                    video_path_format_string):
        """For each row in the annotation CSV, generates corresponding examples.

    When iterating through frames for a single sequence example, skips over
    excluded frames. When moving to the next sequence example, also skips over
    excluded frames as if they don't exist. Generates equal-length sequence
    examples, each with length seconds_per_sequence (1 fps) and gaps of
    hop_between_sequences frames (and seconds) between them, possible greater
    due to excluded frames.

    Args:
      annotation_file: path to the file of AVA CSV annotations.
      excluded_file: path to a CSV file of excluded timestamps for each video.
      label_map: an {int: string} label map.
      seconds_per_sequence: The number of seconds per example in each example.
      hop_between_sequences: The hop between sequences. If less than
          seconds_per_sequence, will overlap.
      video_path_format_string: File path format to glob video files.

    Yields:
      Each prepared tf.SequenceExample of metadata also containing video frames
    """
        fieldnames = [
            'id', 'timestamp_seconds', 'xmin', 'ymin', 'xmax', 'ymax',
            'action_label'
        ]
        frame_excluded = {}
        # Build a lookup of excluded (video id, timestamp in seconds) pairs.
        with open(excluded_file, 'r') as excluded:
            reader = csv.reader(excluded)
            for row in reader:
                frame_excluded[(row[0], int(float(row[1])))] = True
        with open(annotation_file, 'r') as annotations:
            reader = csv.DictReader(annotations, fieldnames)
            frame_annotations = collections.defaultdict(list)
            ids = set()
            # Aggregate annotations by video and timestamp.
            for row in reader:
                ids.add(row['id'])
                key = (row['id'], int(float(row['timestamp_seconds'])))
                frame_annotations[key].append(row)
            # For each video, find the aggregated annotations near each sampled frame.
            logging.info('Generating metadata...')
            media_num = 1
            for media_id in ids:
                logging.info('%d/%d, ignore warnings.\n', media_num, len(ids))
                media_num += 1

                filepath = glob.glob(
                    video_path_format_string.format(media_id) + '*')[0]
                cur_vid = cv2.VideoCapture(filepath)
                width = cur_vid.get(cv2.CAP_PROP_FRAME_WIDTH)
                height = cur_vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
                middle_frame_time = POSSIBLE_TIMESTAMPS[0]
                while middle_frame_time < POSSIBLE_TIMESTAMPS[-1]:
                    start_time = middle_frame_time - seconds_per_sequence // 2 - (
                        0 if seconds_per_sequence % 2 == 0 else 1)
                    end_time = middle_frame_time + (seconds_per_sequence // 2)
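                    # Worked example (illustrative numbers, not from the
                    # script): with seconds_per_sequence=5 and
                    # middle_frame_time=915, start_time=912 and end_time=917,
                    # so the half-open window [912, 917) covers five 1 fps
                    # frames around middle_frame_time (before any shifts from
                    # excluded frames below).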

                    total_boxes = []
                    total_labels = []
                    total_label_strings = []
                    total_images = []
                    total_source_ids = []
                    total_confidences = []
                    total_is_annotated = []
                    windowed_timestamp = start_time

                    while windowed_timestamp < end_time:
                        if (media_id, windowed_timestamp) in frame_excluded:
                            end_time += 1
                            windowed_timestamp += 1
                            logging.info(
                                'Ignoring and skipping excluded frame.')
                            continue

                        cur_vid.set(cv2.CAP_PROP_POS_MSEC,
                                    (windowed_timestamp) * SECONDS_TO_MILLI)
                        _, image = cur_vid.read()
                        _, buffer = cv2.imencode('.jpg', image)

                        # tobytes() replaces the deprecated ndarray.tostring().
                        bufstring = buffer.tobytes()
                        total_images.append(bufstring)
                        source_id = str(windowed_timestamp) + '_' + media_id
                        total_source_ids.append(source_id)
                        total_is_annotated.append(1)

                        boxes = []
                        labels = []
                        label_strings = []
                        confidences = []
                        for row in frame_annotations[(media_id,
                                                      windowed_timestamp)]:
                            if len(row) > 2 and int(
                                    row['action_label']) in label_map:
                                boxes.append([
                                    float(row['ymin']),
                                    float(row['xmin']),
                                    float(row['ymax']),
                                    float(row['xmax'])
                                ])
                                labels.append(int(row['action_label']))
                                label_strings.append(label_map[int(
                                    row['action_label'])])
                                confidences.append(1)
                            else:
                                logging.warning('Unknown label: %s',
                                                row['action_label'])

                        total_boxes.append(boxes)
                        total_labels.append(labels)
                        total_label_strings.append(label_strings)
                        total_confidences.append(confidences)
                        windowed_timestamp += 1

                    if total_boxes:
                        yield seq_example_util.make_sequence_example(
                            'AVA',
                            media_id,
                            total_images,
                            int(height),
                            int(width),
                            'jpeg',
                            total_source_ids,
                            None,
                            total_is_annotated,
                            total_boxes,
                            total_label_strings,
                            use_strs_for_source_id=True)

                    # Move middle_frame_time forward by the hop; excluded
                    # frames do not count toward the hop.
                    frames_mv = 0
                    frames_excluded_count = 0
                    while (frames_mv <
                           hop_between_sequences + frames_excluded_count
                           and middle_frame_time + frames_mv <
                           POSSIBLE_TIMESTAMPS[-1]):
                        frames_mv += 1
                        if (media_id, windowed_timestamp +
                                frames_mv) in frame_excluded:
                            frames_excluded_count += 1
                    middle_frame_time += frames_mv

                cur_vid.release()
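
    # Hedged sketch (not part of the original script): one way the generator
    # above could be driven to write its sequence examples to a TFRecord file.
    # The method name and keyword passthrough are hypothetical placeholders.
    def _write_sequence_examples_sketch(self, output_path, **generator_kwargs):
        with tf.io.TFRecordWriter(output_path) as writer:
            for seq_example in self._generate_sequence_examples(
                    **generator_kwargs):
                writer.write(seq_example.SerializeToString())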