예제 #1
0
    def _generate_metadata(self,
                           key,
                           download_output,
                           video_path_format_string=None):
        """Yield one metadata tf.train.SequenceExample per annotation CSV row.

        The first CSV row is treated as the header and skipped, and rows that
        are completely blank are ignored. Rows with exactly five fields are
        read as (label_name, video, start, end, split); every other row is
        read as (video, start, end, split), i.e. without a label.

        Args:
          key: which split to process; used to index annotations_files.
          download_output: the tuple output of _download_data containing
            - annotations_files: dict of keys to CSV annotation paths.
            - label_map: dict mapping from label strings to numeric indices.
          video_path_format_string: The format string for the path to local
            files. It is formatted with the row's named fields as keyword
            arguments. When falsy, no data path is set.

        Yields:
          Each tf.SequenceExample of metadata, ready to pass to MediaPipe.

        Raises:
          AssertionError: if a row's start or end field is not all digits.
        """
        annotations_files, label_map = download_output
        labeled_fields = ["label_name", "video", "start", "end", "split"]
        unlabeled_fields = labeled_fields[1:]
        with open(annotations_files[key], "r") as annotations:
            reader = csv.reader(annotations)
            next(reader, None)  # the first row is the header
            for csv_row in reader:
                if not csv_row:
                    # csv.reader yields an empty list for a blank line (e.g.
                    # a trailing newline at the end of the file); there is
                    # nothing to describe, so skip it instead of failing
                    # later with a KeyError on the missing "video" field.
                    continue
                # Rename the row with a consistent set of names.
                if len(csv_row) == 5:
                    field_names = labeled_fields
                else:
                    field_names = unlabeled_fields
                row = dict(zip(field_names, csv_row))
                metadata = tf.train.SequenceExample()
                ms.set_example_id(bytes23(row["video"] + "_" + row["start"]),
                                  metadata)
                ms.set_clip_media_id(bytes23(row["video"]), metadata)
                ms.set_clip_alternative_media_id(bytes23(row["split"]),
                                                 metadata)
                if video_path_format_string:
                    filepath = video_path_format_string.format(**row)
                    ms.set_clip_data_path(bytes23(filepath), metadata)
                # Raised explicitly instead of via `assert` so the check is
                # not stripped when Python runs with -O; the exception type
                # and message are the same as the assert produced.
                if not (row["start"].isdigit() and row["end"].isdigit()):
                    raise AssertionError("Invalid row: %s" % str(row))
                if "label_name" in row:
                    ms.set_clip_label_string([bytes23(row["label_name"])],
                                             metadata)
                    # The numeric index is only attached when a label map
                    # was provided.
                    if label_map:
                        ms.set_clip_label_index([label_map[row["label_name"]]],
                                                metadata)
                yield metadata
 def test_expected_functions_are_defined(self):
     # Smoke test: call one generated accessor per feature on a single shared
     # SequenceExample. The calls shown here make no assertions on stored
     # values; a call that does not raise counts as success.
     #
     # The code from media_sequence_util is already tested, but this test ensures
     # that we actually generate the expected methods. We only test one per
     # feature and the only test is to not crash with undefined attributes. By
     # passing in a value, we also ensure that the types are correct because the
     # underlying code crashes with a type mismatch.
     example = tf.train.SequenceExample()
     # context
     # Every call in this section is a `set_*` accessor; arguments are bytes,
     # ints, floats, or tuples of those, matching the type each one is
     # expected to accept.
     ms.set_example_id(b"string", example)
     ms.set_example_dataset_name(b"string", example)
     ms.set_clip_media_id(b"string", example)
     ms.set_clip_alternative_media_id(b"string", example)
     ms.set_clip_encoded_media_bytes(b"string", example)
     ms.set_clip_encoded_media_start_timestamp(47, example)
     ms.set_clip_data_path(b"string", example)
     ms.set_clip_start_timestamp(47, example)
     ms.set_clip_end_timestamp(47, example)
     ms.set_clip_label_string((b"string", b"test"), example)
     ms.set_clip_label_index((47, 49), example)
     ms.set_clip_label_confidence((0.47, 0.49), example)
     ms.set_segment_start_timestamp((47, 49), example)
     ms.set_segment_start_index((47, 49), example)
     ms.set_segment_end_timestamp((47, 49), example)
     ms.set_segment_end_index((47, 49), example)
     ms.set_segment_label_index((47, 49), example)
     ms.set_segment_label_string((b"test", b"strings"), example)
     ms.set_segment_label_confidence((0.47, 0.49), example)
     ms.set_image_format(b"test", example)
     ms.set_image_channels(47, example)
     ms.set_image_colorspace(b"test", example)
     ms.set_image_height(47, example)
     ms.set_image_width(47, example)
     ms.set_image_frame_rate(0.47, example)
     ms.set_image_data_path(b"test", example)
     ms.set_forward_flow_format(b"test", example)
     ms.set_forward_flow_channels(47, example)
     ms.set_forward_flow_colorspace(b"test", example)
     ms.set_forward_flow_height(47, example)
     ms.set_forward_flow_width(47, example)
     ms.set_forward_flow_frame_rate(0.47, example)
     ms.set_class_segmentation_format(b"test", example)
     ms.set_class_segmentation_height(47, example)
     ms.set_class_segmentation_width(47, example)
     ms.set_class_segmentation_class_label_string((b"test", b"strings"),
                                                  example)
     ms.set_class_segmentation_class_label_index((47, 49), example)
     ms.set_instance_segmentation_format(b"test", example)
     ms.set_instance_segmentation_height(47, example)
     ms.set_instance_segmentation_width(47, example)
     ms.set_instance_segmentation_object_class_index((47, 49), example)
     ms.set_bbox_parts((b"HEAD", b"TOE"), example)
     # feature lists
     # Calls in this section are `add_*` accessors, except for the two
     # `set_bbox_embedding_*` calls grouped with the embedding adders below.
     ms.add_image_encoded(b"test", example)
     ms.add_image_multi_encoded([b"test", b"test"], example)
     ms.add_image_timestamp(47, example)
     ms.add_forward_flow_encoded(b"test", example)
     ms.add_forward_flow_multi_encoded([b"test", b"test"], example)
     ms.add_forward_flow_timestamp(47, example)
     ms.add_bbox_ymin((0.47, 0.49), example)
     ms.add_bbox_xmin((0.47, 0.49), example)
     ms.add_bbox_ymax((0.47, 0.49), example)
     ms.add_bbox_xmax((0.47, 0.49), example)
     ms.add_bbox_point_x((0.47, 0.49), example)
     ms.add_bbox_point_y((0.47, 0.49), example)
     ms.add_predicted_bbox_ymin((0.47, 0.49), example)
     ms.add_predicted_bbox_xmin((0.47, 0.49), example)
     ms.add_predicted_bbox_ymax((0.47, 0.49), example)
     ms.add_predicted_bbox_xmax((0.47, 0.49), example)
     ms.add_bbox_num_regions(47, example)
     ms.add_bbox_is_annotated(47, example)
     ms.add_bbox_is_generated((47, 49), example)
     ms.add_bbox_is_occluded((47, 49), example)
     ms.add_bbox_label_index((47, 49), example)
     ms.add_bbox_label_string((b"test", b"strings"), example)
     ms.add_bbox_label_confidence((0.47, 0.49), example)
     ms.add_bbox_class_index((47, 49), example)
     ms.add_bbox_class_string((b"test", b"strings"), example)
     ms.add_bbox_class_confidence((0.47, 0.49), example)
     ms.add_bbox_track_index((47, 49), example)
     ms.add_bbox_track_string((b"test", b"strings"), example)
     ms.add_bbox_track_confidence((0.47, 0.49), example)
     ms.add_bbox_timestamp(47, example)
     ms.add_predicted_bbox_class_index((47, 49), example)
     ms.add_predicted_bbox_class_string((b"test", b"strings"), example)
     ms.add_predicted_bbox_timestamp(47, example)
     ms.add_class_segmentation_encoded(b"test", example)
     ms.add_class_segmentation_multi_encoded([b"test", b"test"], example)
     ms.add_instance_segmentation_encoded(b"test", example)
     ms.add_instance_segmentation_multi_encoded([b"test", b"test"], example)
     ms.add_class_segmentation_timestamp(47, example)
     # bbox embeddings: two `set_*` calls followed by the per-step `add_*`
     # calls. NOTE(review): presumably the `set_*` pair writes context
     # features like the section above -- confirm against media_sequence.
     ms.set_bbox_embedding_dimensions_per_region((47, 49), example)
     ms.set_bbox_embedding_format(b"test", example)
     ms.add_bbox_embedding_floats((0.47, 0.49), example)
     ms.add_bbox_embedding_encoded((b"text", b"stings"), example)
     ms.add_bbox_embedding_confidence((0.47, 0.49), example)