def test_can_extract_mnist(self):
        # Load the mocked MNIST data through TFDS directly and build the
        # dataset we expect the extractor to produce from the same data.
        with mock_tfds_data():
            ds, info = tfds.load('mnist', split='train', with_info=True)
            example = next(iter(ds))

            expected = Dataset.from_iterable(
                [
                    DatasetItem(
                        id='0',
                        subset='train',
                        # Drop the trailing channel axis to get a 2D image.
                        image=example['image'].numpy().squeeze(axis=2),
                        annotations=[Label(example['label'].numpy())],
                    ),
                ],
                categories=info.features['label'].names)

            actual = Dataset(make_tfds_extractor('mnist'))

            compare_datasets(self, expected, actual, require_images=True)
    def test_can_extract_voc(self):
        # TFDS is unable to generate fake examples for object detection
        # datasets. See <https://github.com/tensorflow/datasets/issues/3633>.
        example = {
            'image/filename': 'test.png',
            'image': encode_image(np.ones((20, 10)), '.png'),
            'objects': {
                'bbox': [[0.1, 0.2, 0.3, 0.4]],
                'label': [5],
                'is_difficult': [True],
                'is_truncated': [False],
                'pose': [0],
            }
        }

        with mock_tfds_data(example=example):
            info = tfds.builder('voc/2012').info
            poses = info.features['objects'].feature['pose'].names

            expected = Dataset.from_iterable(
                [
                    DatasetItem(
                        id='test',
                        subset='train',
                        image=np.ones((20, 10)),
                        annotations=[
                            # Relative bbox [0.1, 0.2, 0.3, 0.4] on a
                            # 20x10 image maps to x=2, y=2, w=2, h=4.
                            Bbox(2, 2, 2, 4,
                                 label=5,
                                 attributes={
                                     'difficult': True,
                                     'truncated': False,
                                     'pose': poses[0].title(),
                                 }),
                        ],
                    ),
                ],
                categories=info.features['objects'].feature['label'].names
            )

            actual = Dataset(make_tfds_extractor('voc/2012'))

            compare_datasets(self, expected, actual, require_images=True)
# Example #3
def download_command(args):
    """Download the dataset named by ``args.dataset_id`` and export it.

    Only TFDS datasets are supported, via the ``tfds:<name>`` ID prefix.
    The dataset is converted to ``args.output_format`` (or the dataset's
    default converter) and written to ``args.dst_dir`` (or a generated
    directory name).

    Raises:
        CliException: if the dataset ID is unknown or unsupported, TFDS
            is unavailable, the requested converter is not found, or the
            output directory is non-empty and --overwrite was not given.
    """
    env = Environment()

    # Guard clauses instead of nested if/else: reject anything that is
    # not a supported TFDS dataset before doing any work.
    if not args.dataset_id.startswith('tfds:'):
        raise CliException(f"Unknown dataset ID '{args.dataset_id}'")

    if not TFDS_EXTRACTOR_AVAILABLE:
        raise CliException(
            "TFDS datasets are not available, because TFDS and/or "
            "TensorFlow are not installed.\n"
            "You can install them with: pip install datumaro[tf,tfds]")

    tfds_ds_name = args.dataset_id[len('tfds:'):]
    tfds_ds_metadata = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
    if not tfds_ds_metadata:
        raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")

    default_converter_name = tfds_ds_metadata.default_converter_name

    # Deferred so the (potentially slow) download only happens after all
    # argument validation has passed. Plain def instead of a lambda
    # assignment (PEP 8 E731).
    def extractor_factory():
        return make_tfds_extractor(tfds_ds_name)

    output_format = args.output_format or default_converter_name

    try:
        converter = env.converters[output_format]
    except KeyError:
        # The KeyError adds no information for the user; suppress it.
        raise CliException("Converter for format '%s' is not found" %
            output_format) from None
    extra_args = converter.parse_cmdline(args.extra_args)

    dst_dir = args.dst_dir
    if dst_dir:
        # Refuse to clobber a non-empty directory unless explicitly asked.
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('%s-%s' % (
            make_file_name(args.dataset_id),
            make_file_name(output_format),
        ))
    dst_dir = osp.abspath(dst_dir)

    log.info("Downloading the dataset")
    extractor = extractor_factory()

    log.info("Exporting the dataset")
    converter.convert(extractor, dst_dir,
        default_image_ext='.png', **extra_args)

    # Lazy %-style arguments: formatting is skipped if INFO is disabled.
    log.info("Dataset exported to '%s' as '%s'", dst_dir, output_format)
    def test_can_extract_coco(self):
        # Hand-built fake example: TFDS cannot generate fake examples for
        # object detection datasets, so supply one explicitly.
        example = {
            'image': encode_image(np.ones((20, 10)), '.png'),
            'image/filename': 'test.png',
            'image/id': 123,
            'objects': {
                'bbox': [[0.1, 0.2, 0.3, 0.4]],
                'label': [5],
                'is_crowd': [True],
            }
        }

        with mock_tfds_data(example=example):
            info = tfds.builder('coco/2014').info

            expected = Dataset.from_iterable(
                [
                    DatasetItem(
                        id='test',
                        subset='train',
                        image=np.ones((20, 10)),
                        annotations=[
                            # Relative bbox [0.1, 0.2, 0.3, 0.4] on a
                            # 20x10 image maps to x=2, y=2, w=2, h=4.
                            Bbox(2, 2, 2, 4,
                                 label=5,
                                 attributes={'is_crowd': True}),
                        ],
                        attributes={'id': 123},
                    ),
                ],
                categories=info.features['objects'].feature['label'].names
            )

            actual = Dataset(make_tfds_extractor('coco/2014'))

            compare_datasets(self, expected, actual, require_images=True)
    def test_data_access(self):
        # Exercise the extractor's length/subset/item access methods on
        # the default mocked data (a single 'train' item with id '0').
        with mock_tfds_data():
            extractor = make_tfds_extractor('mnist')
            self.assertEqual(len(extractor), 1)

            # The sole subset must contain the same data as the whole
            # extractor, whether fetched directly or via subsets().
            compare_datasets(self, Dataset(extractor),
                             Dataset(extractor.get_subset('train')))
            self.assertRaises(KeyError, extractor.get_subset, 'test')

            subsets = extractor.subsets()
            self.assertEqual(len(subsets), 1)
            self.assertIn('train', subsets)
            compare_datasets(self, Dataset(extractor),
                             Dataset(subsets['train']))

            # Item lookups by id, with and without an explicit subset.
            self.assertIsNotNone(extractor.get('0'))
            self.assertIsNotNone(extractor.get('0', subset='train'))
            self.assertIsNone(extractor.get('x'))
            self.assertIsNone(extractor.get('0', subset='test'))
    def test_can_extract_imagenet_v2(self):
        with mock_tfds_data():
            # We can't let TFDS decode the image for us, because:
            # a) imagenet_v2 produces JPEG-encoded images;
            # b) TFDS decodes them via TensorFlow;
            # c) Datumaro decodes them via OpenCV.
            # So for the decoded results to match, we have to decode
            # them via OpenCV as well.
            ds, info = tfds.load(
                'imagenet_v2',
                split='train',
                with_info=True,
                decoders={'image': tfds.decode.SkipDecoding()})
            example = next(iter(ds))

            file_name = example['file_name'].numpy().decode('UTF-8')

            expected = Dataset.from_iterable(
                [
                    DatasetItem(
                        # The item ID is the file name minus its extension.
                        id=osp.splitext(file_name)[0],
                        subset='train',
                        image=Image(
                            data=decode_image(example['image'].numpy()),
                            path=file_name,
                        ),
                        annotations=[Label(example['label'].numpy())],
                    ),
                ],
                categories=info.features['label'].names)

            actual = Dataset(make_tfds_extractor('imagenet_v2'))

            compare_datasets(self, expected, actual, require_images=True)
    def _test_can_extract_cifar(self, name):
        # Shared check for the CIFAR datasets, which are parameterized
        # only by the dataset name.
        with mock_tfds_data():
            ds, info = tfds.load(name, split='train', with_info=True)
            example = next(iter(ds))

            expected = Dataset.from_iterable(
                [
                    DatasetItem(
                        id=example['id'].numpy().decode('UTF-8'),
                        subset='train',
                        # Channel order is reversed to match Datumaro's
                        # decoding (presumably RGB -> BGR; confirm against
                        # the extractor implementation).
                        image=example['image'].numpy()[..., ::-1],
                        annotations=[Label(example['label'].numpy())],
                    ),
                ],
                categories=info.features['label'].names)

            actual = Dataset(make_tfds_extractor(name))

            compare_datasets(self, expected, actual, require_images=True)