def test_can_extract_mnist(self):
    """Check that the TFDS extractor reproduces a mocked MNIST example."""
    with mock_tfds_data():
        tfds_ds, tfds_info = tfds.load('mnist', split='train', with_info=True)
        example = next(iter(tfds_ds))

        # MNIST images come out of TFDS as HxWx1; Datumaro stores them
        # as plain 2-D arrays, hence the squeeze on the channel axis.
        expected_item = DatasetItem(
            id='0',
            subset='train',
            image=example['image'].numpy().squeeze(axis=2),
            annotations=[Label(example['label'].numpy())],
        )
        expected_dataset = Dataset.from_iterable(
            [expected_item],
            categories=tfds_info.features['label'].names)

        actual_dataset = Dataset(make_tfds_extractor('mnist'))
        compare_datasets(self, expected_dataset, actual_dataset,
            require_images=True)
def test_can_extract_voc(self):
    """Check extraction of a hand-built PASCAL VOC detection example."""
    # TFDS is unable to generate fake examples for object detection
    # datasets. See <https://github.com/tensorflow/datasets/issues/3633>.
    tfds_example = {
        'image/filename': 'test.png',
        'image': encode_image(np.ones((20, 10)), '.png'),
        'objects': {
            'bbox': [[0.1, 0.2, 0.3, 0.4]],
            'label': [5],
            'is_difficult': [True],
            'is_truncated': [False],
            'pose': [0],
        }
    }

    with mock_tfds_data(example=tfds_example):
        tfds_info = tfds.builder('voc/2012').info
        pose_names = tfds_info.features['objects'].feature['pose'].names

        # The normalized [ymin, xmin, ymax, xmax] box above, applied to
        # a 20x10 image, yields x=2, y=2, w=2, h=4 in pixel coordinates.
        expected_bbox = Bbox(2, 2, 2, 4, label=5, attributes={
            'difficult': True,
            'truncated': False,
            'pose': pose_names[0].title(),
        })
        expected_dataset = Dataset.from_iterable(
            [
                DatasetItem(
                    id='test',
                    subset='train',
                    image=np.ones((20, 10)),
                    annotations=[expected_bbox],
                ),
            ],
            categories=tfds_info.features['objects'].feature['label'].names)

        actual_dataset = Dataset(make_tfds_extractor('voc/2012'))
        compare_datasets(self, expected_dataset, actual_dataset,
            require_images=True)
def download_command(args):
    """Download the dataset named by ``args.dataset_id`` and export it.

    Only TFDS datasets (IDs prefixed with ``tfds:``) are supported.
    The dataset is exported with ``args.output_format`` (or the
    dataset's default converter when the format is not specified)
    into ``args.dst_dir`` (or a freshly generated directory).

    Raises:
        CliException: if the dataset ID is unknown or unsupported, if
            TFDS/TensorFlow are not installed, if the output format has
            no converter, or if the destination directory is non-empty
            and ``--overwrite`` was not passed.
    """
    env = Environment()

    if args.dataset_id.startswith('tfds:'):
        if TFDS_EXTRACTOR_AVAILABLE:
            tfds_ds_name = args.dataset_id[5:]  # strip the 'tfds:' prefix
            tfds_ds_metadata = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)

            if tfds_ds_metadata:
                default_converter_name = \
                    tfds_ds_metadata.default_converter_name

                # Defer the (potentially slow) download until after all
                # argument validation has succeeded.
                def extractor_factory():
                    return make_tfds_extractor(tfds_ds_name)
            else:
                raise CliException(
                    f"Unsupported TFDS dataset '{tfds_ds_name}'")
        else:
            raise CliException(
                "TFDS datasets are not available, because TFDS and/or "
                "TensorFlow are not installed.\n"
                "You can install them with: pip install datumaro[tf,tfds]")
    else:
        raise CliException(f"Unknown dataset ID '{args.dataset_id}'")

    output_format = args.output_format or default_converter_name

    try:
        converter = env.converters[output_format]
    except KeyError:
        # The KeyError itself carries no useful context for the user.
        raise CliException(
            f"Converter for format '{output_format}' is not found") from None

    extra_args = converter.parse_cmdline(args.extra_args)

    dst_dir = args.dst_dir
    if dst_dir:
        # Refuse to clobber a non-empty directory unless explicitly asked.
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException(f"Directory '{dst_dir}' already exists "
                "(pass --overwrite to overwrite)")
    else:
        dst_dir = generate_next_file_name('%s-%s' % (
            make_file_name(args.dataset_id),
            make_file_name(output_format),
        ))
    dst_dir = osp.abspath(dst_dir)

    log.info("Downloading the dataset")
    extractor = extractor_factory()

    log.info("Exporting the dataset")
    converter.convert(extractor, dst_dir,
        default_image_ext='.png', **extra_args)

    # Lazy %-style args avoid formatting when the level is disabled.
    log.info("Dataset exported to '%s' as '%s'", dst_dir, output_format)
def test_can_extract_coco(self):
    """Check extraction of a hand-built COCO detection example."""
    tfds_example = {
        'image': encode_image(np.ones((20, 10)), '.png'),
        'image/filename': 'test.png',
        'image/id': 123,
        'objects': {
            'bbox': [[0.1, 0.2, 0.3, 0.4]],
            'label': [5],
            'is_crowd': [True],
        }
    }

    with mock_tfds_data(example=tfds_example):
        tfds_info = tfds.builder('coco/2014').info

        # The normalized [ymin, xmin, ymax, xmax] box above, applied to
        # a 20x10 image, yields x=2, y=2, w=2, h=4 in pixel coordinates.
        expected_item = DatasetItem(
            id='test',
            subset='train',
            image=np.ones((20, 10)),
            annotations=[
                Bbox(2, 2, 2, 4, label=5, attributes={'is_crowd': True}),
            ],
            attributes={'id': 123},
        )
        expected_dataset = Dataset.from_iterable(
            [expected_item],
            categories=tfds_info.features['objects'].feature['label'].names)

        actual_dataset = Dataset(make_tfds_extractor('coco/2014'))
        compare_datasets(self, expected_dataset, actual_dataset,
            require_images=True)
def test_data_access(self):
    """Exercise subset and item lookup on a mocked MNIST extractor."""
    with mock_tfds_data():
        extractor = make_tfds_extractor('mnist')
        self.assertEqual(len(extractor), 1)

        # get_subset: the only subset holds the whole dataset...
        train_subset = extractor.get_subset('train')
        compare_datasets(self, Dataset(extractor), Dataset(train_subset))
        # ...and asking for an unknown one raises.
        with self.assertRaises(KeyError):
            extractor.get_subset('test')

        # subsets: exactly one subset, named 'train'.
        subsets = extractor.subsets()
        self.assertEqual(len(subsets), 1)
        self.assertIn('train', subsets)
        compare_datasets(self, Dataset(extractor), Dataset(subsets['train']))

        # get: found for the known id/subset, None for everything else.
        self.assertIsNotNone(extractor.get('0'))
        self.assertIsNotNone(extractor.get('0', subset='train'))
        self.assertIsNone(extractor.get('x'))
        self.assertIsNone(extractor.get('0', subset='test'))
def test_can_extract_imagenet_v2(self):
    """Check that the TFDS extractor reproduces a mocked ImageNet-V2 example."""
    with mock_tfds_data():
        tfds_ds, tfds_info = tfds.load(
            'imagenet_v2', split='train', with_info=True,
            # We can't let TFDS decode the image for us, because:
            # a) imagenet_v2 produces JPEG-encoded images;
            # b) TFDS decodes them via TensorFlow;
            # c) Datumaro decodes them via OpenCV.
            # So for the decoded results to match, we have to decode
            # them via OpenCV as well.
            decoders={'image': tfds.decode.SkipDecoding()})
        example = next(iter(tfds_ds))
        file_name = example['file_name'].numpy().decode('UTF-8')

        expected_item = DatasetItem(
            id=osp.splitext(file_name)[0],
            subset='train',
            image=Image(
                data=decode_image(example['image'].numpy()),
                path=file_name,
            ),
            annotations=[Label(example['label'].numpy())],
        )
        expected_dataset = Dataset.from_iterable(
            [expected_item],
            categories=tfds_info.features['label'].names)

        actual_dataset = Dataset(make_tfds_extractor('imagenet_v2'))
        compare_datasets(self, expected_dataset, actual_dataset,
            require_images=True)
def _test_can_extract_cifar(self, name):
    """Shared check for CIFAR-style datasets (string id, color image, label)."""
    with mock_tfds_data():
        tfds_ds, tfds_info = tfds.load(name, split='train', with_info=True)
        example = next(iter(tfds_ds))

        # The channel axis is reversed to match the extractor's output —
        # presumably RGB (TFDS) vs. BGR (OpenCV-based Datumaro).
        expected_item = DatasetItem(
            id=example['id'].numpy().decode('UTF-8'),
            subset='train',
            image=example['image'].numpy()[..., ::-1],
            annotations=[Label(example['label'].numpy())],
        )
        expected_dataset = Dataset.from_iterable(
            [expected_item],
            categories=tfds_info.features['label'].names)

        actual_dataset = Dataset(make_tfds_extractor(name))
        compare_datasets(self, expected_dataset, actual_dataset,
            require_images=True)