def make_module_to_builder_dict(datasets=None):
    """Get all builders organized by module in nested dicts."""
    # pylint: disable=g-long-lambda
    # dict to hold tfds->image->mnist->[builders]
    module_to_builder = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(list)))
    # pylint: enable=g-long-lambda

    if not datasets:
        datasets = [
            name for name in tfds.list_builders()
            if name not in BUILDER_BLACKLIST
        ]

    print("Creating the vanilla builders for %s datasets..." % len(datasets))
    with futures.ThreadPoolExecutor(max_workers=WORKER_COUNT_DATASETS) as tpool:
        builders = tpool.map(tfds.builder, datasets)
    print("Vanilla builders built, constructing module_to_builder dict...")

    for builder in builders:
        module_name = builder.__class__.__module__
        modules = module_name.split(".")
        if "testing" in modules:
            continue
        current_mod_ctr = module_to_builder
        for mod in modules:
            current_mod_ctr = current_mod_ctr[mod]
        current_mod_ctr.append(builder)

    module_to_builder = module_to_builder["tensorflow_datasets"]
    return module_to_builder
def refactor_datasets() -> None:
    """Refactor all datasets into one folder."""
    # Guard on the flag value itself: `''.split(',')` returns `['']`, which is
    # truthy, so `or` alone would never fall back to all builders.
    ds_names = (
        FLAGS.datasets.split(',') if FLAGS.datasets
        else tfds.list_builders(with_community_datasets=False))
    for ds_name in ds_names:
        refactor_dataset(ds_name)
def __init__(self,
             name,
             data_dir,
             image_size,
             download=False,
             num_max_boxes=None,
             *args,
             **kwargs):
    super().__init__(
        *args,
        **kwargs,
    )

    if name in tfds.list_builders():
        self._builder = tfds.builder(name, data_dir=data_dir)
        if download:
            self._builder.download_and_prepare()
    else:
        if not tf.io.gfile.exists(os.path.join(data_dir, name)):
            raise ValueError(
                "Dataset directory does not exist: {}\n"
                "Please run `python blueoil/cmd/build_tfds.py -c <config file>` before training."
                .format(os.path.join(data_dir, name)))
        self._builder = self.builder_class(name, data_dir=data_dir)

    self.info = self._builder.info
    self._init_available_splits()
    self._validate_feature_structure()

    self.tf_dataset = self._builder.as_dataset(
        split=self.available_splits[self.subset])
    self._image_size = image_size
    self._num_max_boxes = num_max_boxes
    self._format_dataset()
def main(_):
    # Legacy datasets
    urls = set(tfds.core.download.checksums.get_all_url_infos().keys())
    # Dataset-as-folder datasets
    # Could keep track of the dataset name, so the report clearly indicates
    # which dataset should be updated.
    url_infos = {
        name: tfds.builder_cls(name).url_infos
        for name in tfds.list_builders(with_community_datasets=False)
    }
    for url_info in url_infos.values():
        if url_info:
            urls |= url_info.keys()
    urls = sorted(urls)

    with futures.ThreadPoolExecutor(max_workers=100) as executor:
        all_codes = executor.map(_get_status_code, urls)

    print('\n************ Summary ************\n')
    total_errors = 0
    for url, code in zip(urls, all_codes):
        if code == requests.codes.ok:
            continue
        total_errors += 1
        print(f'{url} - status code: {code}')
    print(f'{total_errors} URLs had issues')
def build(self, split=None):
    if self.glob_path in tfds.list_builders():
        return tfds.load(name=self.glob_path,
                         split=split,
                         with_info=True,
                         as_supervised=True,
                         try_gcs=tfds.is_dataset_on_gcs(self.glob_path))

    files = tf.io.gfile.glob(self.glob_path)
    if len(files) == 0:
        raise ValueError('No file found')

    try:
        num = reduce(
            lambda x, y: x + y,
            map(lambda file: self._get_num_from_name(file), files))
    except Exception:
        raise ValueError(
            'Please format file name like <name>_<number>.<extension>')
    else:
        tfrecords = list(
            filter(lambda file: file.endswith('.tfrecords'), files))
        txts = list(filter(lambda file: file.endswith('.txt'), files))
        if len(tfrecords) > 0:
            tfrecords_dataset = self._dataset_internal(
                tfrecords, tf.data.TFRecordDataset, self.parse_tfrecord)
        if len(txts) > 0:
            txts_dataset = self._dataset_internal(
                txts, tf.data.TextLineDataset, self.parse_text)
        if len(tfrecords) > 0 and len(txts) > 0:
            return tfrecords_dataset.concatenate(txts_dataset), num
        elif len(tfrecords) > 0:
            return tfrecords_dataset, num
        elif len(txts) > 0:
            return txts_dataset, num
def main(_):
    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    datasets_to_build = set(
        FLAGS.datasets and FLAGS.datasets.split(",") or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir)
        for name in datasets_to_build
    }

    for name, builder in builders.items():
        if builder.BUILDER_CONFIGS and "/" not in name:
            # If builder has multiple configs, and no particular config was
            # requested, then compute all.
            for config in builder.BUILDER_CONFIGS:
                builder_for_config = tfds.builder(
                    builder.name, data_dir=FLAGS.data_dir, config=config)
                download_and_prepare(builder_for_config)
        else:
            # If there is a slash in the name, then user requested a specific
            # dataset configuration.
            download_and_prepare(builder)
def make_category_to_builders_dict(
    datasets: Optional[List[str]] = None,
) -> Dict[str, List[tfds.core.DatasetBuilder]]:
    """Returns the `Dict[dataset_type, List[Builder]]`."""
    if not datasets:
        datasets = [
            name for name in tfds.list_builders()
            if name not in BUILDER_BLACKLIST
        ]

    print('Creating the vanilla builders for %s datasets...' % len(datasets))
    with futures.ThreadPoolExecutor(max_workers=WORKER_COUNT_DATASETS) as tpool:
        builders = tpool.map(tfds.builder, datasets)
    print('Vanilla builders built, constructing module_to_builder dict...')

    # Dict[dataset_type, List[Builder]]
    category_to_builders = collections.defaultdict(list)
    for builder in builders:
        module = type(builder).__module__
        if not module.startswith('tensorflow_datasets.'):
            raise AssertionError(f'Unexpected builder {type(builder)}: {module}')
        module_parts = module.split('.')
        if 'testing' in module_parts:
            continue
        _, category, *_ = module_parts  # tfds.<category>.xyz
        category_to_builders[category].append(builder)
    return category_to_builders
def make_module_to_builder_dict():
    """Get all builders organized by module in nested dicts."""
    # pylint: disable=g-long-lambda
    # dict to hold tfds->image->mnist->[builders]
    module_to_builder = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(list)))
    # pylint: enable=g-long-lambda

    builders = [
        tfds.builder(name)
        for name in tfds.list_builders()
        if name not in BUILDER_BLACKLIST
    ] + [
        tfds.builder("image_label_folder", dataset_name="image_label_folder")
    ]

    for builder in builders:
        mod_name = builder.__class__.__module__
        modules = mod_name.split(".")
        current_mod_ctr = module_to_builder
        for mod in modules:
            current_mod_ctr = current_mod_ctr[mod]
        current_mod_ctr.append(builder)

    module_to_builder = module_to_builder["tensorflow_datasets"]
    return module_to_builder
def main(_):
    if FLAGS.module_import:
        import_modules(FLAGS.module_import)

    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    if FLAGS.disable_tqdm:
        logging.info("Disabling tqdm.")
        tfds.disable_progress_bar()

    datasets_to_build = set(
        FLAGS.datasets and FLAGS.datasets.split(",") or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    version = "experimental_latest" if FLAGS.experimental_latest_version else None
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    logging.info('Version: "%s"', version)
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir, version=version)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset")
        builder = builders[list(builders.keys())[0]]
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs")
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        logging.info("Running download_and_prepare for config: %s", config.name)
        builder_for_config = tfds.builder(
            builder.name, data_dir=FLAGS.data_dir, config=config, version=version)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(
                        builder.name,
                        data_dir=FLAGS.data_dir,
                        config=config,
                        version=version)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
def test_list_builder(self):
    test_datasets = {
        tfds.testing.DummyMnist.name,
        tfds.testing.DummyDatasetSharedGenerator.name,
    }
    registered_datasets = set(tfds.list_builders())
    # The test datasets should not be present in the registered datasets.
    self.assertEmpty(test_datasets & registered_datasets)
def test_exclude_datasets():
    # Exclude all datasets except 2
    all_ds = [b for b in tfds.list_builders() if b not in ('mnist', 'cifar10')]
    all_ds_str = ','.join(all_ds)
    dl_and_prepare = _build(f'--exclude_datasets {all_ds_str}')
    assert dl_and_prepare.call_count == 2

    with pytest.raises(ValueError, match='--exclude_datasets can\'t be used'):
        dl_and_prepare = _build('mnist --exclude_datasets cifar10')
def main(): print("Demonstration for using Imagenet2012 dataset with tensorflow datset") # List all the datasets provided in the tensorflow_datasets # print(tfds.list_builders()) # Step 1: get a dataset builder for the required dataset dataset_name = "imagenet2012" if dataset_name in tfds.list_builders(): imagenet_dataset_builder = tfds.builder(dataset_name) print("retrived " + dataset_name + " builder") else: return # get all the information regarding dataset print(imagenet_dataset_builder.info) print("Image shape", imagenet_dataset_builder.info.features['image'].shape) print("class",imagenet_dataset_builder.info.features['label'].num_classes) print("classname",imagenet_dataset_builder.info.features['label'].names) print("NrTrain",imagenet_dataset_builder.info.splits['train'].num_examples) print("Val",imagenet_dataset_builder.info.splits['validation'].num_examples) # Download and prepare the dataset internally # The dataset should be downloaded to ~/tensorflow-datasets/download # but for Imagenet case, we need to manually download the dataset and # specify the manual_dir where the downloaded files are kept. manual_dataset_dir = "/data/datasets" # The download_and_prepare function will assume that two files namely # ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are present in # directory manual_dataset_dir + "/manual/imagenet2012" imagenet_download_config = tfds.download.DownloadConfig( manual_dir = manual_dataset_dir) # Conditionally, download config can be passed as second argument. imagenet_dataset_builder.download_and_prepare( download_dir = manual_dataset_dir) # Once this is complete (that just pre-process without downloading anything) # it will create a director "~/tensorflow_datasets/imagenet2012/2.0.0" # having 1000 train tfrecords and 5 validation tfrecords in addition to some # bookkeeping json and label txt files. # now, we get the tf.data.Dataset structure which tensorflow data-pipeline # understands and process in tf graph. imagenet_train = imagenet_dataset_builder.as_dataset(split=tfds.Split.TRAIN) assert isinstance(imagenet_train, tf.data.Dataset) imagenet_validation = imagenet_dataset_builder.as_dataset( split=tfds.Split.VALIDATION) assert isinstance(imagenet_validation, tf.data.Dataset) # Now we can peek into the sample images present in the dataset with take (imagenet_example,) = imagenet_train.take(1) # returns a dictionary img, label = imagenet_example["image"], imagenet_example["label"] # img and label are constant tensors, with numpy field containing numpy arry print("Image_shape", img.numpy().shape) print("Label_shape", label.numpy().shape) # print out the image file on the disk, and print the corresponding label imsave("image.png", img.numpy()) print("label", label.numpy())
class TFDS():
    """Download and process datasets from tensorflow_datasets."""

    AVAILABLE_DATASETS = tfds.list_builders()

    def __init__(self, name: str, seed=1234, data_dir: Optional[str] = None) -> None:
        (self.train_ds, self.val_ds, self.test_ds), self.metadata = tfds.load(
            name=name,
            split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
            with_info=True,
            as_supervised=True,
            data_dir=data_dir)

        self.total_imgs = self.metadata.splits['train'].num_examples
        self.num_trainImgs = int(self.total_imgs * 0.8)
        self.num_valImgs = int(self.total_imgs * 0.1)
        self.num_testImgs = int(self.total_imgs * 0.1)
        self.img_shape = next(iter(self.train_ds))[0].shape
        self.get_label_name = self.metadata.features['label'].int2str

        self.seed = seed
        self.Preprocess = Preprocess(seed=self.seed)

    def get_dataset(self,
                    num_epochs: int = 300,
                    batch_size: int = 32,
                    input_shape: Iterable[int] = (32, 32, 3),
                    seed=None):
        """Generate train, val, and test sets from tfds.

        Args:
            num_epochs (int, optional): number of epochs to run the experiment. Defaults to 300.
            batch_size (int, optional): number of images in each batch. Defaults to 32.
            input_shape (tuple, optional): shape of each image (h, w, c). Defaults to (32, 32, 3).
            seed (optional): random seed. Defaults to None.

        Returns:
            tuple: (train, val, test) datasets.
        """
        train_prepare_data_fn = functools.partial(
            self.Preprocess.preprocess, input_shape=input_shape)
        test_prepare_data_fn = functools.partial(
            self.Preprocess.preprocess, augment=False, input_shape=input_shape)

        train_ds = (self.train_ds
                    .repeat(num_epochs)
                    .shuffle(10000, seed=seed)
                    .map(train_prepare_data_fn,
                         num_parallel_calls=tf.data.experimental.AUTOTUNE)
                    .batch(batch_size)
                    .prefetch(tf.data.experimental.AUTOTUNE))

        val_ds = (self.val_ds
                  .repeat(num_epochs)
                  .map(test_prepare_data_fn,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)
                  .batch(batch_size)
                  .prefetch(tf.data.experimental.AUTOTUNE))

        test_ds = self.test_ds.map(
            test_prepare_data_fn,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        return (train_ds, val_ds, test_ds)
def test_exclude_datasets():
    # Exclude all datasets except 2
    all_ds = [b for b in tfds.list_builders() if b not in ('mnist', 'cifar10')]
    all_ds_str = ','.join(all_ds)
    assert _build(f'--exclude_datasets {all_ds_str}') == [
        'cifar10',
        'mnist',
    ]

    with pytest.raises(ValueError, match='--exclude_datasets can\'t be used'):
        _build('mnist --exclude_datasets cifar10')
def get_tfds(dtype: str,
             data_dir: str = None,
             x_name="image",
             y_name="label",
             is_verbose=True,
             **kwargs):
    name = dtype_to_name(dtype)
    assert name in tfds.list_builders()

    data_dir = data_dir or os.path.join(
        "~", "tfds", "{}_data".format(name.upper()))  # e.g. ~/tfds/MNIST_data

    # https://www.tensorflow.org/datasets/datasets
    loaded, info = tfds.load(
        name=name,
        split=["train", "test"],
        data_dir=data_dir,
        batch_size=-1,
        with_info=True,
    )
    if is_verbose:
        print(info)

    # Get numpy matrix
    train_and_validation, test = tfds.as_numpy(loaded)

    # Preprocess & Reshape
    train_and_validation_x, test_x = preprocess_xs(
        name,
        [train_and_validation[x_name], test[x_name]],
        **kwargs,
    )

    # Training/validation separation.
    # This is necessary because tfds does not support validation separation.
    train_num, val_num = name_to_train_and_val_num(name)
    train_x = train_and_validation_x[:train_num]
    val_x = train_and_validation_x[-val_num:]

    # One-hot labeling
    to_one_hot = get_to_one_hot(info.features[y_name].num_classes)
    data_label = DataLabel(
        train_labels=to_one_hot(train_and_validation[y_name][:train_num]),
        validation_labels=to_one_hot(train_and_validation[y_name][-val_num:]),
        test_labels=to_one_hot(test[y_name]),
        label_type=LabelType.ONE_LABELS_TO_ALL_TASK,
    )
    return data_label, train_x, val_x, test_x
def _build_datasets(args: argparse.Namespace) -> None:
    """Build the given datasets."""
    # Select datasets to generate
    datasets = (args.datasets or []) + (args.datasets_keyword or [])
    if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
        if datasets:
            raise ValueError('--exclude_datasets can\'t be used with `datasets`')
        datasets = set(tfds.list_builders()) - set(args.exclude_datasets.split(','))
    else:
        datasets = datasets or ['']  # Empty string for default

    # Generate all datasets sequentially
    for ds_to_build in datasets:
        # Each `str` may correspond to multiple builders (e.g. multiple configs)
        for builder in _make_builders(args, ds_to_build):
            _download_and_prepare(args, builder)
def main(_):
    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    datasets_to_build = set(
        FLAGS.datasets and FLAGS.datasets.split(",") or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset")
        # Unpack the single builder; iterating the dict directly would yield the
        # dataset name (the key), not the builder.
        builder, = builders.values()
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs")
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        builder_for_config = tfds.builder(
            builder.name, data_dir=FLAGS.data_dir, config=config)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(
                        builder.name, data_dir=FLAGS.data_dir, config=config)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
def _collect_path_to_url_infos(
) -> Dict[tfds.core.ReadWritePath, Dict[Url, checksums.UrlInfo]]:
    """Collect checksums paths to url_infos."""
    # Collect legacy checksums paths
    url_info_paths = list(checksums._checksum_paths().values())  # pylint: disable=protected-access
    # Collect dataset-as-folder checksums path
    for name in tfds.list_builders():
        url_info_path = tfds.builder_cls(name)._checksums_path  # pylint: disable=protected-access
        if url_info_path.exists():
            url_info_paths.append(url_info_path)

    url_info_paths = [tfds.core.utils.to_write_path(p) for p in url_info_paths]
    return {
        path: typing.cast(Dict[Url, checksums.UrlInfo],
                          checksums.load_url_infos(path))
        for path in url_info_paths
    }
def test_glue_load(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/qa/dureader_yesno.yml")
    hparams.stand_by()

    checksum_dir = "../aispace/datasets/url_checksums"
    tfds.download.add_checksums_dir(checksum_dir)
    download_config = DownloadConfig(register_checksums=True)
    print(tfds.list_builders())
    dureader = tfds.load(
        "dureader/yesno",
        # data_dir="/search/data1/yyk/data/datasets/glue_zh",
        data_dir="../data/dureader",
        builder_kwargs={'hparams': hparams},
        download_and_prepare_kwargs={'download_config': download_config})

    for itm in dureader['train']:
        print(itm)
        break
    print()

    # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
    # test_dataset = next(load_dataset(hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True))[0]
    # total, zero = 0, 0
    # for itm in tqdm(test_dataset):
    #     tt = itm[0]['input_ids'].numpy().tolist()
    #     print(itm[0]['p_mask'].numpy().tolist())
    #     print(itm[0]['start_position'].numpy().tolist())
    #     print(itm[0]['end_position'].numpy().tolist())
    #     print(tt)
    #     break
    #     total += 1
    #     zero += len([t for t in tt if t == 0])
    # print()
    # print(f"{zero}, {total}, {zero / float(total)}")
    # print(total)

# python -u aispace/trainer.py \
#     --experiment_name test \
#     --model_name bert_for_classification \
#     --schedule train_and_eval \
#     --config_name tnews \
#     --config_dir ./configs/glue_zh \
#     --gpus 0 1 2 3
def _build_datasets(args: argparse.Namespace) -> None:
    """Build the given datasets."""
    # Optionally register additional dataset imports
    if args.imports:
        list(importlib.import_module(m) for m in args.imports.split(','))

    # Select datasets to generate
    datasets = (args.datasets or []) + (args.datasets_keyword or [])
    if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
        if datasets:
            raise ValueError('--exclude_datasets can\'t be used with `datasets`')
        datasets = (set(tfds.list_builders(with_community_datasets=False))
                    - set(args.exclude_datasets.split(',')))
        datasets = sorted(datasets)  # `set` is not deterministic
    else:
        datasets = datasets or ['']  # Empty string for default

    # Generate all datasets sequentially
    for ds_to_build in datasets:
        # Each `str` may correspond to multiple builders (e.g. multiple configs)
        for builder in _make_builders(args, ds_to_build):
            _download_and_prepare(args, builder)
def main():
    pp = pprint.PrettyPrinter(indent=4)

    print("### looking up available datasets ###")
    builders: List[str] = tfds.list_builders()
    pp.pprint(builders)

    print("\n### the first 5 elements of train_ex ###")
    pipeline = SetupInputPipeline()
    pipeline.init_ted_hrlr()
    train_ex = pipeline.train_ex
    train_ex_numpy = ((pt.numpy(), en.numpy())
                      for (pt, en) in train_ex)  # generator comprehension
    pp.pprint(list(itertools.islice(train_ex_numpy, 5)))

    print("\n### testing the english tokenizer ###")
    pipeline.init_subwords_tokenizer()
    tokenizer_en = pipeline.tokenizer_en
    sample_string = "Transformer is awesome"
    tokenized_string = tokenizer_en.encode(s=sample_string)  # this won't terminate?
    print("Sample string: {}\nEncoded string: {}".format(
        sample_string, tokenized_string))

    print("\n### if a word does not exist in vocab, it is tokenized into subwords ###")
    for ts in tokenized_string:
        print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

    print("\n### the first 5 elements of the preprocessed training set ###")
    preproc_train_ex = pipeline.preproc_train_ex()
    preproc_train_ex_numpy = ((pt.numpy(), en.numpy())
                              for (pt, en) in preproc_train_ex)
    # have a look at the first batch (64 instances)
    pp.pprint(next(iter(preproc_train_ex_numpy)))
import collections
import os
import sys

from absl import app
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow_datasets.core.utils import py_utils

BASE_URL = "https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets"

# ImageLabelFolder requires an extra constructor arg, so it is handled separately
# TODO(tfds): Document the manual_dir datasets in a separate section
BUILDER_BLACKLIST = ["image_label_folder"]

DOC = """\
<!-- auto-generated by tfds.scripts.document_datasets -->

# Datasets

## Usage

```
# See all registered datasets
tfds.list_builders()

# Load a given dataset by name, along with the DatasetInfo
data, info = tfds.load("mnist", with_info=True)
train_data, test_data = data['train'], data['test']
assert isinstance(train_data, tf.data.Dataset)
assert info.features['label'].num_classes == 10
assert info.splits['train'].num_examples == 60000
def main(_):
    if FLAGS.module_import:
        import_modules(FLAGS.module_import)

    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    if FLAGS.disable_tqdm:
        logging.info("Disabling tqdm.")
        tfds.disable_progress_bar()

    if FLAGS.checksums_dir:
        tfds.download.add_checksums_dir(FLAGS.checksums_dir)

    datasets_to_build = set(
        FLAGS.datasets and FLAGS.datasets.split(",") or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))

    # Only pass the version kwarg when required. Otherwise, `version=None`
    # overwrites the version parsed from the name, e.g.
    # `tfds.builder('my_dataset:1.2.0', version=None)`.
    if FLAGS.experimental_latest_version:
        version_kwarg = {"version": "experimental_latest"}
    else:
        version_kwarg = {}

    logging.info("Running download_and_prepare for dataset(s):\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir, **version_kwarg)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset")
        builder = builders[list(builders.keys())[0]]
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs")
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        logging.info("Running download_and_prepare for config: %s", config.name)
        builder_for_config = tfds.builder(
            builder.name,
            data_dir=FLAGS.data_dir,
            config=config,
            **version_kwarg)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(
                        builder.name,
                        data_dir=FLAGS.data_dir,
                        config=config,
                        **version_kwarg)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
from __future__ import division
from __future__ import print_function

import os
import pdb
import time

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
import termcolor

FLAGS = flags.FLAGS
BUILDERS = ",".join(tfds.list_builders())
DEFAULT_DATA_DIR = os.path.expanduser(os.path.join("~", "tensorflow_datasets"))

flags.DEFINE_string(
    "datasets", BUILDERS,
    "Comma separated list of datasets to build, defaults to all "
    "registered builders.")
flags.DEFINE_string(
    "exclude_datasets", "",
    "Comma separated list of datasets to exclude "
    "(no download, no prepare).")
flags.DEFINE_string("data_dir", DEFAULT_DATA_DIR, "Where to place the data.")
flags.DEFINE_string("download_dir", None, "Where to place downloads.")
flags.DEFINE_string("extract_dir", None, "Where to extract files.")
flags.DEFINE_string(
def _all_tfds_datasets() -> List[str]:
    """Returns all "official" TFDS dataset names."""
    return sorted([
        name for name in tfds.list_builders(with_community_datasets=True)  # pylint: disable=g-complex-comprehension
        if name not in _BUILDER_BLACKLIST
    ])
# %%
import tensorflow as tf
import tensorflow_datasets as tfds
import IPython.display as display

# Here we assume Eager mode is enabled (TF2), but tfds also works in Graph mode.
print(tf.__version__)

# %%
# See available datasets
print(tfds.list_builders())

# %%
# Construct a tf.data.Dataset
# ds_train = tfds.load(name="mnist", split="train", shuffle_files=True)
# Build your input pipeline

# %%
ds_train = tfds.load(name="coco/2017", split="train", shuffle_files=True)

# dataset = (
#     ds_train
#     .shuffle(1000)
#     .batch(128)
#     .prefetch(10)
# )
# coding=utf-8
# created by msg on 2019/12/4 2:53 PM

import tensorflow as tf
import tensorflow_datasets as tfds

for data in tfds.list_builders():
    print(data)
    try:
        t = tfds.load(data)
    except Exception:
        continue

path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
#!/usr/bin/env python
# coding: utf-8

import tensorflow as tf
import tensorflow_datasets as tfds

# For brevity I decided to reuse this small image dataset that was available with tfds.
# For an example of a more extensive raw data preprocessing I point to my recent project
# here: https://github.com/dsalaj/common-voice-tf
IMGDS = 'aflw2k3d'
assert IMGDS in tfds.list_builders(), \
    IMGDS + ' dataset not found in tfds! This was tested with tensorflow-datasets-2.1.0'

ds_builder = tfds.builder(IMGDS)
ds_builder.download_and_prepare()
ds_raw = ds_builder.as_dataset(split='train')

# import matplotlib.pyplot as plt
# # get_ipython().run_line_magic('matplotlib', 'inline')

_, ds_info = tfds.load(IMGDS, with_info=True)
num_examples = ds_info.splits['train'].num_examples

# # Plot samples from the dataset
# fig = tfds.show_examples(ds_info, ds_raw)


def extract_images(features):
    return features['image']  # (450, 450, 3)


ds = ds_raw.map(extract_images).shuffle(num_examples)
def refactor_datasets() -> None:
    """Refactor all datasets into one folder."""
    for ds_name in tfds.list_builders():
        refactor_dataset(ds_name)
def in_tfds(dataset_name: str):
    all_datasets = tfds.list_builders()
    name_without_params = dataset_name.split("/")[0]
    return name_without_params in all_datasets