def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
  """Create a batched, properly-formatted dataset from the TFDS dataset.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    The 2016 test split, formatted as a validation-style Dataset object.
  """
  del batch_size
  del num_eval_examples
  del shuffle_input_sentences

  dataset = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=rocstories_sentence_embeddings.TEST_2016,
      download=False)
  dataset = utils.build_validation_dataset(dataset)
  return dataset
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from dataset split to Dataset object, and a
    dictionary mapping from each nolabel split to its embedding matrix.
  """
  del shuffle_input_sentences

  splits_to_load = {
      'valid_nolabel': 'train[:2%]',
      'train_nolabel': 'train[2%:4%]',
      'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
      'valid2016': rocstories_sentence_embeddings.VALIDATION_2016
  }
  datasets = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=splits_to_load,
      download=False)

  emb_matrices = {}
  valid_nolabel_ds = utils.build_train_style_dataset(
      datasets['valid_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples,
      is_training=False)
  datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

  train_nolabel_ds = utils.build_train_style_dataset(
      datasets['train_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples,
      is_training=False)
  datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

  # Convert the official evaluation datasets to the validation data format.
  # No embedding matrices are involved here, since each example offers only
  # two candidate next sentences to pick between. num_eval_examples is
  # ignored and the full datasets are used.
  datasets['valid2018'] = utils.build_validation_dataset(datasets['valid2018'])
  datasets['valid2016'] = utils.build_validation_dataset(datasets['valid2016'])

  return datasets, emb_matrices
def load_data(data):
    raw = tfds.load(data)
    train_raw = raw['train']
    test_raw = raw['test']
    validate_raw = raw['validation']

    # Each split becomes a list of dictionaries of data, e.g.
    # [{'email': email, 'subject': subject}, ...].
    train = create_dict(train_raw)
    test = create_dict(test_raw)
    validate = create_dict(validate_raw)
    return train, test, validate
def get_open_shelf_dataset():
    dl_config = tfds.download.DownloadConfig(
        manual_dir='/home/vilon_tao/tensorflow_datasets/downloads/manual',
        download_mode=tfds.GenerateMode.REUSE_DATASET_IF_EXISTS)
    train_ds, test_ds, val_ds = tfds.load(
        name='my_dataset',
        data_dir='/home/taolongming/tensorflow_datasets',
        split=["train", "test", "val"],
        download=False,
        builder_kwargs=dict(dataset_name='open.shelf.classfication'),
        download_and_prepare_kwargs=dict(download_config=dl_config),
    )
    return train_ds, test_ds, val_ds
def main(_):
    builder_kwargs = {"validation_split": flags.validation_split}

    tfdataset_path = local_settings.TF_DATASET_PATH
    if flags.tfds_path is not None:
        tfdataset_path = flags.tfds_path

    train, dsinfo = tfds.load(
        "pacs",
        data_dir=tfdataset_path,
        split=tfds.Split.VALIDATION,
        builder_kwargs=builder_kwargs,
        with_info=True)

    for example in dataset_utils.as_numpy(train):
        # Drop into the debugger to inspect each example before printing.
        import pdb; pdb.set_trace()
        print(example["attributes"]["label"])
def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    The validation dataset, the story identifiers for each story in the
    embedding matrix, and the embedding matrix.
  """
  del num_eval_examples
  del shuffle_input_sentences

  splits_to_load = [
      tfds.Split.TRAIN,
      rocstories_sentence_embeddings.VALIDATION_2018,
  ]
  tfds_train, tfds_valid = tfds.load(
      dataset_name, data_dir=FLAGS.data_dir, split=splits_to_load)

  _, train_embs, train_story_ids = utils.build_train_style_dataset(
      tfds_train,
      batch_size,
      shuffle_input_sentences=False,
      return_ids=True,
      is_training=False)

  out = build_all_distractor_valid_dataset(tfds_valid, batch_size=batch_size)
  valid_dataset, valid_embs, valid_story_ids = out

  all_story_ids = valid_story_ids + train_story_ids
  all_emb_matrix = tf.concat([valid_embs, train_embs], axis=0)
  return valid_dataset, all_story_ids, all_emb_matrix
  month = {June},
  year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data collected
by Waymo self-driving cars in a wide variety of conditions. This data is
licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and afterwards, please
register via https://waymo.com/open/licensing/
"""

_GCS_DESCRIPTION = """
This dataset is also available in pre-processed format, making it faster to
load, if you select the correct data_dir:

```
tfds.load('waymo_open_dataset/{}', \
data_dir='gs://waymo_open_dataset_{}_individual_files/tensorflow_datasets')
```
"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"

_OBJECT_LABELS = [
    "TYPE_UNKNOWN",
    "TYPE_VEHICLE",
    "TYPE_PEDESTRIAN",
    "TYPE_SIGN",
    "TYPE_CYCLIST",
]
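# A concrete, hypothetical instantiation of the `tfds.load` template in
# `_GCS_DESCRIPTION` above, using the v_1_0_0 bucket named elsewhere in this
# repository. Access requires prior registration via
# https://waymo.com/open/licensing/. Not part of the original module.
def _example_gcs_load():
  import tensorflow_datasets as tfds
  return tfds.load(
      "waymo_open_dataset",
      data_dir="gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets")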
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: If True, the order of the input sentences is
      randomized.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from dataset split to Dataset object, and a
    dictionary mapping from each training-style split to its embedding
    matrix.
  """
  splits_to_load = {
      'valid_nolabel': 'train[:2%]',
      'train': 'train[2%:]',
      'train_nolabel': 'train[2%:4%]',
      'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
      'valid2016': rocstories_sentence_embeddings.VALIDATION_2016
  }
  datasets = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=splits_to_load,
      download=False)

  emb_matrices = {}

  # Convert datasets to the expected training data format and build the
  # embedding matrices.
  train_ds = utils.build_train_style_dataset(
      datasets['train'], batch_size, shuffle_input_sentences)
  datasets['train'], emb_matrices['train'] = train_ds

  valid_nolabel_ds = utils.build_train_style_dataset(
      datasets['valid_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples)
  datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

  train_nolabel_ds = utils.build_train_style_dataset(
      datasets['train_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples)
  datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

  # Convert the official evaluation datasets to the validation data format.
  # No embedding matrices are involved here, since each example offers only
  # two candidate next sentences to pick between.
  datasets['valid2018'] = utils.build_validation_dataset(
      datasets['valid2018']).take(num_eval_examples)
  datasets['valid2016'] = utils.build_validation_dataset(
      datasets['valid2016']).take(num_eval_examples)

  logging.info('EMBEDDING MATRICES CREATED:')
  for key in emb_matrices:
    logging.info('%s: %s', key, emb_matrices[key].shape)

  return datasets, emb_matrices
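# A minimal usage sketch for `prepare_datasets` above (not part of the
# original module): the dataset name is hypothetical, and FLAGS.data_dir and
# the gin bindings are assumed to be configured before this runs.
def _example_usage():
  datasets, emb_matrices = prepare_datasets(
      dataset_name='rocstories_sentence_embeddings',  # hypothetical name
      shuffle_input_sentences=True,
      num_eval_examples=2000,
      batch_size=32)
  # Each value in `datasets` is a tf.data.Dataset; inspect one training
  # batch alongside the shape of its embedding matrix.
  for batch in datasets['train'].take(1):
    print(batch)
  print(emb_matrices['train'].shape)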
import io
import os

from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow_datasets.proto import waymo_dataset_pb2 as open_dataset
import tensorflow_datasets.public_api as tfds

_CITATION = """
@misc{waymo_open_dataset,
  title = {Waymo Open Dataset: An autonomous driving dataset},
  website = {url{https://www.waymo.com/open}},
  year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data collected
by Waymo self-driving cars in a wide variety of conditions. This data is
licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and afterwards, please
register via https://waymo.com/open/licensing/

This dataset is also available in pre-processed format, making it faster to
load, if you select the correct data_dir:

tfds.load('waymo_open_dataset', \
data_dir='gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets')
"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"
def set_data(type):
    if type == 'shakespeare':
        g_vars.df = tfds.load(name='tiny_shakespeare')['train']
    else:
        g_vars.df = tfds.load(name='imdb_reviews', split='train[:5%]')
    set_vocab()
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tqdm import tqdm

# df = tfds.load(name='tiny_shakespeare')['train']  # alternative dataset
df = tfds.load(name="imdb_reviews", split='train[:5%]')
df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))

# Build the character vocabulary by scanning every review.
iter_df = iter(df)
vocabulary = set([])
for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)
vocabulary = sorted(vocabulary)

char2idx = {u: i for i, u in enumerate(vocabulary)}
idx2char = np.array(vocabulary)

model = tf.keras.models.load_model('models/imdb')


def generate_text(model, start_string, generation_length=2000):
    # Evaluation step (generating text using the learned RNN model)

    input_eval = [char2idx[bytes(i, encoding='utf8')] for i in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty list to store our results
    text_generated = []
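    # The original snippet is cut off here. What follows is a minimal sketch
    # of the usual autoregressive sampling loop, assuming `model` is a
    # stateful character-level RNN whose output is a (batch, length,
    # vocab_size) tensor of logits; it is not part of the original code.
    model.reset_states()
    for _ in tqdm(range(generation_length)):
        predictions = tf.squeeze(model(input_eval), 0)
        # Sample the next character id from the predicted distribution.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        # Feed the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    # idx2char holds byte strings (the vocabulary was built from bytes), so
    # join as bytes before decoding.
    return start_string + b''.join(text_generated).decode('utf-8')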
def _generate_examples(self, data_path: str):
    """Generate examples as dicts.

    Args:
        data_path: `str` path of the .mat file to process.

    Yields:
        Generator yielding the next samples
    """
    # The .mat file stores every image in a single 'data' array.
    with tf.io.gfile.GFile(data_path, "rb") as f:  # type: ignore
        data = tfds.core.lazy_imports.scipy.io.loadmat(f)['data']

    # data dimensions are [256, 1100, 10], i.e. [16x16, n_examples, n_classes]
    for i, (example_num, label) in enumerate(
            itertools.product(range(data.shape[1]), range(data.shape[2]))):
        image = np.swapaxes(
            data[:, example_num, label].reshape(SHAPE), 0, 1,
        )
        record = {
            "image": image,
            # Class index i corresponds to digit i + 1; index 9 wraps to 0.
            "label": (label + 1) % 10,
        }
        yield i, record


if __name__ == "__main__":
    ds, info = tfds.load("usps", split="train", with_info=True)
    print(info)
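# A short, hypothetical inspection loop (not in the original module): take a
# few prepared examples and print their shapes and labels. Assumes the
# "usps" builder above has already been downloaded and prepared.
def _inspect_usps(n: int = 3):
    import tensorflow_datasets as tfds
    ds = tfds.load("usps", split="train")
    for example in ds.take(n):
        print(example["image"].shape, int(example["label"]))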
import pandas as pd
import numpy as np
import os
import unicodedata
import time
import functools
import tensorflow as tf
import tensorflow_datasets as tfds
from six.moves import urllib
from IPython import display as ipythondisplay
from tqdm import tqdm

# download and import the MIT 6.S191 package
import mitdeeplearning as mdl

print(tf.config.list_physical_devices())
assert len(tf.config.list_physical_devices('GPU')) > 0

# load your dataset
df = tfds.load(name="imdb_reviews", split='train[:5%]')
# df = tfds.load(name='tiny_shakespeare')['train']
df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))

# Build the character vocabulary by scanning every review.
iter_df = iter(df)
vocabulary = set([])
for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)
vocabulary = sorted(vocabulary)
# vocabulary = sorted(set(next(iter(tfds.as_numpy(df)))))

shakespeare = df.map(lambda x: {'cur_char': x[:-1], 'next_char': x[1:]})
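# A hypothetical next step (not in the original script): map the character
# datasets to integer ids with a tf.lookup table built from `vocabulary`,
# so the conversion runs inside the tf.data pipeline instead of in Python.
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.constant(vocabulary),
        tf.range(len(vocabulary), dtype=tf.int64)),
    default_value=-1)
vectorized = shakespeare.map(
    lambda d: {'cur_char': table.lookup(d['cur_char']),
               'next_char': table.lookup(d['next_char'])})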
]


def _generate_examples(self, images_dir_path: str, labels_path: str):
    """Generate examples as dicts.

    Args:
        images_dir_path: `str` path of the directory containing the images.
        labels_path: `str` path of the labels file to process.

    Yields:
        Generator yielding the next samples
    """
    # The labels file consists of lines of image-name and label pairs,
    # e.g. "00000001.png 2".
    with tf.io.gfile.GFile(labels_path, "rb") as f:  # type: ignore
        lines = list(map(lambda l: str(l, "utf-8").split(), f.readlines()))

    for i, (image_name, label) in enumerate(lines):
        image_path = os.path.join(images_dir_path, image_name)
        image = np.array(Image.open(image_path))
        record = {
            "image": image,
            "label": label,
        }
        yield i, record


if __name__ == "__main__":
    # tf.compat.v1.enable_eager_execution()
    mnist_m_ds, mnist_m_info = tfds.load("mnist_m", split="train", with_info=True)
    print(mnist_m_info)