def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
    """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from the dataset split to a Dataset object.
  """

    del batch_size
    del num_eval_examples
    del shuffle_input_sentences

    dataset = tfds.load(dataset_name,
                        data_dir=FLAGS.data_dir,
                        split=rocstories_sentence_embeddings.TEST_2016,
                        download=False)
    dataset = utils.build_validation_dataset(dataset)
    return dataset
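
# Hedged usage sketch (not part of the original snippet): prepare_dataset takes
# dataset_name=gin.REQUIRED, so the name is normally supplied through a gin
# binding. This assumes the function is decorated with @gin.configurable in the
# original module; the dataset name below is illustrative only.
import gin

gin.parse_config("""
prepare_dataset.dataset_name = 'rocstories_sentence_embeddings'
""")
test_2016_ds = prepare_dataset()  # dataset_name comes from the gin binding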
Example No. 2
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
    """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from the dataset split to a Dataset object.
  """
    del shuffle_input_sentences

    splits_to_load = {
        'valid_nolabel': 'train[:2%]',
        'train_nolabel': 'train[2%:4%]',
        'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
        'valid2016': rocstories_sentence_embeddings.VALIDATION_2016
    }

    datasets = tfds.load(dataset_name,
                         data_dir=FLAGS.data_dir,
                         split=splits_to_load,
                         download=False)

    emb_matrices = {}

    valid_nolabel_ds = utils.build_train_style_dataset(
        datasets['valid_nolabel'],
        batch_size,
        False,
        num_examples=num_eval_examples,
        is_training=False)
    datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

    train_nolabel_ds = utils.build_train_style_dataset(
        datasets['train_nolabel'],
        batch_size,
        False,
        num_examples=num_eval_examples,
        is_training=False)
    datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

    # Convert official evaluation datasets to validation data format. There are no
    # embedding matrices involved here since the task has only two possible next
    # sentences to pick between for each example. Ignore num_eval_examples and use
    # the full datasets for these.
    datasets['valid2018'] = utils.build_validation_dataset(
        datasets['valid2018'])
    datasets['valid2016'] = utils.build_validation_dataset(
        datasets['valid2016'])

    return datasets, emb_matrices
Example No. 3
def load_data(data):
    raw = tfds.load(data)
    train_raw = raw['train']
    test_raw = raw['test']
    validate_raw = raw['validation']

    # list of dictionaries of data [{'email': email, 'subject': subject},{}]
    train = create_dict(train_raw)
    test = create_dict(test_raw)
    validate = create_dict(validate_raw)
    return train, test, validate
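
# Hedged sketch of the create_dict helper used above (not shown in the
# snippet). It assumes an aeslc-style TFDS dataset whose examples expose
# 'email_body' and 'subject_line' features; adjust the field names for other
# datasets.
def create_dict(raw_ds):
    examples = []
    for ex in tfds.as_numpy(raw_ds):
        examples.append({
            'email': ex['email_body'].decode('utf-8'),
            'subject': ex['subject_line'].decode('utf-8'),
        })
    return examples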
Example No. 4
def get_open_shelf_dataset():
    dl_config = tfds.download.DownloadConfig(
        manual_dir='/home/vilon_tao/tensorflow_datasets/downloads/manual',
        download_mode=tfds.GenerateMode.REUSE_DATASET_IF_EXISTS)

    train_ds, test_ds, val_ds = tfds.load(
        name='my_dataset',
        data_dir='/home/taolongming/tensorflow_datasets',
        split=["train", "test", "val"],
        download=False,
        builder_kwargs=dict(dataset_name='open.shelf.classfication'),
        download_and_prepare_kwargs=dict(download_config=dl_config),
    )

    return train_ds, test_ds, val_ds
Example No. 5
def main(_):
    builder_kwargs = {
        "validation_split": flags.validation_split
    }

    tfdataset_path = local_settings.TF_DATASET_PATH
    if flags.tfds_path is not None:
        tfdataset_path = flags.tfds_path

    train, dsinfo = tfds.load("pacs", 
        data_dir=tfdataset_path, split=tfds.Split.VALIDATION,
        builder_kwargs=builder_kwargs, with_info=True)

    for example in dataset_utils.as_numpy(train):
        import pdb; pdb.set_trace()  # breakpoint: inspect each decoded example
        print(example["attributes"]["label"])
Example No. 6
def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
    """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    The validation dataset, the story identifiers for each story in the
      embedding matrix, and the embedding matrix.
  """

    del num_eval_examples
    del shuffle_input_sentences

    splits_to_load = [
        tfds.Split.TRAIN,
        rocstories_sentence_embeddings.VALIDATION_2018,
    ]
    tfds_train, tfds_valid = tfds.load(dataset_name,
                                       data_dir=FLAGS.data_dir,
                                       split=splits_to_load)

    _, train_embs, train_story_ids = utils.build_train_style_dataset(
        tfds_train,
        batch_size,
        shuffle_input_sentences=False,
        return_ids=True,
        is_training=False)
    out = build_all_distractor_valid_dataset(tfds_valid, batch_size=batch_size)
    valid_dataset, valid_embs, valid_story_ids = out

    all_story_ids = valid_story_ids + train_story_ids
    all_emb_matrix = tf.concat([valid_embs, train_embs], axis=0)

    return valid_dataset, all_story_ids, all_emb_matrix
Example No. 7
month = {June},
year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data
collected by Waymo self-driving cars in a wide variety of conditions.
This data is licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and
afterwards, please register via https://waymo.com/open/licensing/
"""

_GCS_DESCRIPTION = """
This dataset is also available in pre-processed format, making it faster
to load, if you select the correct data_dir:

```
tfds.load('waymo_open_dataset/{}', \
data_dir='gs://waymo_open_dataset_{}_individual_files/tensorflow_datasets')
```

"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"
_OBJECT_LABELS = [
    "TYPE_UNKNOWN", "TYPE_VEHICLE", "TYPE_PEDESTRIAN", "TYPE_SIGN",
    "TYPE_CYCLIST"
]
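
# Hedged sketch (not part of the original snippet): how the label list above
# would typically be wired into the builder's feature spec inside _info();
# the surrounding FeaturesDict fields of the real builder are not shown here.
label_feature = tfds.features.ClassLabel(names=_OBJECT_LABELS)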
Example No. 8
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: If True, the order of the input sentences is
      randomized.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from the dataset split to a Dataset object, and a
      dictionary of embedding matrices keyed by split.
  """
  splits_to_load = {
      'valid_nolabel': 'train[:2%]',
      'train': 'train[2%:]',
      'train_nolabel': 'train[2%:4%]',
      'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
      'valid2016': rocstories_sentence_embeddings.VALIDATION_2016}

  datasets = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=splits_to_load,
      download=False)

  emb_matrices = {}
  # Convert datasets to the expected training data format, and build the
  # embedding matrices.
  train_ds = utils.build_train_style_dataset(
      datasets['train'], batch_size, shuffle_input_sentences)
  datasets['train'], emb_matrices['train'] = train_ds

  valid_nolabel_ds = utils.build_train_style_dataset(
      datasets['valid_nolabel'], batch_size, False,
      num_examples=num_eval_examples)
  datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

  train_nolabel_ds = utils.build_train_style_dataset(
      datasets['train_nolabel'], batch_size, False,
      num_examples=num_eval_examples)
  datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

  # Convert official evaluation datasets to validation data format. There are no
  # embedding matrices involved here since the task has only two possible next
  # sentences to pick between for each example.
  datasets['valid2018'] = utils.build_validation_dataset(
      datasets['valid2018']).take(num_eval_examples)
  datasets['valid2016'] = utils.build_validation_dataset(
      datasets['valid2016']).take(num_eval_examples)

  logging.info('EMBEDDING MATRICES CREATED:')
  for key in emb_matrices:
    logging.info('%s: %s', key, emb_matrices[key].shape)

  return datasets, emb_matrices
Example No. 9
import io
import os
from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow_datasets.proto import waymo_dataset_pb2 as open_dataset
import tensorflow_datasets.public_api as tfds

_CITATION = """
@misc{waymo_open_dataset,
  title = {Waymo Open Dataset: An autonomous driving dataset},
  website = {url{https://www.waymo.com/open}},
  year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data
collected by Waymo self-driving cars in a wide variety of conditions.
This data is licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and
afterwards, please register via https://waymo.com/open/licensing/

This dataset is also available in pre-processed format, making it faster
to load, if you select the correct data_dir:
tfds.load('waymo_open_dataset', \
data_dir='gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets')
"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"
Example No. 10
def set_data(type):
    # Load either the tiny_shakespeare training split or 5% of imdb_reviews
    # into the shared g_vars.df, then rebuild the character vocabulary.
    if type == 'shakespeare':
        g_vars.df = tfds.load(name='tiny_shakespeare')['train']
    else:
        g_vars.df = tfds.load(name='imdb_reviews', split='train[:5%]')
    set_vocab()
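
# Hedged sketch of the set_vocab helper called above (not shown in this
# snippet). It mirrors the character-vocabulary loop from the later examples in
# this file and assumes g_vars is a plain module holding shared state (the
# attribute names are guesses) and that tf, tfds and numpy are already imported.
def set_vocab():
    chars_ds = g_vars.df.map(
        lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))
    vocab = set()
    for chars in chars_ds:
        vocab.update(tfds.as_numpy(chars))
    g_vars.vocabulary = sorted(vocab)
    g_vars.char2idx = {u: i for i, u in enumerate(g_vars.vocabulary)}
    g_vars.idx2char = np.array(g_vars.vocabulary)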
Example No. 11
import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tqdm import tqdm
import numpy as np
import os

# NOTE: the tiny_shakespeare load is immediately overridden by the
# imdb_reviews load on the next line.
df = tfds.load(name='tiny_shakespeare')['train']
df = tfds.load(name="imdb_reviews", split='train[:5%]')
df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))
iter_df = iter(df)
vocabulary = set([])

for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)

vocabulary = sorted(vocabulary)
char2idx = {u: i for i, u in enumerate(vocabulary)}
idx2char = np.array(vocabulary)

model = tf.keras.models.load_model('models/imdb')


def generate_text(model, start_string, generation_length=2000):
    # Evaluation step (generating text using the learned RNN model)

    input_eval = [char2idx[(bytes(i, encoding='utf8'))] for i in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []
Example No. 12
    def _generate_examples(self, data_path: str):
        """Generate examples as dicts.
        Args:
        filepath: `str` path of the file to process.
        Yields:
        Generator yielding the next samples
        """

        # the data file is a MATLAB .mat archive; its 'data' array holds the
        # raw 16x16 digit images
        with tf.io.gfile.GFile(data_path, "rb") as f:  # type: ignore
            data = tfds.core.lazy_imports.scipy.io.loadmat(f)['data']

        # data dimensions are [256, 1100, 10], i.e. [16x16, n_examples, n_classes]
        for i, (example_num, label) in enumerate(
                itertools.product(range(data.shape[1]), range(data.shape[2]))):
            image = np.swapaxes(
                data[:, example_num, label].reshape(SHAPE),
                0,
                1,
            )
            record = {
                "image": image,
                # Map class index i to digit (i + 1) % 10; the classes in the
                # .mat file appear to be ordered 1-9 followed by 0.
                "label": (label + 1) % 10,
            }
            yield i, record


if __name__ == "__main__":
    ds, info = tfds.load("usps", split="train", with_info=True)
    print(info)
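    # Hedged follow-up (not in the original snippet): sanity-check the label
    # remapping by counting examples per digit; assumes the 'label' feature
    # yielded by _generate_examples above.
    import collections
    counts = collections.Counter(int(ex["label"]) for ex in tfds.as_numpy(ds))
    print(sorted(counts.items()))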
Example No. 13
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import os
import unicodedata
import time
import functools
from six.moves import urllib
from IPython import display as ipythondisplay
from tqdm import tqdm
# download and import the MIT 6.S191 package
import mitdeeplearning as mdl
print(tf.config.list_physical_devices())
assert len(tf.config.list_physical_devices('GPU')) > 0

# load your dataset
df = tfds.load(name="imdb_reviews", split='train[:5%]')

# df = tfds.load(name='tiny_shakespeare')['train']

df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))
iter_df = iter(df)
vocabulary = set([])

for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)

vocabulary = sorted(vocabulary)
# vocabulary = sorted(set(next(iter(tfds.as_numpy(df)))))

shakespeare = df.map(lambda x: {'cur_char': x[:-1], 'next_char': x[1:]})
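
# Hedged continuation sketch (not part of the original snippet): map the
# character pairs produced above to integer ids so they can feed an embedding
# layer. Uses a StaticHashTable built from the vocabulary computed earlier.
keys = tf.constant(vocabulary)
values = tf.range(len(vocabulary), dtype=tf.int64)
char_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), default_value=0)
encoded = shakespeare.map(
    lambda ex: {'cur_char': char_table.lookup(ex['cur_char']),
                'next_char': char_table.lookup(ex['next_char'])})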
Example No. 14
        ]

    def _generate_examples(self, images_dir_path: str, labels_path: str):
        """Generate examples as dicts.
        Args:
        filepath: `str` path of the file to process.
        Yields:
        Generator yielding the next samples
        """

        # the labels file consists of lines of image-names and label pairs, e.g. "00000001.png 2"
        with tf.io.gfile.GFile(labels_path, "rb") as f:  # type: ignore
            lines = list(map(lambda l: str(l, "utf-8").split(), f.readlines()))

        for i, (image_name, label) in enumerate(lines):
            image_path = os.path.join(images_dir_path, image_name)
            image = np.array(Image.open(image_path))
            record = {
                "image": image,
                "label": label,
            }
            yield i, record


if __name__ == "__main__":
    # tf.compat.v1.enable_eager_execution()
    mnist_m_ds, mnist_m_info = tfds.load("mnist_m",
                                         split="train",
                                         with_info=True)
    print(mnist_m_info)
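    # Hedged follow-up (not in the original snippet): decode one example to
    # confirm the 'image' and 'label' features produced by _generate_examples.
    for ex in tfds.as_numpy(mnist_m_ds.take(1)):
        print(ex["image"].shape, ex["label"])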