def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
  """Create a batched, properly-formatted dataset from the TFDS dataset.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    The 2016 test split, formatted as a validation-style Dataset object.
  """
  del batch_size
  del num_eval_examples
  del shuffle_input_sentences

  dataset = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=rocstories_sentence_embeddings.TEST_2016,
      download=False)
  dataset = utils.build_validation_dataset(dataset)
  return dataset
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from dataset split to Dataset object, and a
    dictionary mapping from each nolabel split to its embedding matrix.
  """
  del shuffle_input_sentences

  splits_to_load = {
      'valid_nolabel': 'train[:2%]',
      'train_nolabel': 'train[2%:4%]',
      'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
      'valid2016': rocstories_sentence_embeddings.VALIDATION_2016
  }
  datasets = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=splits_to_load,
      download=False)

  emb_matrices = {}
  valid_nolabel_ds = utils.build_train_style_dataset(
      datasets['valid_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples,
      is_training=False)
  datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

  train_nolabel_ds = utils.build_train_style_dataset(
      datasets['train_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples,
      is_training=False)
  datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

  # Convert the official evaluation datasets to the validation data format.
  # No embedding matrices are involved here, since each example offers only
  # two candidate next sentences to pick between. num_eval_examples is
  # ignored and the full datasets are used.
  datasets['valid2018'] = utils.build_validation_dataset(datasets['valid2018'])
  datasets['valid2016'] = utils.build_validation_dataset(datasets['valid2016'])

  return datasets, emb_matrices
def load_data(data):
    raw = tfds.load(data)
    train_raw = raw['train']
    test_raw = raw['test']
    validate_raw = raw['validation']

    # Each split becomes a list of dictionaries of data, e.g.
    # [{'email': email, 'subject': subject}, ...].
    train = create_dict(train_raw)
    test = create_dict(test_raw)
    validate = create_dict(validate_raw)
    return train, test, validate
def get_open_shelf_dataset():
    dl_config = tfds.download.DownloadConfig(
        manual_dir='/home/vilon_tao/tensorflow_datasets/downloads/manual',
        download_mode=tfds.GenerateMode.REUSE_DATASET_IF_EXISTS)
    train_ds, test_ds, val_ds = tfds.load(
        name='my_dataset',
        data_dir='/home/taolongming/tensorflow_datasets',
        split=["train", "test", "val"],
        download=False,
        builder_kwargs=dict(dataset_name='open.shelf.classfication'),
        download_and_prepare_kwargs=dict(download_config=dl_config),
    )
    return train_ds, test_ds, val_ds
def main(_):
    builder_kwargs = {"validation_split": flags.validation_split}

    tfdataset_path = local_settings.TF_DATASET_PATH
    if flags.tfds_path is not None:
        tfdataset_path = flags.tfds_path

    train, dsinfo = tfds.load(
        "pacs",
        data_dir=tfdataset_path,
        split=tfds.Split.VALIDATION,
        builder_kwargs=builder_kwargs,
        with_info=True)

    for example in dataset_utils.as_numpy(train):
        # Drop into the debugger to inspect each example before printing.
        import pdb; pdb.set_trace()
        print(example["attributes"]["label"])
def prepare_dataset(dataset_name=gin.REQUIRED,
                    shuffle_input_sentences=False,
                    num_eval_examples=2000,
                    batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: Not used during evaluation, but arg still needed
      for gin compatibility.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    The validation dataset, the story identifiers for each story in the
    embedding matrix, and the embedding matrix.
  """
  del num_eval_examples
  del shuffle_input_sentences

  splits_to_load = [
      tfds.Split.TRAIN,
      rocstories_sentence_embeddings.VALIDATION_2018,
  ]
  tfds_train, tfds_valid = tfds.load(
      dataset_name, data_dir=FLAGS.data_dir, split=splits_to_load)

  _, train_embs, train_story_ids = utils.build_train_style_dataset(
      tfds_train,
      batch_size,
      shuffle_input_sentences=False,
      return_ids=True,
      is_training=False)

  out = build_all_distractor_valid_dataset(tfds_valid, batch_size=batch_size)
  valid_dataset, valid_embs, valid_story_ids = out

  all_story_ids = valid_story_ids + train_story_ids
  all_emb_matrix = tf.concat([valid_embs, train_embs], axis=0)
  return valid_dataset, all_story_ids, all_emb_matrix
  month = {June},
  year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data collected
by Waymo self-driving cars in a wide variety of conditions. This data is
licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and afterwards, please
register via https://waymo.com/open/licensing/
"""

_GCS_DESCRIPTION = """
This dataset is also available in pre-processed format, making it faster to
load, if you select the correct data_dir:

```
tfds.load('waymo_open_dataset/{}', \
data_dir='gs://waymo_open_dataset_{}_individual_files/tensorflow_datasets')
```
"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"

_OBJECT_LABELS = [
    "TYPE_UNKNOWN",
    "TYPE_VEHICLE",
    "TYPE_PEDESTRIAN",
    "TYPE_SIGN",
    "TYPE_CYCLIST",
]
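# A concrete, hypothetical instantiation of the `tfds.load` template in
# `_GCS_DESCRIPTION` above, using the v_1_0_0 bucket named elsewhere in this
# repository. Access requires prior registration via
# https://waymo.com/open/licensing/. Not part of the original module.
def _example_gcs_load():
  import tensorflow_datasets as tfds
  return tfds.load(
      "waymo_open_dataset",
      data_dir="gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets")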
def prepare_datasets(dataset_name=gin.REQUIRED,
                     shuffle_input_sentences=False,
                     num_eval_examples=2000,
                     batch_size=32):
  """Create batched, properly-formatted datasets from the TFDS datasets.

  Args:
    dataset_name: Name of TFDS dataset.
    shuffle_input_sentences: If True, the order of the input sentences is
      randomized.
    num_eval_examples: Number of examples to use during evaluation. For the
      nolabel evaluation, this is also the number of distractors we choose
      between.
    batch_size: Batch size.

  Returns:
    A dictionary mapping from dataset split to Dataset object, and a
    dictionary mapping from each training-style split to its embedding
    matrix.
  """
  splits_to_load = {
      'valid_nolabel': 'train[:2%]',
      'train': 'train[2%:]',
      'train_nolabel': 'train[2%:4%]',
      'valid2018': rocstories_sentence_embeddings.VALIDATION_2018,
      'valid2016': rocstories_sentence_embeddings.VALIDATION_2016
  }
  datasets = tfds.load(
      dataset_name,
      data_dir=FLAGS.data_dir,
      split=splits_to_load,
      download=False)

  emb_matrices = {}

  # Convert datasets to the expected training data format and build the
  # embedding matrices.
  train_ds = utils.build_train_style_dataset(
      datasets['train'], batch_size, shuffle_input_sentences)
  datasets['train'], emb_matrices['train'] = train_ds

  valid_nolabel_ds = utils.build_train_style_dataset(
      datasets['valid_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples)
  datasets['valid_nolabel'], emb_matrices['valid_nolabel'] = valid_nolabel_ds

  train_nolabel_ds = utils.build_train_style_dataset(
      datasets['train_nolabel'],
      batch_size,
      False,
      num_examples=num_eval_examples)
  datasets['train_nolabel'], emb_matrices['train_nolabel'] = train_nolabel_ds

  # Convert the official evaluation datasets to the validation data format.
  # No embedding matrices are involved here, since each example offers only
  # two candidate next sentences to pick between.
  datasets['valid2018'] = utils.build_validation_dataset(
      datasets['valid2018']).take(num_eval_examples)
  datasets['valid2016'] = utils.build_validation_dataset(
      datasets['valid2016']).take(num_eval_examples)

  logging.info('EMBEDDING MATRICES CREATED:')
  for key in emb_matrices:
    logging.info('%s: %s', key, emb_matrices[key].shape)

  return datasets, emb_matrices
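# A minimal usage sketch for `prepare_datasets` above (not part of the
# original module): the dataset name is hypothetical, and FLAGS.data_dir and
# the gin bindings are assumed to be configured before this runs.
def _example_usage():
  datasets, emb_matrices = prepare_datasets(
      dataset_name='rocstories_sentence_embeddings',  # hypothetical name
      shuffle_input_sentences=True,
      num_eval_examples=2000,
      batch_size=32)
  # Each value in `datasets` is a tf.data.Dataset; inspect one training
  # batch alongside the shape of its embedding matrix.
  for batch in datasets['train'].take(1):
    print(batch)
  print(emb_matrices['train'].shape)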
import io
import os

from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow_datasets.proto import waymo_dataset_pb2 as open_dataset
import tensorflow_datasets.public_api as tfds

_CITATION = """
@misc{waymo_open_dataset,
  title = {Waymo Open Dataset: An autonomous driving dataset},
  website = {url{https://www.waymo.com/open}},
  year = {2020}
}
"""

_DESCRIPTION = """\
The Waymo Open Dataset is comprised of high resolution sensor data collected
by Waymo self-driving cars in a wide variety of conditions. This data is
licensed for non-commercial use.

WARNING: this dataset requires additional authorization and registration.
Please look at tfds documentation for accessing GCS, and afterwards, please
register via https://waymo.com/open/licensing/

This dataset is also available in pre-processed format, making it faster to
load, if you select the correct data_dir:

tfds.load('waymo_open_dataset', \
data_dir='gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets')
"""

_HOMEPAGE_URL = "http://www.waymo.com/open/"
def set_data(type):
    if type == 'shakespeare':
        g_vars.df = tfds.load(name='tiny_shakespeare')['train']
    else:
        g_vars.df = tfds.load(name='imdb_reviews', split='train[:5%]')
    set_vocab()
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tqdm import tqdm

# df = tfds.load(name='tiny_shakespeare')['train']  # alternative dataset
df = tfds.load(name="imdb_reviews", split='train[:5%]')
df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))

# Build the character vocabulary by scanning every review.
iter_df = iter(df)
vocabulary = set([])
for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)
vocabulary = sorted(vocabulary)

char2idx = {u: i for i, u in enumerate(vocabulary)}
idx2char = np.array(vocabulary)

model = tf.keras.models.load_model('models/imdb')


def generate_text(model, start_string, generation_length=2000):
    # Evaluation step (generating text using the learned RNN model)

    input_eval = [char2idx[bytes(i, encoding='utf8')] for i in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty list to store our results
    text_generated = []
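    # The original snippet is cut off here. What follows is a minimal sketch
    # of the usual autoregressive sampling loop, assuming `model` is a
    # stateful character-level RNN whose output is a (batch, length,
    # vocab_size) tensor of logits; it is not part of the original code.
    model.reset_states()
    for _ in tqdm(range(generation_length)):
        predictions = tf.squeeze(model(input_eval), 0)
        # Sample the next character id from the predicted distribution.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        # Feed the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    # idx2char holds byte strings (the vocabulary was built from bytes), so
    # join as bytes before decoding.
    return start_string + b''.join(text_generated).decode('utf-8')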
def _generate_examples(self, data_path: str):
    """Generate examples as dicts.

    Args:
        data_path: `str` path of the .mat file to process.

    Yields:
        Generator yielding the next samples
    """
    # The .mat file stores every image in a single 'data' array.
    with tf.io.gfile.GFile(data_path, "rb") as f:  # type: ignore
        data = tfds.core.lazy_imports.scipy.io.loadmat(f)['data']

    # data dimensions are [256, 1100, 10], i.e. [16x16, n_examples, n_classes]
    for i, (example_num, label) in enumerate(
            itertools.product(range(data.shape[1]), range(data.shape[2]))):
        image = np.swapaxes(
            data[:, example_num, label].reshape(SHAPE), 0, 1,
        )
        record = {
            "image": image,
            # Class index i corresponds to digit i + 1; index 9 wraps to 0.
            "label": (label + 1) % 10,
        }
        yield i, record


if __name__ == "__main__":
    ds, info = tfds.load("usps", split="train", with_info=True)
    print(info)
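# A short, hypothetical inspection loop (not in the original module): take a
# few prepared examples and print their shapes and labels. Assumes the
# "usps" builder above has already been downloaded and prepared.
def _inspect_usps(n: int = 3):
    import tensorflow_datasets as tfds
    ds = tfds.load("usps", split="train")
    for example in ds.take(n):
        print(example["image"].shape, int(example["label"]))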
import pandas as pd
import numpy as np
import os
import unicodedata
import time
import functools
import tensorflow as tf
import tensorflow_datasets as tfds
from six.moves import urllib
from IPython import display as ipythondisplay
from tqdm import tqdm

# download and import the MIT 6.S191 package
import mitdeeplearning as mdl

print(tf.config.list_physical_devices())
assert len(tf.config.list_physical_devices('GPU')) > 0

# load your dataset
df = tfds.load(name="imdb_reviews", split='train[:5%]')
# df = tfds.load(name='tiny_shakespeare')['train']
df = df.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))

# Build the character vocabulary by scanning every review.
iter_df = iter(df)
vocabulary = set([])
for review in iter_df:
    temp_vocab = sorted(set(tfds.as_numpy(review)))
    vocabulary.update(temp_vocab)
vocabulary = sorted(vocabulary)
# vocabulary = sorted(set(next(iter(tfds.as_numpy(df)))))

shakespeare = df.map(lambda x: {'cur_char': x[:-1], 'next_char': x[1:]})
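# A hypothetical next step (not in the original script): map the character
# datasets to integer ids with a tf.lookup table built from `vocabulary`,
# so the conversion runs inside the tf.data pipeline instead of in Python.
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.constant(vocabulary),
        tf.range(len(vocabulary), dtype=tf.int64)),
    default_value=-1)
vectorized = shakespeare.map(
    lambda d: {'cur_char': table.lookup(d['cur_char']),
               'next_char': table.lookup(d['next_char'])})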
]


def _generate_examples(self, images_dir_path: str, labels_path: str):
    """Generate examples as dicts.

    Args:
        images_dir_path: `str` path of the directory containing the images.
        labels_path: `str` path of the labels file to process.

    Yields:
        Generator yielding the next samples
    """
    # The labels file consists of lines of image-name and label pairs,
    # e.g. "00000001.png 2".
    with tf.io.gfile.GFile(labels_path, "rb") as f:  # type: ignore
        lines = list(map(lambda l: str(l, "utf-8").split(), f.readlines()))

    for i, (image_name, label) in enumerate(lines):
        image_path = os.path.join(images_dir_path, image_name)
        image = np.array(Image.open(image_path))
        record = {
            "image": image,
            "label": label,
        }
        yield i, record


if __name__ == "__main__":
    # tf.compat.v1.enable_eager_execution()
    mnist_m_ds, mnist_m_info = tfds.load("mnist_m", split="train", with_info=True)
    print(mnist_m_info)