Example #1
def create_dataset(buffer_size, batch_size, data_format, data_dir=None):
  """Creates a tf.data Dataset.

  Args:
    buffer_size: Shuffle buffer size.
    batch_size: Batch size.
    data_format: 'channels_first' or 'channels_last'.
    data_dir: Directory in which to store the dataset.

  Returns:
    train dataset, test dataset, metadata
  """

  preprocess_train = Preprocess(data_format, train=True)
  preprocess_test = Preprocess(data_format, train=False)

  dataset, metadata = tfds.load(
      'cifar10', data_dir=data_dir, as_supervised=True, with_info=True)
  train_dataset, test_dataset = dataset['train'], dataset['test']

  train_dataset = train_dataset.map(
      preprocess_train, num_parallel_calls=AUTOTUNE)
  train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
  train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)

  test_dataset = test_dataset.map(
      preprocess_test, num_parallel_calls=AUTOTUNE).batch(batch_size)
  test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)

  return train_dataset, test_dataset, metadata
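The `Preprocess` callable used above is defined elsewhere in the original project. A minimal sketch of what such a class might look like, assuming the usual cast/normalize step plus a channels_first transpose (the names and augmentation choices here are guesses, not the original implementation):

class Preprocess:
  """Hypothetical preprocessing callable for (image, label) pairs."""

  def __init__(self, data_format, train):
    self.data_format = data_format
    self.train = train

  def __call__(self, image, label):
    image = tf.cast(image, tf.float32) / 255.0  # uint8 -> [0, 1] floats
    if self.train:
      image = tf.image.random_flip_left_right(image)  # light train-time augmentation
    if self.data_format == 'channels_first':
      image = tf.transpose(image, [2, 0, 1])  # HWC -> CHW
    return image, label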
Example #2
def create_dataset(buffer_size, batch_size):
  dataset, _ = tfds.load('mnist', as_supervised=True, with_info=True)
  train_dataset, _ = dataset['train'], dataset['test']
  train_dataset = train_dataset.map(scale, num_parallel_calls=AUTOTUNE)
  train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)

  return train_dataset
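`scale` is not defined in this fragment; a minimal sketch, assuming the usual uint8-to-[0, 1] normalization used with MNIST:

def scale(image, label):
  # Assumed implementation: cast to float32 and scale pixel values into [0, 1].
  image = tf.cast(image, tf.float32) / 255.0
  return image, label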
Example #3
import sys

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

cnn_model_name = 'cnn_text_classification.h5'
rnn_model_name = 'rnn_text_classification.h5'
print(tf.executing_eagerly())
max_features = 10000
max_len = 200
initial_epochs = 10
validation_steps = 20

print('loading data...')
data, info = tfds.load(name="imdb_reviews/subwords8k",
                       with_info=True,
                       as_supervised=True, )

test_dataset = data['test']
train_dataset = data['train']
print(train_dataset)
sys.exit()
encoder = info.features['text'].encoder
print('Vocabulary size: {}'.format(encoder.vocab_size))
# imdb_builder = tfds.builder(name="imdb_reviews/subwords8k")
# imdb_builder.download_and_prepare()
# info = imdb_builder.info
# print("dataset name {} \ndataset size: {}\ndataset features: {}".format(info.name, info.splits, info.features))
# test_dataset = imdb_builder.as_dataset(split="test")
# train_dataset = imdb_builder.as_dataset(split="train")
# for train_example in train_dataset.take(1):
Example #4
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Example #5
import tensorflow as tf
import tensorflow_datasets as tfds

from lib.networks.segmentation import MobileDeepLabV3
from lib.networks.style_transfer import build_model
from lib.networks.style_transfer.layers import TVLoss
from lib.visualize import vis_segmentation

if __name__ == '__main__':
    INPUT_WIDTH = 512
    INPUT_HEIGHT = 256

    cityscapes = tfds.load(
        'cityscapes/semantic_segmentation',
        split='train[:2975]',
        shuffle_files=True).map(
            lambda d: {
                **d, 'segmentation_label':
                d['segmentation_label'] / 255,
                'image_left':
                tf.image.resize_with_pad(d['image_left'] / 255,
                                         target_height=INPUT_HEIGHT,
                                         target_width=INPUT_WIDTH)
            }).batch(4)
    wikiart = tfds.load(
        'wikiart_images', split='train[:2975]', shuffle_files=True).map(
            lambda d: {
                **d, 'image':
                tf.image.resize_with_crop_or_pad(d['image'] / 255,
                                                 target_height=INPUT_HEIGHT,
                                                 target_width=INPUT_WIDTH)
            }).batch(4)

    exC, exW = next(zip(iter(cityscapes), iter(wikiart)))
Example #6
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


if __name__ == "__main__":
    tf.enable_v2_behavior()

    (ds_train, ds_test), ds_info = tfds.load(
        "mnist",
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )

    ds_train = ds_train.map(normalize_img,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.cache()
    ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
    ds_train = ds_train.batch(128)
    ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)

    ds_test = ds_test.map(normalize_img,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.batch(128)
    ds_test = ds_test.cache()
Example #7
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds


# Construct a tf.data.Dataset
ds = tfds.load('mnist', split='train', shuffle_files=True)

# Build your input pipeline
ds = ds.shuffle(1024).batch(32).prefetch(tf.data.experimental.AUTOTUNE)
for example in ds.take(1):
  image, label = example["image"], example["label"]

  print("label:",label)
    print("GPU device not found, use cpu instead!")
    # raise SystemError('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
config.log_device_placement = False

# Step 1: Load dataset from 102 category flower dataset
with tf.Session(config=config) as sess:
    # Use cifar100, which has 100 categories with size 32*32
    # Preprocess the images and set the hyperparameters
    cifar100_train, cifar100_info = tfds.load(name="cifar100",
                                              split=tfds.Split.TRAIN,
                                              as_supervised=True,
                                              with_info=True)
    BATCH_SIZE = 128
    EPOCH = 7
    INPUT_SIZE = cifar100_info.splits["train"].num_examples
    BUFFER_SIZE = 8000
    NUM_CLASSES = cifar100_info.features['label'].num_classes
    iter_number = INPUT_SIZE // BATCH_SIZE + 1
    train_ds = utils.prepare_train_ds(cifar100_train,
                                      BATCH_SIZE,
                                      BUFFER_SIZE,
                                      image_size=224)

    # Use third party images with 102 categories flowers.
    # BATCH_SIZE = 128
    # EPOCH = 7
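`utils.prepare_train_ds` comes from the author's local `utils` module and is not shown. A hedged sketch of what such a helper could do (normalize, resize for a 224x224 backbone, shuffle, batch, prefetch); the real implementation may differ:

def prepare_train_ds(dataset, batch_size, buffer_size, image_size=224):
    # Hypothetical helper: normalize and resize (image, label) pairs, then
    # shuffle, batch and prefetch. Assumes tf.image.resize (TF >= 1.14).
    def _preprocess(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize(image, (image_size, image_size))
        return image, label

    return (dataset
            .map(_preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .shuffle(buffer_size)
            .batch(batch_size)
            .prefetch(tf.data.experimental.AUTOTUNE))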
Example #9
# limitations under the License.
# ==============================================================================
"""A demo script to show to train a segmentation model."""

from keras.efficientdet_keras import EfficientDetNet
import tensorflow as tf


def create_mask(pred_mask):
  pred_mask = tf.argmax(pred_mask, axis=-1)
  pred_mask = pred_mask[..., tf.newaxis]
  return pred_mask[0]


import tensorflow_datasets as tfds
dataset, info = tfds.load('oxford_iiit_pet:3.*.*', with_info=True)


def normalize(input_image, input_mask):
  input_image = tf.cast(input_image, tf.float32) / 255.0
  input_mask -= 1
  return input_image, input_mask


def load_image_train(datapoint):
  input_image = tf.image.resize(datapoint['image'], (512, 512))
  input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128))

  if tf.random.uniform(()) > 0.5:
    input_image = tf.image.flip_left_right(input_image)
    input_mask = tf.image.flip_left_right(input_mask)

  # Apply the normalization defined above and return the image/mask pair.
  input_image, input_mask = normalize(input_image, input_mask)
  return input_image, input_mask
Example #10
from __future__ import absolute_import, division, print_function, unicode_literals


import tensorflow_datasets as tfds
import tensorflow as tf
print(tf.__version__)

# Get the data
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

tokenizer = info.features['text'].encoder

BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = test_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(test_dataset))

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
Example #11
def get_dataset(data_dir, config, dataset_name=None):
  """The training dataset for the code model for fault localization.

  Args:
    data_dir: The data directory to use with tfds.load.
    config: The config for the model.
    dataset_name: If set, use this dataset name in place of the one from the
      config.
  Returns:
    train_dataset: The tf.data.Dataset with batched examples.
    info: The DatasetInfo object containing the feature connectors and other
      info about the dataset.
  """
  dataset_name = dataset_name or config.dataset.name
  split = get_split(config)
  version = (
      None if config.dataset.version == 'default' else config.dataset.version)

  # If in interact mode, use an interactive dataset.
  if config.runner.mode == 'interact':
    dbuilder = tfds.builder(
        dataset_name, data_dir=data_dir, version=version)
    unused_split_generators = dbuilder._split_generators(dl_manager=None)  # pylint: disable=protected-access
    info = dbuilder.info
    info._builder.set_representation(config.dataset.representation)  # pylint: disable=protected-access
    assert config.dataset.batch_size == 1
    dataset = make_interactive_dataset(info, config)
    if config.dataset.batch:
      dataset = apply_batching(dataset, info, config)
    set_task = cannot_set_task
    return DatasetInfo(
        dataset=dataset,
        info=info,
        set_task=set_task
    )

  # Load the dataset.
  if config.dataset.in_memory:
    dbuilder = tfds.builder(
        dataset_name, data_dir=data_dir, version=version)
    unused_split_generators = dbuilder._split_generators(dl_manager=None)  # pylint: disable=protected-access
    dataset, set_task = dbuilder.as_in_memory_dataset(split='all')
    info = dbuilder.info
  else:
    name = dataset_name
    if version is not None:
      name = f'{name}:{version}'
    dataset, info = tfds.load(
        name=name, split=split,
        data_dir=data_dir,
        # batch_size=config.dataset.batch_size,
        with_info=True)
    set_task = cannot_set_task

  info._builder.set_representation(config.dataset.representation)  # pylint: disable=protected-access

  verify_reasonable_dataset(dataset_name, info, config)
  dataset = dataset.repeat()
  dataset = apply_filtering(dataset, info, config)
  if config.dataset.batch:
    dataset = apply_batching(dataset, info, config)
  return DatasetInfo(
      dataset=dataset,
      info=info,
      set_task=set_task,
  )
Example #12
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 22 18:03:44 2020

@author: jjg
"""

import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'],
                                                  tokenizer,
                                                  max_length=128,
                                                  task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'],
                                                  tokenizer,
                                                  max_length=128,
                                                  task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)  # the snippet is cut off here; the epsilon/clipnorm values are assumed
Example #13
def create_model():
    dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
    train_dataset, test_dataset = dataset['train'], dataset['test']

    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    num_train_example = metadata.splits['train'].num_examples
    num_test_example = metadata.splits['test'].num_examples
    print(num_train_example)
    print(num_test_example)

    # train_dataset = train_dataset.map(normalize)
    # test_dataset = test_dataset.map(normalize)

    # take 1 image and remove the color dimension by reshaping
    for image, label in test_dataset.take(1):
        break
    image = image.numpy().reshape((28, 28))

    # # plot the image
    # plt.figure()
    # plt.imshow(image, cmap=plt.cm.binary)
    # plt.colorbar()
    # plt.grid(False)
    # plt.show()
    #
    # # plot 25 image
    # plt.figure(figsize=(10, 10))
    # i = 0
    # for (image, label) in test_dataset.take(25):
    #     image = image.numpy().reshape((28, 28))
    #     plt.subplot(5, 5, i + 1)
    #     plt.xticks([])
    #     plt.yticks([])
    #     plt.grid(False)
    #     plt.imshow(image, cmap=plt.cm.binary)
    #     plt.xlabel(class_names[label])
    #     i += 1
    # plt.show()

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    BATCH_SIZE = 32
    train_dataset = train_dataset.repeat().shuffle(num_train_example).batch(BATCH_SIZE)
    test_dataset = test_dataset.batch(BATCH_SIZE)

    model.fit(train_dataset, epochs=5, steps_per_epoch=math.ceil(num_train_example / BATCH_SIZE))

    model.save('saved_models\\Fashion_MNIST_Classify_example_not_normalize_pixel')

    test_loss, test_accuracy = model.evaluate(test_dataset, steps=math.ceil(num_test_example / BATCH_SIZE))
    print('Accuracy on test dataset:', test_accuracy)

    # for test_images, test_labels in test_dataset.take(1):
    #     test_images = test_images.numpy()
    #     test_labels = test_labels.numpy()
    #     predictions = model.predict(test_images)
    #     print(predictions.shape)
    #     print(predictions[0])
    #     print(np.argmax(predictions[0]))
    #     print(test_labels[0])

    # i = 12
    # plt.figure(figsize=(6,3))
    # plt.subplot(1,2,1)
    # plot_image(i, predictions, test_labels, test_images)
    # plt.subplot(1,2,2)
    # plot_value_array(i, predictions, test_labels)
    # plt.show()

    return model
Example #14
# until the input size grows to 32x32
vgg = PROG_PL_VGG19(input_dims=(32, 32, 3),
                    layers_to_extract=[0, 1, 2],
                    load_weights='imagenet',
                    channel_last=True)

### DATA ###
"""
NWPU-RESISC45
This dataset requires you to download the source data manually 
into download_config.manual_dir (defaults to ~/tensorflow_datasets/manual/):

Note: this dataset does not have a test/train split.
"""
# load data #
data, info = tfds.load('resisc45', split="train", with_info=True)
# visualize data #
tfds.show_examples(data, info)

# size of entire dataset #
ds_size = info.splits["train"].num_examples
image_shape = info.features['image'].shape
# manually split ds into 80:20, train & test respectively #
test_ds_size = int(ds_size * 0.20)
train_ds_size = ds_size - test_ds_size
# split #
test_ds = data.take(test_ds_size)
train_ds = data.skip(test_ds_size)
print("size of test: {}, size of train: {}".format(test_ds_size,
                                                   train_ds_size))
Example #15
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as datasets

import numpy as np

(train_data, test_data), info = datasets.load(
    name="imdb_reviews/subwords8k",
    split=(datasets.Split.TRAIN, datasets.Split.TEST),
    as_supervised=True,
    with_info=True
)

encoder = info.features['text'].encoder

# Exploration of the data
# for train_example, train_label in train_data.take(1):
#     print("Encoded text:", train_example[:10].numpy())
#     print("Label:", train_label.numpy())

# Data sanitization
BUFFER_SIZE = 1000
# The docs are broken on google.
# Found the solution on github to use the compat.v1.data.get_output_shapes method
train_output_shapes = tf.compat.v1.data.get_output_shapes(train_data)
train_batches = (train_data.shuffle(BUFFER_SIZE).padded_batch(32, train_output_shapes))
test_batches = (test_data.shuffle(BUFFER_SIZE).padded_batch(32, train_output_shapes))

# for example_batch, label_batch in train_batches.take(2):
Example #16
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if not data_args.data_dir:
        if data_args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

        tfds_examples = tfds.load("squad")
        train_examples = (
            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False)
            if training_args.do_train
            else None
        )
        eval_examples = (
            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=True)
            if training_args.do_eval
            else None
        )
    else:
        processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(data_args.data_dir) if training_args.do_eval else None

    train_dataset = (
        squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=True,
            return_dataset="tf",
        )
        if training_args.do_train
        else None
    )

    eval_dataset = (
        squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=False,
            return_dataset="tf",
        )
        if training_args.do_eval
        else None
    )

    # Initialize our Trainer
    trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
Example #17
def load_split(batch_size,
               train,
               data_dir,
               dtype=tf.float32,
               image_size=IMAGE_SIZE,
               cache=False):
    """Creates a split from the ImageNet dataset using TensorFlow Datasets.

  Args:
    batch_size: the batch size returned by the data pipeline.
    train: Whether to load the train or evaluation split.
    data_dir: str, directory to read/write data. Defaults to the value of the
      environment variable TFDS_DATA_DIR, if set, otherwise falls back to
      '~/tensorflow_datasets'.
    dtype: data type of the image.
    image_size: The target size of the images.
    cache: Whether to cache the dataset.
  Returns:
    A `tf.data.Dataset`.
  """
    if train:
        split_size = TRAIN_IMAGES // jax.host_count()
        start = jax.host_id() * split_size
        split = 'train[{}:{}]'.format(start, start + split_size)
    else:
        split_size = EVAL_IMAGES // jax.host_count()
        start = jax.host_id() * split_size
        split = 'validation[{}:{}]'.format(start, start + split_size)

    def decode_example(example):
        if train:
            image = preprocess_for_train(example['image'], dtype, image_size)
        else:
            image = preprocess_for_eval(example['image'], dtype, image_size)
        return {'image': image, 'label': example['label']}

    ds = tfds.load('imagenet2012:5.*.*',
                   split=split,
                   data_dir=data_dir,
                   decoders={
                       'image': tfds.decode.SkipDecoding(),
                   })
    options = tf.data.Options()
    options.experimental_threading.private_threadpool_size = 48
    ds = ds.with_options(options)

    if cache:
        ds = ds.cache()

    if train:
        ds = ds.repeat()
        ds = ds.shuffle(16 * batch_size, seed=0)

    ds = ds.map(decode_example,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.batch(batch_size, drop_remainder=True)

    if not train:
        ds = ds.repeat()

    ds = ds.prefetch(10)

    return ds
Example #18
def load_dataset(split,
                 batch_size,
                 name,
                 use_bfloat16,
                 normalize=True,
                 drop_remainder=True,
                 proportion=1.0,
                 validation_set=False,
                 validation_proportion=0.05,
                 aug_params=None):
    """Loads CIFAR dataset for training or testing.

  Args:
    split: tfds.Split.
    batch_size: The global batch size to use.
    name: A string indicates whether it is cifar10 or cifar100.
    use_bfloat16: data type, bfloat16 precision or float32.
    normalize: Whether to apply mean-std normalization on features.
    drop_remainder: bool.
    proportion: float, the proportion of dataset to be used.
    validation_set: bool, whether to split a validation set from training data.
    validation_proportion: float, the proportion of training dataset to be used
      as the validation split, if validation_set is set to True.
    aug_params: dict, data augmentation hyper parameters.

  Returns:
    Input function which returns a locally-sharded dataset batch.
  """
    if proportion < 0. or proportion > 1.:
        raise ValueError('proportion needs to lie in the range [0, 1]')
    if validation_proportion < 0. or validation_proportion > 1.:
        raise ValueError(
            'validation_proportion needs to lie in the range [0, 1]')
    if use_bfloat16:
        dtype = tf.bfloat16
    else:
        dtype = tf.float32
    ds_info = tfds.builder(name).info
    image_shape = ds_info.features['image'].shape
    dataset_size = ds_info.splits['train'].num_examples
    num_classes = ds_info.features['label'].num_classes
    if aug_params is None:
        aug_params = {}
    adaptive_mixup = aug_params.get('adaptive_mixup', False)
    random_augment = aug_params.get('random_augment', False)
    mixup_alpha = aug_params.get('mixup_alpha', 0)
    ensemble_size = aug_params.get('ensemble_size', 1)
    label_smoothing = aug_params.get('label_smoothing', 0.)
    if adaptive_mixup and 'mixup_coeff' not in aug_params:
        # Hard target in the first epoch!
        aug_params['mixup_coeff'] = tf.ones([ensemble_size, num_classes])
    if mixup_alpha > 0 or label_smoothing > 0:
        onehot = True
    else:
        onehot = False

    def preprocess(image, label):
        """Image preprocessing function."""
        if split == tfds.Split.TRAIN:
            image = tf.image.resize_with_crop_or_pad(image, image_shape[0] + 4,
                                                     image_shape[1] + 4)
            image = tf.image.random_crop(image, image_shape)
            image = tf.image.random_flip_left_right(image)

            # Only random augment for now.
            if random_augment:
                count = aug_params['aug_count']
                augmenter = augment_utils.RandAugment()
                augmented = [augmenter.distort(image) for _ in range(count)]
                image = tf.stack(augmented)

        if split == tfds.Split.TRAIN and aug_params.get('augmix', False):
            augmenter = augment_utils.RandAugment()
            image = _augmix(image, aug_params, augmenter, dtype)
        elif normalize:
            image = normalize_convert_image(image, dtype)

        if split == tfds.Split.TRAIN and onehot:
            label = tf.cast(label, tf.int32)
            label = tf.one_hot(label, num_classes)
        else:
            label = tf.cast(label, dtype)
        return image, label

    if proportion == 1.0:
        if validation_set:
            new_name = '{}:3.*.*'.format(name)
            if split == 'validation':
                new_split = 'train[{}%:]'.format(
                    int(100 * (1. - validation_proportion)))
                dataset = tfds.load(new_name,
                                    split=new_split,
                                    as_supervised=True)
            elif split == tfds.Split.TRAIN:
                new_split = 'train[:{}%]'.format(
                    int(100 * (1. - validation_proportion)))
                dataset = tfds.load(new_name,
                                    split=new_split,
                                    as_supervised=True)
            # split == tfds.Split.TEST case
            else:
                dataset = tfds.load(name, split=split, as_supervised=True)
        else:
            dataset = tfds.load(name, split=split, as_supervised=True)
    else:
        logging.warning(
            'Subset of training dataset is being used without a validation set.'
        )
        new_name = '{}:3.*.*'.format(name)
        if split == tfds.Split.TRAIN:
            new_split = 'train[:{}%]'.format(int(100 * proportion))
        else:
            new_split = 'test[:{}%]'.format(int(100 * proportion))
        dataset = tfds.load(new_name, split=new_split, as_supervised=True)
    if split == tfds.Split.TRAIN:
        dataset = dataset.shuffle(buffer_size=dataset_size).repeat()

    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    if mixup_alpha > 0 and split == tfds.Split.TRAIN:
        if adaptive_mixup:
            dataset = dataset.map(functools.partial(adaptive_mixup_aug,
                                                    batch_size, aug_params),
                                  num_parallel_calls=8)
        else:
            dataset = dataset.map(functools.partial(mixup, batch_size,
                                                    aug_params),
                                  num_parallel_calls=8)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
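A possible way to call `load_dataset`; the argument values below are illustrative only, and the helpers it relies on (`normalize_convert_image`, `_augmix`, `mixup`, ...) come from the same module as the original function:

train_ds = load_dataset(split=tfds.Split.TRAIN,
                        batch_size=128,
                        name='cifar10',
                        use_bfloat16=False)
test_ds = load_dataset(split=tfds.Split.TEST,
                       batch_size=128,
                       name='cifar10',
                       use_bfloat16=False,
                       drop_remainder=False)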
Example #19
import tensorflow_datasets as tfds
import tensorflow as tf
imdb, info = tfds.load("imdb_reviews/subwords8k",
                       with_info=True,
                       as_supervised=True)

train_data, test_data = imdb["train"], imdb["test"]

tokenizer = info.features["text"].encoder

print(tokenizer.subwords)

sample_string = "Tensorflow, from basic to mastery"

tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer.decode(tokenized_string)
print('The original string: {}'.format(original_string))

for ts in tokenized_string:
    print('{} ----> {}'.format(ts, tokenizer.decode([ts])))

embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
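The snippet stops after building the model; a hedged continuation showing how it would typically be batched, compiled and trained (buffer/batch sizes and epochs are assumptions):

BUFFER_SIZE = 10000
BATCH_SIZE = 64
# padded_batch without explicit padded_shapes requires TF >= 2.2
train_batches = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
test_batches = test_data.padded_batch(BATCH_SIZE)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_batches, epochs=10, validation_data=test_batches)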
Example #20
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
print('Tensorflow version =', tf.__version__)


# ## Loading the dataset using TensorFlow Datasets

# In[2]:


dataset_size = 23262
dataset = tfds.load(name='cats_vs_dogs', as_supervised=True, split=["train"])[0]

label_map = {1:'dog', 0:'cat'}


# ## Creating train test splits

# In[3]:


test_dataset = dataset.take(3000)
train_dataset = dataset.skip(3000)


# ## Visualizing some samples from the dataset
#  - This is a dataset containing images for dogs and cats
Example #21
def pre_processing_test(example):

    # extract image and label from example
    image = example["image"]
    label = example["label"]

    # image is cast to a float32 and normalized to [0, 1]
    # label is cast to a int32
    image = tf.math.divide(tf.dtypes.cast(image, tf.float32), DATA_NORM)
    label = tf.dtypes.cast(label, tf.int32)
    
    # return image and label
    return image, label

# download data and split into training and testing datasets
dataset_train, info = tfds.load("mnist", split=tfds.Split.TRAIN, with_info=True)
dataset_test,  info = tfds.load("mnist", split=tfds.Split.TEST,  with_info=True)

# debug - datasets
# print(dataset_train) # <_OptionsDataset shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>
# print(dataset_test)  # <_OptionsDataset shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>

# transform training dataset
dataset_train = dataset_train.map(pre_processing_train, num_parallel_calls=4)
dataset_train = dataset_train.shuffle(buffer_size=TRAINING_SHUFFLE_BUFFER)
dataset_train = dataset_train.batch(TRAINING_BATCH_SIZE)
dataset_train = dataset_train.prefetch(buffer_size=1)

# transform testing dataset
dataset_test = dataset_test.map(pre_processing_test, num_parallel_calls=4)
dataset_test = dataset_test.batch(TRAINING_BATCH_SIZE)
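`pre_processing_train` and the DATA_NORM/TRAINING_* constants are defined earlier in the original script and are not shown here. Plausible definitions, assuming training preprocessing mirrors `pre_processing_test` above (the values are guesses):

DATA_NORM = 255.0
TRAINING_SHUFFLE_BUFFER = 5000
TRAINING_BATCH_SIZE = 32

def pre_processing_train(example):
    # Assumed to match pre_processing_test; the real version may add augmentation.
    image = tf.math.divide(tf.dtypes.cast(example["image"], tf.float32), DATA_NORM)
    label = tf.dtypes.cast(example["label"], tf.int32)
    return image, label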
Example #22
# In[1]:

import numpy as np
import tensorflow as tf
from tensorflow import keras

tf.random.set_seed(42)

# ## Loading the dataset :

# In[2]:

#importing dataset
import tensorflow_datasets as tfds

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
print(datasets.keys())

# In[3]:

train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples
print(train_size, test_size)

# ## Exploring the dataset :

# In[4]:

for X_batch, y_batch in datasets["train"].batch(2).take(2):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review : ", review.decode("utf-8")[:200], "...")
Example #23
#!/usr/bin/python3
# -*- coding: UTF-8 -*-

# TensorFlow
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # to get rid of the TF warnings
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
import tensorflow_datasets as tfds
# NLP
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

train, valid, test = tfds.load(name="imdb_reviews",
                               split=('train[:60%]', 'train[60%:]', 'test'),
                               as_supervised=True)

Nsamples = int(1e3)
Nwords = 5  # correlation words distance
Nraw = int(1e3)
# Ndim = int(1e4)

train_iter = iter(train)

sentences = []
for i in range(Nsamples):
    x, y = train_iter.get_next()
    sentences.append(x.numpy().decode('utf-8'))

tokenizer = Tokenizer(num_words=Nraw, oov_token='<OOV>')
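The fragment ends right after constructing the tokenizer; a short, hedged continuation with the usual fit/sequence/pad steps (the maxlen value is an assumption):

tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post', maxlen=100)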
Example #24
File: nlp.py Project: shravanc/tf
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

train_data, valid_data = imdb['train'], imdb['test']

train_sentences = []
train_labels = []

for s, l in train_data:
    train_sentences.append(str(s.numpy()))
    train_labels.append(int(l.numpy()))

valid_sentences = []
valid_labels = []

for s, l in valid_data:
    valid_sentences.append(str(s.numpy()))
    valid_labels.append(int(l.numpy()))

train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)

vocab_size = 1000
oov_token = '<OOV>'
trun_type = 'post'
Example #25
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
from position2encoding import *


examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for _, en in train_examples), target_vocab_size=2**13
)
tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, _ in train_examples), target_vocab_size=2**13
)
sample_string = "Transformer is awesome."

tokenized_string = tokenizer_en.encode(sample_string)
print("Tokenized string is {}".format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print("The original string: {}".format(original_string))
assert original_string == sample_string

for ts in tokenized_string:
    print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))
Example #26
# =================================================== #

import numpy as np
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications import VGG16


dataset_name = 'cats_vs_dogs'

train_dataset = tfds.load(name=dataset_name, split='train[:80%]')
valid_dataset = tfds.load(name=dataset_name, split='train[80%:]')

def preprocess(data):
    x = data['image']
    y = data['label']
    # Normalize the image
    x = x / 255
    # Resize to (224, 224)
    x = tf.image.resize(x, size=(224, 224))
    return x, y


def solution_model():
    batch_size=32
    train_data = train_dataset.map(preprocess).batch(batch_size)
Example #27
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]
predicted_scores = export_model.predict(inputs)
predicted_labels = tf.argmax(predicted_scores, axis=1)
for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())

# Downloading more datasets using TensorFlow Datasets (TFDS)
train_ds = tfds.load("imdb_reviews",
                     split="train",
                     batch_size=BATCH_SIZE,
                     shuffle_files=True,
                     as_supervised=True)

val_ds = tfds.load("imdb_reviews",
                   split="train",
                   batch_size=BATCH_SIZE,
                   shuffle_files=True,
                   as_supervised=True)

for review_batch, label_batch in val_ds.take(1):
    for i in range(5):
        print("Review: ", review_batch[i].numpy())
        print("Label: ", label_batch[i].numpy())

vectorize_layer = TextVectorization(max_tokens=VOCAB_SIZE,
Example #28
    predicted_label = np.argmax(predictions_array)

    if predicted_label == true_label:
        color = 'green'
    else:
        color = 'red'

    ax.set_xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                            100 * np.max(predictions_array),
                                            class_names[true_label]),
                  color=color)


(raw_test, ), metadata = tfds.load(
    'cats_vs_dogs',
    split=config.TEST_SPLIT,
    with_info=True,
    as_supervised=True,
)

test = raw_test.map(config.img_to_model_input)
test_batches = test.batch(1)

interpreter = tf.lite.Interpreter(model_path=config.TFLITE_MODEL_PATH)
interpreter.allocate_tensors()

input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]

predictions = []
test_labels, test_imgs = [], []
Example #29
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import imageio
import glob
import os

import tensorflow as tf
import tensorflow_datasets as tfds

BUFFER_SIZE = 50000
BATCH_SIZE = 128
LR = 2e-4
BETA1 = 0.5
EPOCHS = 100
NOISE_DIM = 100
NUM_FAKE_IMAGES = 16

cifar, info = tfds.load('cifar10', with_info=True, as_supervised=True)
train_data, test_data = cifar['train'], cifar['test']
train_data = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
test_data = test_data.batch(BATCH_SIZE)


class Generator(tf.keras.Model):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * 4 * 512,
                                  use_bias=False,
                                  input_shape=(100, )),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Reshape((4, 4, 512)),
Example #30
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # suppress AVX, CUDA and similar warnings

# Imports
import tensorflow as tf
import tensorflow_datasets as tfds

# Import other libraries
import matplotlib.pyplot as plt  # a 2D plotting library
import numpy as np  # numerical computing extension, including math functions
import math  # math library

#print(tf.__version__)

# 2. Download the data, including the training and test sets
dataset, metadata = tfds.load('fashion_mnist',as_supervised=True,with_info=True)
train_dataset,test_dataset = dataset['train'],dataset['test']
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 
               'Sandal',      'Shirt',   'Sneaker',  'Bag',   'Ankle boot']

# 3. Data preprocessing
num_train_examples = metadata.splits['train'].num_examples  # number of training examples
num_test_examples = metadata.splits['test'].num_examples  # number of test examples; the data is stored under C:\Users\xx\tensorflow_datasets\fashion_mnist\3.0.0
print("Number of training examples: {}".format(num_train_examples))  # 60000
print("Number of test examples: {}".format(num_test_examples))  # 10000

# 4. Normalization
# Custom normalization function
def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels
Example #31
from itertools import combinations

import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

tf.enable_v2_behavior()

def get_train_partition(data,split):
  return data[0:int(split*len(data))]
def get_test_partition(data,split):
  return data[int(split*len(data)):len(data)]

ds_train, ds_info = tfds.load(
    'iris',
    split=['train'],
    shuffle_files=False,
    as_supervised=True,
    with_info=True,
)

ds_numpy = tfds.as_numpy(ds_train)
profile_features = []
labels = []
for ex in ds_numpy[0]:
  profile_features.append(ex[0])
  labels.append(ex[1])

print("dataset size:",len(labels))

"""## Limited Data Experiments"""
print("begin experiment")
Example #32
[Figure 1: Fashion-MNIST samples (by Zalando, MIT License), https://github.com/zalandoresearch/fashion-mnist]

Fashion MNIST is intended as a drop-in replacement for the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset—often used as the "Hello, World" of machine learning programs for computer vision. The MNIST dataset contains images of handwritten digits (0, 1, 2, etc) in an identical format to the articles of clothing we'll use here.

This guide uses Fashion MNIST for variety, and because it's a slightly more challenging problem than regular MNIST. Both datasets are relatively small and are used to verify that an algorithm works as expected. They're good starting points to test and debug code. 

We will use 60,000 images to train the network and 10,000 images to evaluate how accurately the network learned to classify images. You can access the Fashion MNIST directly from TensorFlow, using the [Datasets](https://www.tensorflow.org/datasets) API:
"""

dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

"""Loading the dataset returns metadata as well as a *training dataset* and *test dataset*.

* The model is trained using `train_dataset`.
* The model is tested against `test_dataset`.

The images are 28 $\times$ 28 arrays, with pixel values in the range `[0, 255]`. The *labels* are an array of integers, in the range `[0, 9]`. These correspond to the *class* of clothing the image represents:

<table>
  <tr>
    <th>Label</th>
    <th>Class</th>
  </tr>
  <tr><td>0</td><td>T-shirt/top</td></tr>
  <tr><td>1</td><td>Trouser</td></tr>
  <tr><td>2</td><td>Pullover</td></tr>
  <tr><td>3</td><td>Dress</td></tr>
  <tr><td>4</td><td>Coat</td></tr>
  <tr><td>5</td><td>Sandal</td></tr>
  <tr><td>6</td><td>Shirt</td></tr>
  <tr><td>7</td><td>Sneaker</td></tr>
  <tr><td>8</td><td>Bag</td></tr>
  <tr><td>9</td><td>Ankle boot</td></tr>
</table>
Example #33
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from bs4 import BeautifulSoup
import string

imdb_sentences = []
imdb_train = tfds.as_numpy(tfds.load('imdb_reviews', split='train'))
# for item in imdb_train:
#     imdb_sentences.append(str(item['text']))

# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(imdb_sentences)
# sequences = tokenizer.texts_to_sequences(imdb_sentences)
# word_index = tokenizer.word_index

# print(word_index)

# most of the words in the index are stopwords and html tags

stopwords = ['a', ..., 'yourselves']
print(stopwords)
table = str.maketrans('', '', string.punctuation)
print(table)

for item in imdb_train:
    sentence = str(item['text'].decode('UTF-8').lower())
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()