def imagenet_adversarial(
    preprocessing_fn: Callable = None,
    dirpath: str = os.path.join(paths.DATASETS, "external", "imagenet_adv"),
) -> (np.ndarray, np.ndarray, np.ndarray):
    """
    ILSVRC12 adversarial image dataset for ResNet50

    ProjectedGradientDescent
        Iterations = 10
        Max perturbation epsilon = 8
        Attack step size = 2
        Targeted = True

    :param preprocessing_fn: Callable function to preprocess inputs
    :param dirpath: Directory where the dataset is stored
    :return: (clean_images, adversarial_images, labels)
    """

    def _parse(serialized_example):
        ds_features = {
            "height": tf.io.FixedLenFeature([], tf.int64),
            "width": tf.io.FixedLenFeature([], tf.int64),
            "label": tf.io.FixedLenFeature([], tf.int64),
            "adv-image": tf.io.FixedLenFeature([], tf.string),
            "clean-image": tf.io.FixedLenFeature([], tf.string),
        }
        example = tf.io.parse_single_example(serialized_example, ds_features)

        clean_img = tf.io.decode_raw(example["clean-image"], tf.float32)
        clean_img = tf.reshape(clean_img, (example["height"], example["width"], -1))

        adv_img = tf.io.decode_raw(example["adv-image"], tf.float32)
        adv_img = tf.reshape(adv_img, (example["height"], example["width"], -1))

        label = tf.cast(example["label"], tf.int32)
        return clean_img, adv_img, label

    num_images = 1000
    filename = "ILSVRC12_ResNet50_PGD_adversarial_dataset_v0.1.tfrecords"
    output_filepath = os.path.join(dirpath, filename)

    os.makedirs(dirpath, exist_ok=True)
    download_file_from_s3(
        bucket_name="armory-public-data",
        key=f"imagenet-adv/{filename}",
        local_path=output_filepath,
    )

    adv_ds = tf.data.TFRecordDataset(filenames=[output_filepath])
    image_label_ds = adv_ds.map(lambda example_proto: _parse(example_proto))

    image_label_ds = image_label_ds.batch(num_images)
    image_label_ds = tf.data.experimental.get_single_element(image_label_ds)
    clean_x, adv_x, labels = tfds.as_numpy(image_label_ds)

    # Temporary flip from BGR to RGB since dataset was saved in BGR.
    clean_x = clean_x[..., ::-1]
    adv_x = adv_x[..., ::-1]

    # Preprocessing should always be done on RGB inputs
    if preprocessing_fn:
        clean_x = preprocessing_fn(clean_x)
        adv_x = preprocessing_fn(adv_x)

    return clean_x, adv_x, labels
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from multiprocessing import Manager
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

tf.enable_v2_behavior()

ds_train, ds_info = tfds.load(
    'howell',
    split=['train'],
    shuffle_files=False,
    with_info=True,
)
# split=['train'] returns a list of datasets, so index into it below.
ds_numpy = tfds.as_numpy(ds_train)

profile_features = []
labels = []
for ex in ds_numpy[0]:
    profile_features.append([ex['age'], ex['height'], ex['male']])
    labels.append(ex['weight'])

print("dataset size:", len(labels))

"""## Limited Data Experiments"""

print("begin experiment")
num_trials = 10
this_train_sizes = np.linspace(1 / len(labels), 1, len(labels))
results = Manager().list([0 for i in range(len(this_train_sizes) * num_trials)])
def cifar10_test(model, num_label=4000): # load data on the cpu with tf.device('/CPU:0'): # Load in training and test data X_train, y_train = tfds.as_numpy( tfds.load('cifar10', split='train', as_supervised=True, batch_size=-1)) #cifar_10.load_cifar_10() X_test, y_test = tfds.as_numpy( tfds.load('cifar10', split='test', as_supervised=True, batch_size=-1)) # one-hot encode the outs y_train = np.eye(10)[y_train.reshape(-1)] # print('y_train sample:', y_train[0:10]) y_test = np.eye(10)[y_test.reshape(-1)] # print('y_test sample:', y_test[0:10]) # cast it all to floats for image augmentation, rescale to [0,1] X_train = X_train.astype('float32') / np.float(255.0) X_test = X_test.astype('float32') / np.float(255.0) # whiten the data or apply zca X_train = whiten_norm(X_train) X_test = whiten_norm(X_test) # X_train = whiten_norm(X_train) # X_test = whiten_norm(X_test) # X_train, y_train, X_test, y_test = cifar_10.load_cifar_10() print('loaded cifar10', X_train.shape, X_test.shape) # Setup test set test = util.Data(X_test, y_test, None) # Split training test into labeled and unlabeled train = util.label_unlabel_split(X_train, y_train, num_label, 10) # Split training data into training and validation (train, valid) = util.train_test_valid_split(train.X, train.y, split=(0.9, 0.1), U=train.U) # One-hot encode cifar_10.y_train and cifar_10.y_test? ## ^^ yes. Done. print('TR:', train.X.shape, train.y.shape, train.U.shape) print('v', valid.X.shape, valid.y.shape) # fit on the gpu with tf.device('/GPU:0'): # Train model using training and validation sets hist = model.fit(train, valid) print('evaluating on (subset) of test set...') with tf.device('/CPU:0'): # Test the model using test set y_pred = model.predict(test.X[0:1000]) # if outputs are one-hot encoded, need to decode for correctness test # wrong = util.percent_wrong(y_pred, test.y) # acc = 1.0 - wrong acc = float( tf.reduce_mean( tf.keras.metrics.categorical_accuracy(test.y[0:1000], y_pred))) print(model.name, ' : acc:', acc) return model, {'hist': hist, 'acc': acc}
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np

# Run in the VS Code terminal to allow virtualenv activation:
# Set-ExecutionPolicy Unrestricted -Scope Process

# print(len(tf.config.list_physical_devices('GPU')))

###### Shows a grid with different examples
# (ds_train, ds_test), ds_info = tfds.load(
#     'mnist',
#     split=['train', 'test'],
#     shuffle_files=True,
#     as_supervised=True,
#     with_info=True,
# )
# print(ds_info)
# fig = tfds.show_examples(ds_train, ds_info)

###### This does the same as DatasetPLT.py with more code
dataset = tfds.load('mnist')
train, test = dataset['train'], dataset['test']
dsnp = np.vstack(tfds.as_numpy(test))
X_test = np.array(tuple(map(lambda x: x[0]['image'], dsnp)))
y_test = np.array(tuple(map(lambda x: x[0]['label'], dsnp)))

# Drop the trailing channel dimension so matplotlib accepts the 28x28 image.
plt.imshow(X_test[1].squeeze(), cmap='gray')
plt.show()
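# A simpler route to the same arrays (a sketch using documented TFDS behaviour,
# not part of the original script): batch_size=-1 loads the full split as
# tensors, and tfds.as_numpy converts them to NumPy in one step.
mnist_test_np = tfds.as_numpy(tfds.load('mnist', split='test', batch_size=-1))
X_test_full, y_test_full = mnist_test_np['image'], mnist_test_np['label']
print(X_test_full.shape, y_test_full.shape)  # (10000, 28, 28, 1) (10000,)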
def load(split, *, preprocess_mode, batch_dims, transpose=False, allow_caching=False): """Loads the given split of the dataset.""" start, end = _shard(split, jax.host_id(), jax.host_count()) total_batch_size = np.prod(batch_dims) tfds_split = tfds.core.ReadInstruction(_to_tfds_split(split), from_=start, to=end, unit='abs') ds = tfds.load('imagenet2012:5.*.*', split=tfds_split, decoders={'image': tfds.decode.SkipDecoding()}) options = ds.options() options.experimental_threading.private_threadpool_size = 48 options.experimental_threading.max_intra_op_parallelism = 1 if preprocess_mode is not PreprocessMode.EVAL: options.experimental_deterministic = False if jax.host_count() > 1 and allow_caching: # Only cache if we are reading a subset of the dataset. ds = ds.cache() ds = ds.repeat() ds = ds.shuffle(buffer_size=10 * total_batch_size, seed=0) else: if split.num_examples % total_batch_size != 0: raise ValueError( f'Test/valid must be divisible by {total_batch_size}') def preprocess_pretrain(example): view1 = _preprocess_image(example['image'], mode=preprocess_mode) view2 = _preprocess_image(example['image'], mode=preprocess_mode) label = tf.cast(example['label'], tf.int32) return {'view1': view1, 'view2': view2, 'labels': label} def preprocess_linear_train(example): image = _preprocess_image(example['image'], mode=preprocess_mode) label = tf.cast(example['label'], tf.int32) return {'images': image, 'labels': label} def preprocess_eval(example): image = _preprocess_image(example['image'], mode=preprocess_mode) label = tf.cast(example['label'], tf.int32) return {'images': image, 'labels': label} if preprocess_mode is PreprocessMode.PRETRAIN: ds = ds.map(preprocess_pretrain, num_parallel_calls=tf.data.experimental.AUTOTUNE) elif preprocess_mode is PreprocessMode.LINEAR_TRAIN: ds = ds.map(preprocess_linear_train, num_parallel_calls=tf.data.experimental.AUTOTUNE) else: ds = ds.map(preprocess_eval, num_parallel_calls=tf.data.experimental.AUTOTUNE) def transpose_fn(batch): # We use the double-transpose-trick to improve performance for TPUs. Note # that this (typically) requires a matching HWCN->NHWC transpose in your # model code. The compiler cannot make this optimization for us since our # data pipeline and model are compiled separately. batch = dict(**batch) if preprocess_mode is PreprocessMode.PRETRAIN: batch['view1'] = tf.transpose(batch['view1'], (1, 2, 3, 0)) batch['view2'] = tf.transpose(batch['view2'], (1, 2, 3, 0)) else: batch['images'] = tf.transpose(batch['images'], (1, 2, 3, 0)) return batch for i, batch_size in enumerate(reversed(batch_dims)): ds = ds.batch(batch_size) if i == 0 and transpose: ds = ds.map(transpose_fn) # NHWC -> HWCN ds = ds.prefetch(tf.data.experimental.AUTOTUNE) yield from tfds.as_numpy(ds)
return net def get_eval_metric_options_fn(gan_model): real_data_logits = tf.reduce_mean(gan_model.discriminator_real_outputs) gen_data_logits = tf.reduce_mean(gan_model.discriminator_gen_outputs) return { 'real_data_logits': tf.metrics.mean(real_data_logits), 'gen_data_logits': tf.metrics.mean(gen_data_logits) } params = {'batch_size': 64, 'noise_dims': 64} with tf.Graph().as_default(): ds = input_fn(tf.estimator.ModeKeys.TRAIN, params) numpy_imgs = tfds.as_numpy(ds).__next__()[1] img_grid = tfgan.eval.python_image_grid(numpy_imgs, grid_shape=(8, 8)) plt.axis('off') plt.imshow(np.squeeze(img_grid)) plt.show() train_batch_size = 64 noise_dimensions = 32 generator_lr = 0.0001 discriminator_lr = 0.00005 def gen_opt(): gstep = tf.train.get_or_create_global_step() base_lr = generator_lr lr = tf.cond(gstep < 1000, lambda: base_lr, lambda: base_lr / 2.0)
from tensorflow.keras.losses import categorical_crossentropy, sparse_categorical_crossentropy from tensorflow.keras.layers import Conv2D, BatchNormalization, MaxPool2D, Flatten, Dense, Dropout from sklearn.metrics import classification_report import numpy as np from tensorflow.keras.regularizers import l2 (ds_train, ds_test), ds_info = tfds.load( name='cifar10', split=['train', 'test'], shuffle_files=True, as_supervised=True, with_info=True, ) label_train = [] # [1] for image, label in tfds.as_numpy(ds_train): label_train.append(label) label_test = [] for image, label in tfds.as_numpy(ds_test): label_test.append(label) CLASS_NAMES = [ 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck' ] # TFDS provide the images as tf.uint8, while the model expect tf.float32, so normalize images def normalize_img(image, label): """Normalizes images: `uint8` -> `float32`."""
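    # Hedged completion of the truncated function above, following the standard
    # TFDS Keras example (assumes tensorflow is imported as tf elsewhere in this
    # script): cast to float32 and rescale to [0, 1].
    return tf.cast(image, tf.float32) / 255.0, label


# Assumed follow-up: apply the normalization before batching the pipelines.
ds_train = ds_train.map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_test = ds_test.map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)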
# Fetch the dataset directly mnist = tfds.image.MNIST() # or by string name mnist = tfds.builder('mnist') # Download the data, prepare it, and write it to disk mnist.download_and_prepare() # Load data from disk as tf.data.Datasets datasets = mnist.as_dataset() train_dataset, test_dataset = datasets['train'], datasets['test'] # convert the Dataset to NumPy arrays and flatten the data Xtrain, ytrain = [], [] for example in tfds.as_numpy(train_dataset): Xtrain.append(example['image'].flatten()) ytrain.append(example['label']) Xtrain, ytrain = np.asarray(Xtrain), np.asarray(ytrain) Xtrain = Xtrain.astype(float) # set random seed and shuffle the data np.random.seed(1) idx = np.arange(len(ytrain)) np.random.shuffle(idx) Xtrain, ytrain = Xtrain[idx, :], ytrain[idx] Xtrain.shape, ytrain.shape # convert the test set to NumPy arrays and flatten the data
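# Hedged continuation of the comment above, mirroring the training-set loop:
Xtest, ytest = [], []
for example in tfds.as_numpy(test_dataset):
    Xtest.append(example['image'].flatten())
    ytest.append(example['label'])
Xtest, ytest = np.asarray(Xtest), np.asarray(ytest)
Xtest = Xtest.astype(float)
Xtest.shape, ytest.shape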
def train( model: nn.Model, learning_rate: float = None, num_epochs: int = None, seed: int = None, model_dir: Text = None, data_source: Any = None, batch_size: int = None, checkpoints_to_keep: int = None, l2_reg: float = None, ) -> Tuple[Dict[Text, Any], nn.Model]: """Training loop. Args: model: An initialized model to be trained. learning_rate: The learning rate. num_epochs: Train for this many epochs. seed: Seed for shuffling. model_dir: Directory to save best model. data_source: The data source with pre-processed data examples. batch_size: The batch size to use for training and validation data. l2_reg: L2 regularization weight. Returns: A dict with training statistics and the best model. """ rng = jax.random.PRNGKey(seed) optimizer = flax.optim.Adam(learning_rate=learning_rate).create(model) stats = collections.defaultdict(list) best_score = 0. train_batches = input_pipeline.get_shuffled_batches( data_source.train_dataset, batch_size=batch_size, seed=seed) valid_batches = input_pipeline.get_batches(data_source.valid_dataset, batch_size=batch_size) for epoch in range(num_epochs): train_metrics = collections.defaultdict(float) # Train for one epoch. for ex in tfds.as_numpy(train_batches): inputs, lengths, labels = ex['sentence'], ex['length'], ex['label'] optimizer, loss, rng = train_step(optimizer, inputs, lengths, labels, rng, l2_reg) train_metrics['loss'] += loss * inputs.shape[0] train_metrics['total'] += inputs.shape[0] # Evaluate on validation data. optimizer.target is the updated model. valid_metrics = evaluate(optimizer.target, valid_batches) log(stats, epoch, train_metrics, valid_metrics) # Save a checkpoint if this is the best model so far. if valid_metrics['acc'] > best_score: best_score = valid_metrics['acc'] flax.training.checkpoints.save_checkpoint(model_dir, optimizer.target, epoch + 1, keep=checkpoints_to_keep) # Done training. Restore best model. logging.info('Training done! Best validation accuracy: %.2f', best_score) best_model = flax.training.checkpoints.restore_checkpoint(model_dir, model) return stats, best_model
def _generator_from_tfds( dataset_name: str, split_type: str, batch_size: int, epochs: int, dataset_dir: str, preprocessing_fn: Callable, as_supervised: bool = True, supervised_xy_keys=None, download_and_prepare_kwargs=None, variable_length=False, shuffle_files=True, cache_dataset: bool = True, framework: str = "numpy", lambda_map: Callable = None, ) -> Union[ArmoryDataGenerator, tf.data.Dataset]: """ If as_supervised=False, must designate keys as a tuple in supervised_xy_keys: supervised_xy_keys=('video', 'label') # ucf101 dataset if variable_length=True and batch_size > 1: output batches are 1D np.arrays of objects lambda_map - if not None, mapping function to apply to dataset elements """ if not dataset_dir: dataset_dir = paths.runtime_paths().dataset_dir if cache_dataset: _cache_dataset( dataset_dir, dataset_name=dataset_name, ) if framework == "pytorch": logger.warning( "PyTorch Dataset loaders are experimental!! Support for multi-worker loading is still to come." ) if not shuffle_files: raise ValueError( "Armory PyTorch DataLoaders use dareblopy which shuffles reads from TFRecord files by default" ) ds_name, ds_version = dataset_name.split(":") dataset_map = _get_pytorch_dataset_map() if ds_name not in dataset_map.keys(): raise NotImplementedError( f"PyTorch DataLoader for `{ds_name}` not yet available.") ds = dataset_map[ds_name](ds_name, ds_version, split_type, epochs) generator = torch.utils.data.DataLoader(ds, batch_size=batch_size, num_workers=0) else: default_graph = tf.compat.v1.keras.backend.get_session().graph ds, ds_info = tfds.load( dataset_name, split=split_type, as_supervised=as_supervised, data_dir=dataset_dir, with_info=True, download_and_prepare_kwargs=download_and_prepare_kwargs, shuffle_files=shuffle_files, ) if not as_supervised: try: x_key, y_key = supervised_xy_keys except (TypeError, ValueError): raise ValueError( f"When as_supervised=False, supervised_xy_keys must be a (x_key, y_key)" f" tuple, not {supervised_xy_keys}") if not isinstance(x_key, str) or not isinstance(y_key, str): raise ValueError(f"supervised_xy_keys be a tuple of strings," f" not {type(x_key), type(y_key)}") ds = ds.map(lambda x: (x[x_key], x[y_key])) if lambda_map is not None: ds = ds.map(lambda_map) ds = ds.repeat(epochs) if shuffle_files: ds = ds.shuffle(batch_size * 10, reshuffle_each_iteration=True) if variable_length and batch_size > 1: ds = ds.batch(1, drop_remainder=False) else: ds = ds.batch(batch_size, drop_remainder=False) ds = ds.prefetch(tf.data.experimental.AUTOTUNE) if framework == "numpy": ds = tfds.as_numpy(ds, graph=default_graph) generator = ArmoryDataGenerator( ds, size=ds_info.splits[split_type].num_examples, batch_size=batch_size, epochs=epochs, preprocessing_fn=preprocessing_fn, variable_length=bool(variable_length and batch_size > 1), ) elif framework == "tf": generator = ds else: raise ValueError( f"`framework` must be one of ['tf', 'pytorch', 'numpy']. Found {framework}" ) return generator
def tf_dataset(dataset_pars):
    """
    dataset_pars = {"dataset_id": "mnist", "batch_size": 5000, "n_train": 500, "n_test": 500,
                    "out_path": "dataset/vision/mnist2/"}
    tf_dataset(dataset_pars)

    https://www.tensorflow.org/datasets/api_docs/python/tfds

    import tensorflow_datasets as tfds
    import tensorflow as tf

    # Here we assume Eager mode is enabled (TF2), but tfds also works in Graph mode.
    print(tfds.list_builders())

    # Construct a tf.data.Dataset
    ds_train = tfds.load(name="mnist", split="train", shuffle_files=True)

    # Build your input pipeline
    ds_train = ds_train.shuffle(1000).batch(128).prefetch(10)
    for features in ds_train.take(1):
        image, label = features["image"], features["label"]

    NumPy usage with tfds.as_numpy:
    train_ds = tfds.load("mnist", split="train")
    train_ds = train_ds.shuffle(1024).batch(128).repeat(5).prefetch(10)
    for example in tfds.as_numpy(train_ds):
        numpy_images, numpy_labels = example["image"], example["label"]

    You can also use tfds.as_numpy in conjunction with batch_size=-1 to get the full
    dataset in NumPy arrays from the returned tf.Tensor object:
    train_ds = tfds.load("mnist", split=tfds.Split.TRAIN, batch_size=-1)
    numpy_ds = tfds.as_numpy(train_ds)
    numpy_images, numpy_labels = numpy_ds["image"], numpy_ds["label"]

    FeaturesDict({
        'identity_attack': tf.float32,
        'insult': tf.float32,
        'obscene': tf.float32,
        'severe_toxicity': tf.float32,
        'sexual_explicit': tf.float32,
        'text': Text(shape=(), dtype=tf.string),
        'threat': tf.float32,
        'toxicity': tf.float32,
    })
    """
    import tensorflow_datasets as tfds

    d = dataset_pars
    dataset_id = d['dataset_id']
    batch_size = d.get('batch_size', -1)  # -1 means load the full dataset in one batch
    n_train = d.get("n_train", 500)
    n_test = d.get("n_test", 500)
    out_path = path_norm(d['out_path'])
    name = dataset_id.replace(".", "-")
    os.makedirs(out_path, exist_ok=True)

    train_ds = tfds.as_numpy(
        tfds.load(dataset_id, split=f"train[0:{n_train}]", batch_size=batch_size))
    test_ds = tfds.as_numpy(
        tfds.load(dataset_id, split=f"test[0:{n_test}]", batch_size=batch_size))

    # tfds.as_numpy returns an iterable of NumPy batches (or a feature dict when
    # batch_size=-1), neither of which has a .shape attribute, so log the objects.
    print("train", train_ds)
    print("test", test_ds)

    def get_keys(x):
        if "image" in x.keys():
            xkey = "image"
        if "text" in x.keys():
            xkey = "text"
        return xkey

    for x in train_ds:
        # print(x)
        xkey = get_keys(x)
        np.savez_compressed(out_path + f"{name}_train", X=x[xkey], y=x.get('label'))

    for x in test_ds:
        # print(x)
        xkey = get_keys(x)
        np.savez_compressed(out_path + f"{name}_test", X=x[xkey], y=x.get('label'))

    print(out_path, os.listdir(out_path))
def load( split: Split, *, is_training: bool, batch_dims: Sequence[int], dtype: jnp.dtype = jnp.float32, transpose: bool = False, zeros: bool = False, ) -> Generator[Batch, None, None]: """Loads the given split of the dataset.""" if zeros: h, w, c = 224, 224, 3 if transpose: image_dims = (*batch_dims[:-1], h, w, c, batch_dims[0]) else: image_dims = (*batch_dims, h, w, c) batch = { 'images': np.zeros(image_dims, dtype=dtype), 'labels': np.zeros(batch_dims, dtype=np.uint32) } if is_training: yield from it.repeat(batch) else: num_batches = split.num_examples // np.prod(batch_dims) yield from it.repeat(batch, num_batches) if is_training: start, end = _shard(split, jax.host_id(), jax.host_count()) else: start, end = _shard(split, 0, 1) tfds_split = tfds.core.ReadInstruction(_to_tfds_split(split), from_=start, to=end, unit='abs') ds = tfds.load('imagenet2012:5.*.*', split=tfds_split, decoders={'image': tfds.decode.SkipDecoding()}) total_batch_size = np.prod(batch_dims) options = tf.data.Options() options.experimental_threading.private_threadpool_size = 48 options.experimental_threading.max_intra_op_parallelism = 1 options.experimental_optimization.map_parallelization = True if is_training: options.experimental_deterministic = False ds = ds.with_options(options) if is_training: if jax.host_count() > 1: # Only cache if we are reading a subset of the dataset. ds = ds.cache() ds = ds.repeat() ds = ds.shuffle(buffer_size=10 * total_batch_size, seed=0) else: if split.num_examples % total_batch_size != 0: raise ValueError( f'Test/valid must be divisible by {total_batch_size}') def preprocess(example): image = _preprocess_image(example['image'], is_training) label = tf.cast(example['label'], tf.int32) return {'images': image, 'labels': label} ds = ds.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) def transpose_fn(batch): # We use the "double transpose trick" to improve performance for TPUs. Note # that this (typically) requires a matching HWCN->NHWC transpose in your # model code. The compiler cannot make this optimization for us since our # data pipeline and model are compiled separately. batch = dict(**batch) batch['images'] = tf.transpose(batch['images'], (1, 2, 3, 0)) return batch def cast_fn(batch): batch = dict(**batch) batch['images'] = tf.cast(batch['images'], tf.dtypes.as_dtype(dtype)) return batch for i, batch_size in enumerate(reversed(batch_dims)): ds = ds.batch(batch_size) if i == 0: if transpose: ds = ds.map(transpose_fn) # NHWC -> HWCN # NOTE: You may be tempted to move the casting earlier on in the pipeline, # but for bf16 some operations will end up silently placed on the TPU and # this causes stalls while TF and JAX battle for the accelerator. if dtype != jnp.float32: ds = ds.map(cast_fn) ds = ds.prefetch(tf.data.experimental.AUTOTUNE) yield from tfds.as_numpy(ds)
if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") # model = t5.models.HfPyTorchModel("t5-base", "/tmp/hft5/", device) model = t5.models.HfPyTorchModel("t5-base", MODEL_DIR, device) ds = tfds.load( "glue/cola", data_dir=DATA_DIR, # Download data locally for preprocessing to avoid using GCS space. download_and_prepare_kwargs={"download_dir": "./downloads"}) print("A few raw validation examples...") for ex in tfds.as_numpy(ds["validation"].take(2)): print(ex) possible_labels = [0, 1] def randomString(): stringLength = random.randint(1, 15) """Generate a random string of random length """ letters = string.ascii_lowercase return ''.join(random.choice(letters) for i in range(stringLength)) label_map = {} label_set = set()
def evaluation(input_sentence, subword_encoder_en, subword_encoder_zh, model):
    encoded_input = subword_encoder_en.encode(input_sentence)
    encoded_input = tf.expand_dims(encoded_input, 0)  # add a batch dimension
    output = model.predict(encoded_input)
    output = tf.squeeze(output, 0)  # drop the batch dimension again
    print(output.shape)
    decoded_output = subword_encoder_zh.decode(
        [prediction for prediction in np.argmax(output, axis=1)])
    print(decoded_output)


if __name__ == '__main__':
    builder = fetch_data(download_dir, builder_name, config)
    dataset = builder.as_dataset()
    train_data = dataset["train"]
    test_data = dataset["test"]
    train_sentences = tfds.as_numpy(train_data)
    test_sentences = tfds.as_numpy(test_data)
    en_sentences = []
    zh_sentences = []

    # for ex in train_sentences:
    #     if index > 3:
    #         break
    #     index += 1
    #     print("en train sentence {}: {}".format(index, str(ex["en"], encoding="utf-8")))
    #     print("zh train sentence {}: {}".format(index, str(ex["zh"], encoding="utf-8")))

    # index = 0
    # for ex in test_sentences:
    #     if index > 3:
    #         break
    #     index += 1
    #     print("en test sentence {}: {}".format(index, str(ex["en"], encoding="utf-8")))
import cv2
import numpy as np
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

image_size = 0

# ds, info = tfds.load("plant_village", split="train[:80%]", shuffle_files=True,
#                      as_supervised=True, with_info=True)
# ds_test, info = tfds.load("plant_village", split="train[-20%:]", shuffle_files=True,
#                           as_supervised=True, with_info=True)

x_train = []
y_train = []
x_test = []
y_test = []
num_classes = 38

ds = tfds.load("plant_village", split=tfds.Split.TRAIN, batch_size=-1)
ds = tfds.as_numpy(ds)
images, labels = ds["image"], ds["label"]

images_new = []
for i in range(20000):
    img = images[i]
    # TFDS decodes images as RGB, so use the RGB constants when collapsing to
    # grayscale and expanding back to three channels.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    images_new.append(img)

# ds = tfds.load("plant_village", split='train', batch_size=-1, as_supervised=True)
# ds = tfds.as_numpy(ds)
# images, labels = ds["image"], ds["label"]

images_new = np.array(images_new)
x_train, x_test, y_train, y_test = train_test_split(
    images_new[:20000], labels[:20000], test_size=0.2, random_state=42
)
print(x_train.shape, x_test.shape)
import argparse

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications import resnet50

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('batchsize', type=int)
    parser.add_argument('batchcount', type=int)
    args = parser.parse_args()

    ds_all, info = tfds.load('imagenet_resized/32x32',
                             with_info=True,
                             split="train")
    classes = info.features["label"].num_classes
    shape = info.features['image'].shape
    model = resnet50.ResNet50(weights=None, input_shape=shape, classes=classes)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['accuracy'])

    for batch in tfds.as_numpy(
            ds_all.take(args.batchsize * args.batchcount).batch(
                args.batchsize)):
        np_image, np_label = batch["image"], batch["label"]
        model.fit(np_image, np_label, epochs=1, verbose=0)
datetime.now().strftime('%d-%m-%Y-%H:%M:%S'))

DATA_SIZE = 100


def scheduler(epoch):
    lr = float(LEARNING_RATE * tf.math.exp(-(epoch - 1) * LEARNING_RATE_DECAY))
    print(lr)
    return lr


print('loading dataset...')

# Load training data
print('loading training data...')
train_ds = tfds.load(name="svhn_cropped", split=tfds.Split.TRAIN)
train_list = list(tfds.as_numpy(train_ds))

# Load test data
print('loading test data...')
test_ds = tfds.load(name="svhn_cropped", split=tfds.Split.TEST)
test_list = list(tfds.as_numpy(test_ds))

# x_train is the data for training the dataset
# y_train is the set of labels to all the data in x_train
x_train = list()
y_train = list()

# x_test is the data for testing the dataset
# y_test is the set of labels to all the data in x_test
x_test = list()
y_test = list()
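# Likely next step (a sketch, assuming numpy is imported as np elsewhere in this
# script): unpack the example dicts produced by tfds.as_numpy into the lists above.
for example in train_list:
    x_train.append(example['image'])
    y_train.append(example['label'])
for example in test_list:
    x_test.append(example['image'])
    y_test.append(example['label'])

x_train, y_train = np.array(x_train), np.array(y_train)
x_test, y_test = np.array(x_test), np.array(y_test)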
def load_celeba_data_classifier(batch_size):
    #lines = [line.rstrip() for line in open('C:/Users/marku/Desktop/list_attr_celeba.txt', 'r')]
    lines = [line.rstrip() for line in open('/user/student.aau.dk/mjuuln15/list_attr_celeba.txt', 'r')]
    all_attr_names = lines[1].split()
    attr2idx = {}
    idx2attr = {}
    mask = []
    dataset = []
    labels = []

    for i, attr_name in enumerate(all_attr_names):
        attr2idx[attr_name] = i
        idx2attr[i] = attr_name

    lines = lines[2:]
    for i, line in enumerate(lines):
        split = line.split()
        values = split[1:]
        temp_label = []
        has_attribute = False
        for attr_name in ['Arched_Eyebrows', 'Attractive', 'Heavy_Makeup', 'High_Cheekbones', 'Male',
                          'Mouth_Slightly_Open', 'No_Beard', 'Oval_Face', 'Pointy_Nose', 'Smiling',
                          'Wavy_Hair', 'Wearing_Lipstick', 'Young']:
            idx = attr2idx[attr_name]
            if not has_attribute:
                has_attribute = (values[idx] == '1')
            temp_label.append(int(values[attr2idx[attr_name]]))
        if has_attribute:
            labels.append(temp_label)
        mask.append(has_attribute)

    images = glob.glob('C:/Users/marku/Desktop/img_align_celeba/*.jpg')
    #images = glob.glob('/user/student.aau.dk/mjuuln15/img_align_celeba/*.jpg')
    for i in images:
        image = plt.imread(i)
        dataset.append(image)

    mask = np.array(mask)
    dataset = np.array(dataset)
    X1 = dataset[mask]

    X1 = X1[:200000]
    labels = labels[:200000]
    # Take the held-out slice before re-slicing X1 so images stay aligned with
    # the label slices below (L2 pairs with the first 8000, L1 with the rest).
    X2 = X1[:8000]
    X1 = X1[8000:]
    L1 = np.asarray(labels[8000:])
    L2 = np.asarray(labels[:8000])

    L1[L1 == -1] = 0
    L2[L2 == -1] = 0

    X1_num_examples = len(X1)
    X2_num_examples = len(X2)

    X1 = tf.data.Dataset.from_tensor_slices(X1)
    X2 = tf.data.Dataset.from_tensor_slices(tf.convert_to_tensor(X2))
    X1 = X1.map(format_example_to128)
    X2 = X2.map(format_example_to128)
    X1 = tfds.as_numpy(X1)
    X2 = tfds.as_numpy(X2)

    test = []
    for i in X1:
        test.extend(i)
    test = np.asarray(test)
    X1 = np.reshape(test, (X1_num_examples, 128, 128, 3))
    test = []
    for i in X2:
        test.extend(i)
    test = np.asarray(test)
    X2 = np.reshape(test, (X2_num_examples, 128, 128, 3))

    X1 = [X1, L1]
    X2 = [X2, L2]

    return X1, X2
def run_eval(mixture_or_task_name: str, predict_or_score_fn: PredictOrScoreFnCallable, checkpoint_steps: Iterable[int], dataset_fn: Optional[Callable[ [t5.data.Task, Mapping[str, int], int, str, Optional[bool]], tf.data.Dataset]] = None, summary_dir: Optional[str] = None, split: Optional[str] = "validation", sequence_length: Optional[Mapping[str, int]] = None, batch_size: Optional[int] = None): """Run evaluation on the given mixture or task. Args: mixture_or_task_name: str, the name of the Mixture or Task to evaluate on. Must be pre-registered in the global `TaskRegistry` or `MixtureRegistry.` predict_or_score_fn: function, This function takes in the sequence length, checkpoint step, tasks to evaluate, an eval_dataset_fn, a dict mapping task names to cached examples, a dict mapping task names to datasets, and returns a list of outputs or a list of scores. checkpoint_steps: an iterator with integers for checkpoint steps to evaluate on. dataset_fn: function, This function takes a task and returns the dataset associated with it. If None, the default mesh_eval_dataset_fn is used. summary_dir: str, path to write TensorBoard events file summaries for eval. If None, use model_dir/eval_{split}. split: str, the mixture/task split to evaluate on. sequence_length: an integer or a dict from feature-key to integer the sequence length to pad or truncate to, e.g. {"inputs": 512, "targets": 128}. If None, sequence length is automatically computed during eval. batch_size: integer, used only to check that expected padding matches the targets. If None, the check is skipped. """ vocabulary = model_utils.get_vocabulary(mixture_or_task_name) tasks = t5.data.get_subtasks( t5.data.get_mixture_or_task(mixture_or_task_name)) tasks = seqio.evaluation.get_valid_eval_tasks(tasks, split) if not tasks: logging.info( "All provided tasks have metric_fns=[] or no matching splits; " "eval is not possible.") return if not dataset_fn: def _get_task_eval_dataset(task, sequence_length, split): # TODO(sharannarang): Replace with more general function. eval_datasets = mesh_transformer.mesh_eval_dataset_fn( sequence_length=sequence_length, dataset_split=split, mixture_or_task_name=task.name, ) return eval_datasets[0].dataset_fn() dataset_fn = _get_task_eval_dataset summary_writer = None cached_targets, cached_datasets, max_sequence_length = \ seqio.evaluation.get_targets_and_examples( tasks=tasks, dataset_fn=functools.partial( dataset_fn, split=split, sequence_length=None)) if summary_dir: model_utils.write_targets_and_examples(summary_dir, cached_targets, cached_datasets) if sequence_length is None: logging.info("Setting sequence lengths to %s", max_sequence_length) sequence_length = max_sequence_length elif (sequence_length["inputs"] < max_sequence_length["inputs"] or sequence_length["targets"] < max_sequence_length["targets"]): logging.warning( "Given sequence lengths are insufficient for some evaluation inputs " "or targets. These sequences will be truncated to fit, likely " "leading to sub-optimal results. Consider passing `None` for " "sequence_length to have them be automatically computed.\n Got: %s, " "\n Max Lengths:%s", sequence_length, max_sequence_length) elif (sequence_length["inputs"] > max_sequence_length["inputs"] or sequence_length["targets"] > max_sequence_length["targets"]): logging.warning( "Given sequence lengths are longer than necessary for some " "evaluation inputs or targets, resulting in wasted computation. 
" "Consider passing `None` for sequence_length to have them be " "automatically computed.\n Got: %s,\n Max Lengths: %s", sequence_length, max_sequence_length) for step in checkpoint_steps: logging.info("Evaluating checkpoint step: %d", step) outputs = predict_or_score_fn(checkpoint_step=step, vocabulary=vocabulary, tasks=tasks, datasets=cached_datasets, sequence_length=sequence_length) for task in tasks: # Extract the portion of decodes corresponding to this dataset dataset = cached_datasets[task.name] dataset_size = len(cached_targets[task.name]) predictions = [ task.postprocess_fn(d, example=ex) for d, ex in zip( outputs[:dataset_size], tfds.as_numpy(dataset)) ] # Remove the used decodes. del outputs[:dataset_size] if summary_dir: predictions_filename = os.path.join( summary_dir, "{}_{}_predictions".format(task.name, step)) model_utils.write_lines_to_file(predictions, predictions_filename) with tf.Graph().as_default(): if summary_dir: summary_writer = summary_writer or tf.summary.FileWriter( summary_dir) for metric_fn in task.metric_fns: if summary_dir: summary = tf.Summary() targets = cached_targets[task.name] metric_result = metric_fn(targets, predictions) for metric_name, metric_value in metric_result.items(): tag = "eval/{}/{}".format(task.name, metric_name) logging.info("%s at step %d: %.3f", tag, step, metric_value) if summary_dir: summary.value.add(tag=tag, simple_value=metric_value) summary_writer.add_summary(summary, step) # pytype: disable=attribute-error if summary_dir: summary_writer.flush() # pytype: disable=attribute-error # Only padding should remain. if batch_size: expected_pad = -sum(len(t) for t in cached_targets.values()) % batch_size if outputs and len(outputs) != expected_pad: raise ValueError("{} padded outputs, {} expected.".format( len(outputs), expected_pad))
#input_image = preprocess_input(input_image) return input_image, input_mask test_dataset = test_ds.map(load_image_test, num_parallel_calls=AUTOTUNE).batch(args.batch_size) ############################################################################### # Load the best model snapshot and evaluate the quality ############################################################################### model = load_model(args.model_path, compile=False) model.compile(optimizer='adam', loss=combined_loss, metrics=['accuracy']) print('Final test set evaluation:') test_loss, test_accuracy = model.evaluate(tfds.as_numpy(test_dataset), verbose=0, steps=2) print('Test loss: {:.4f}. Test Accuracy: {:.4f}'.format( test_loss, test_accuracy)) print('Displaying some example predictions from the test set') def display(display_list): plt.figure(figsize=(15, 15)) title = ['Input Image', 'True Mask', 'Predicted Mask'] for i in range(len(display_list)): plt.subplot(1, len(display_list), i + 1)
def test_overlap(self): self._write_tfrecord('train', 5, 'abcdefghijkl') ds = self.reader.read('mnist', 'train+train[:2]', self.SPLIT_INFOS) read_data = list(tfds.as_numpy(ds)) self.assertEqual(read_data, [six.b(l) for l in 'abcdefghijklab'])
import tensorflow_datasets as tfds # Create google cloud storage to save the tensorflow datasets. STORAGE_BUCKET = 'gs://CLOUD_STORAGE_BUCKET' data_dir = f'{STORAGE_BUCKET}/data' # Make sure that you put ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar # into the cache_dir. cache_dir = 'IMAGENET_TAR_FILE_DIR/' ds = tfds.load("imagenet2012:5.0.0", split="train", data_dir=data_dir, download_and_prepare_kwargs={ 'download_kwargs': tfds.download.DownloadConfig(manual_dir=cache_dir) }) tfds.as_numpy(ds)
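# tfds.as_numpy(ds) returns an iterable of NumPy example dicts; a minimal sketch
# of consuming a couple of them (keys follow the imagenet2012 feature dict):
for example in tfds.as_numpy(ds.take(2)):
    print(example['image'].shape, example['label'])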
def _materialize(task): list(tfds.as_numpy(TaskRegistry.get_dataset( task, {"inputs": 13, "targets": 13}, "train", use_cached=False)))
import numpy as np import pandas as pd import tensorflow_datasets as tfds if __name__ == '__main__': train_ds = tfds.load('ag_news_subset', split='train', shuffle_files=True) test_ds = tfds.load('ag_news_subset', split='test', shuffle_files=True) texts, labels = [], [] for ds in (train_ds, test_ds): for example in tfds.as_numpy(ds): text, label = example['description'], example['label'] texts.append(text.decode("utf-8")) labels.append(label) labels = np.array(labels) save = pd.DataFrame() save['texts'] = texts save['labels'] = labels save.to_csv('ag_news.csv', index=False)
def _log_mixing_proportions(tasks, datasets, rates, mixed_dataset, sequence_length, compute_stats_empirically): """Log information about the mixing proportions. Called from Mixture.get_dataset. Args: tasks: a list of Task datasets: a list of tf.data.Dataset rates: a list of floats mixed_dataset: a tf.data.Dataset sequence_length: dict from string to int (packed lengths) compute_stats_empirically: a boolean - does not work on TPU """ def _normalize(l): denom = sum(l) return [x / denom for x in l] # compute some stats about the mixture examples_fraction = _normalize(rates) if compute_stats_empirically: stats_examples = 100 mean_inputs_length = [] mean_targets_length = [] for dataset in datasets: inputs_sum = 0 targets_sum = 0 for ex in tfds.as_numpy(dataset.take(stats_examples)): inputs_sum += ex["inputs"].size targets_sum += ex["targets"].size mean_inputs_length.append(inputs_sum / float(stats_examples)) mean_targets_length.append(targets_sum / float(stats_examples)) else: def _estimated_mean_length(task, key): if task.token_preprocessor: return sequence_length[key] else: return min(sequence_length[key], (task.get_cached_stats("train")[key + "_tokens"] / task.get_cached_stats("train")["examples"])) mean_inputs_length = [ _estimated_mean_length(task, "inputs") for task in tasks ] mean_targets_length = [ _estimated_mean_length(task, "targets") for task in tasks ] inputs_fraction = _normalize( [l * r for l, r in zip(mean_inputs_length, rates)]) targets_fraction = _normalize( [l * r for l, r in zip(mean_targets_length, rates)]) logging.info("%12s %12s %12s %12s %12s %12s %s", "rate", "ex.frac.", "inp.frac.", "tgt.frac.", "inp.len.", "tgt.len", "task") for i in range(len(rates)): logging.info("%12g %12g %12g %12g %12g %12g %s", rates[i], examples_fraction[i], inputs_fraction[i], targets_fraction[i], mean_inputs_length[i], mean_targets_length[i], tasks[i].name) if compute_stats_empirically: _log_padding_fractions(mixed_dataset, sequence_length)
plt.savefig(os.path.join(figdir, fname)) import tensorflow as tf from tensorflow import keras import tensorflow_datasets as tfds #tf.enable_eager_execution() # See all registered datasets tfds.list_builders() # Load a given dataset by name, along with the DatasetInfo data, info = tfds.load("mnist", with_info=True) train_data, test_data = data['train'], data['test'] assert isinstance(train_data, tf.data.Dataset) assert info.features['label'].num_classes == 10 assert info.splits['train'].num_examples == 60000 # You can also access a builder directly builder = tfds.builder("mnist") assert builder.info.splits['train'].num_examples == 60000 builder.download_and_prepare() datasets = builder.as_dataset() # If you need NumPy arrays np_datasets = tfds.as_numpy(datasets) #data, info = tfds.load("Imagenet2012", with_info=True) # tfds.image.imagenet.Imagenet2012
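# np_datasets maps split names to iterables of NumPy example dicts; a minimal
# sketch of pulling one example out of the training split:
first_example = next(iter(np_datasets['train']))
print(first_example['image'].shape, first_example['label'])  # (28, 28, 1) and an integer label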
print("args:", args) sc = SparkContext(conf=SparkConf().setAppName("mnist_data_setup")) classpath = os.environ['CLASSPATH'] hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin', 'hadoop') hadoop_classpath = subprocess.check_output( [hadoop_path, 'classpath', '--glob']).decode() os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath mnist, info = tfds.load('mnist', with_info=True, data_dir='hdfs:///hadoop/tfds_datasets') print(info.as_json) # convert to numpy, then RDDs mnist_train = tfds.as_numpy(mnist['train']) mnist_test = tfds.as_numpy(mnist['test']) train_rdd = sc.parallelize(mnist_train, args.num_partitions).cache() test_rdd = sc.parallelize(mnist_test, args.num_partitions).cache() # save as CSV (label,comma-separated-features) def to_csv(example): return str(example['label']) + ',' + ','.join( [str(i) for i in example['image'].reshape(784)]) train_rdd.map(to_csv).saveAsTextFile(args.output + "/csv/train") test_rdd.map(to_csv).saveAsTextFile(args.output + "/csv/test") # save as TFRecords (numpy vs. PNG) # note: the MNIST tensorflow_dataset is already provided as TFRecords but with a PNG bytes_list
def main(_): # Create an environment and grab the spec. raw_environment = bsuite.load_and_record_to_csv( bsuite_id=FLAGS.bsuite_id, results_dir=FLAGS.results_dir, overwrite=FLAGS.overwrite, ) environment = single_precision.SinglePrecisionWrapper(raw_environment) environment_spec = specs.make_environment_spec(environment) # Build demonstration dataset. if hasattr(raw_environment, 'raw_env'): raw_environment = raw_environment.raw_env batch_dataset = bsuite_demonstrations.make_dataset(raw_environment) # Combine with demonstration dataset. transition = functools.partial(_n_step_transition_from_episode, n_step=1, additional_discount=1.) dataset = batch_dataset.map(transition) # Batch and prefetch. dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) dataset = tfds.as_numpy(dataset) # Create the networks to optimize. policy_network = make_policy_network(environment_spec.actions) policy_network = hk.without_apply_rng(hk.transform(policy_network)) # If the agent is non-autoregressive use epsilon=0 which will be a greedy # policy. def evaluator_network(params: hk.Params, key: jnp.DeviceArray, observation: jnp.DeviceArray) -> jnp.DeviceArray: action_values = policy_network.apply(params, observation) return rlax.epsilon_greedy(FLAGS.epsilon).sample(key, action_values) counter = counting.Counter() learner_counter = counting.Counter(counter, prefix='learner') # The learner updates the parameters (and initializes them). learner = learning.BCLearner(network=policy_network, optimizer=optax.adam(FLAGS.learning_rate), obs_spec=environment.observation_spec(), dataset=dataset, counter=learner_counter, rng=hk.PRNGSequence(FLAGS.seed)) # Create the actor which defines how we take actions. variable_client = variable_utils.VariableClient(learner, '') evaluator = actors.FeedForwardActor(evaluator_network, variable_client=variable_client, rng=hk.PRNGSequence(FLAGS.seed)) eval_loop = acme.EnvironmentLoop(environment=environment, actor=evaluator, counter=counter, logger=loggers.TerminalLogger( 'evaluation', time_delta=1.)) # Run the environment loop. while True: for _ in range(FLAGS.evaluate_every): learner.step() learner_counter.increment(learner_steps=FLAGS.evaluate_every) eval_loop.run(FLAGS.evaluation_episodes)
def dataset_as_text(ds): for ex in tfds.as_numpy(ds): yield {k: _maybe_as_text(v) for k, v in ex.items()}
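# _maybe_as_text is defined elsewhere in the original module; a plausible sketch
# of it (an assumption, not the original implementation) plus a usage example:
def _maybe_as_text(v):
    # Decode byte strings to unicode text, leave numeric features untouched.
    return v.decode("utf-8") if isinstance(v, bytes) else v

# Example usage (hypothetical dataset choice):
# ds = tfds.load("ag_news_subset", split="test")
# for ex in dataset_as_text(ds.take(3)):
#     print(ex["label"], ex["description"][:80])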
def train_on_epoch(self, dataset): for i, batch in enumerate(tfds.as_numpy(dataset)): np_x = batch['image'] self.sess.run(self.train_op, {self.x: np_x})