def train_ensemble(experiment_name, model_params, base_hyperparams):
    data_folder = 'data'
    dataset = 'lung_ct_dataset'
    SEED = 0
    result_save_path = experiment_name + '_' + 'submission.csv'

    tf.random.set_seed(SEED)
    gpu_setup()
    mixed_precision_setup()

    train_folds = tfds.load(
        name=dataset,
        split=[f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 20)],
        download=False,
        shuffle_files=False,
        as_supervised=True,
        data_dir=data_folder)
    valid_folds = tfds.load(
        name=dataset,
        split=[f'train[{k}%:{k+10}%]' for k in range(0, 100, 20)],
        download=False,
        shuffle_files=False,
        as_supervised=True,
        data_dir=data_folder)
    test_ds_raw, test_info_raw = tfds.load(
        name=dataset,
        split='test',
        download=False,
        shuffle_files=False,
        as_supervised=False,
        with_info=True,
        data_dir=data_folder)

    img_ids = tfds.as_dataframe(test_ds_raw, test_info_raw)

    train_ds = preprocess(
        train_folds[0], model_params,
        batch_size=base_hyperparams['train_batch_size'], ds_type='train')
    valid_ds = preprocess(
        valid_folds[0], model_params,
        batch_size=base_hyperparams['valid_batch_size'], ds_type='valid')
    test_ds = preprocess(
        test_ds_raw, model_params,
        batch_size=base_hyperparams['test_batch_size'], ds_type='test')

    train_ds = train_ds.concatenate(valid_ds)  # train on the entire dataset

    train_ds = preprocess_ensemble(train_ds, model_params)
    valid_ds = preprocess_ensemble(valid_ds, model_params)
    test_ds = preprocess_ensemble(test_ds, model_params)

    ensemble_model = ensemble_learn(
        experiment_name, train_ds, valid_ds, model_params, base_hyperparams)
    predicted_labels = evaluate(ensemble_model, test_ds)
    save_results(img_ids, predicted_labels, result_save_path)
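# `save_results` is not defined in this snippet; below is a minimal sketch of
# what it might look like, assuming `pandas as pd` is imported, `img_ids` is
# the test-split dataframe returned by tfds.as_dataframe above, and the 'id'
# column name is a hypothetical placeholder for the dataset's identifier field.
def save_results(img_ids, predicted_labels, result_save_path):
    submission = pd.DataFrame({
        'id': img_ids['id'],        # hypothetical identifier column
        'label': predicted_labels,  # one prediction per test example
    })
    submission.to_csv(result_save_path, index=False)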
def __init__(self):
    self._examples = []

    peng = tfds.load('penguins/simple', download=True, try_gcs=True)
    dataset_df = tfds.as_dataframe(peng['train'])

    # Filter out invalid rows
    dataset_df = dataset_df.loc[dataset_df['sex'] != 2]

    records = dataset_df.to_dict(orient='records')
    for rec in records:
        ex = {
            'body_mass_g': rec['body_mass_g'],
            'culmen_depth_mm': rec['culmen_depth_mm'],
            'culmen_length_mm': rec['culmen_length_mm'],
            'flipper_length_mm': rec['flipper_length_mm'],
            'island': VOCABS['island'][rec['island']],
            'sex': VOCABS['sex'][rec['sex']],
            'species': VOCABS['species'][rec['species']]
        }
        self._examples.append(ex)
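# `VOCABS` is not shown in the snippet above; a minimal sketch of the kind of
# lookup tables it might contain, mapping the integer class labels from
# penguins/simple to readable strings. The names and ordering here are
# illustrative assumptions, not the dataset's authoritative label order.
VOCABS = {
    'island': {0: 'Biscoe', 1: 'Dream', 2: 'Torgersen'},
    'sex': {0: 'female', 1: 'male'},
    'species': {0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'},
}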
Dense(256, activation="relu"), Dropout(0.2), Dense(256, activation="relu"), Dropout(0.2), Dense(2, activation=None) ] # No activation, pure Q-values learning_rate = 0.00025 # Learning rate gamma = 0.0 # Discount factor min_epsilon = 0.5 # Minimal and final chance of choosing random action decay_episodes = episodes // 10 # Number of episodes to decay from 1.0 to `min_epsilon`` min_class = [1] # Minority classes maj_class = [0] # Majority classes df = tfds.as_dataframe(*tfds.load("titanic", split='train', with_info=True)) y = df.survived.values df = df.drop(columns=[ "survived", "features/boat", "features/cabin", "features/home.dest", "features/name", "features/ticket" ]) df = df.astype(np.float64) df = (df - df.min()) / ( df.max() - df.min() ) # Normalization should happen after splitting train and test sets X_train, X_test, y_train, y_test = train_test_split(df.to_numpy(), y, stratify=y, test_size=0.2) X_train, y_train, X_test, y_test, X_val, y_val = get_train_test_val(
def ds_to_texts_and_labels(ds):
    # necessary for tokenization
    df = tfds.as_dataframe(ds)
    sentences = df['text'].to_list()
    return [s.decode('utf-8') for s in sentences], np.array(df['label'])
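# A brief usage sketch for the helper above, assuming a text-classification
# dataset with 'text' and 'label' features; the choice of 'imdb_reviews' here
# is an illustrative assumption.
train_ds = tfds.load('imdb_reviews', split='train')
train_texts, train_labels = ds_to_texts_and_labels(train_ds)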
def build_dsprites_dataframe(target_path):
  """Recreates the dsprites dataframe from base tfds version.

  Each image is converted to png and written to the 'images' subfolder of the
  specified target_path. The dataframe contains the latent values and labels
  of each example, a one-hot encoding of its shape, and the path to the
  corresponding image.

  Args:
    target_path: Str, path to where the dataframe and images should be saved.

  Returns:
    Location where dataframe was saved.
  """
  tfds_dataset, tfds_info = tfds.load('dsprites', split='train',
                                      with_info=True, shuffle_files=False)
  num_examples = tfds_info.splits['train'].num_examples

  # list the features we care about
  feature_keys = list(tfds_info.features.keys())
  feature_keys.remove('image')
  feature_keys.remove('value_shape')
  feature_keys.remove('label_shape')
  shapes = ['square', 'ellipse', 'heart']

  # helper function to modify how the data is stored in the tf dataset before
  # we convert it to a pandas dataframe
  def pandas_setup(x):
    # encoding the image as a png byte string turns out to be a convenient way
    # of temporarily storing the images until we can write them to disk.
    img = tf.io.encode_png(x['image'])
    latents = {k: x[k] for k in feature_keys}
    latents.update(
        {k: int(x['label_shape'] == i) for i, k in enumerate(shapes)})
    latents['png'] = img
    return latents

  temp_ds = tfds_dataset.map(pandas_setup)
  dsprites_df = tfds.as_dataframe(temp_ds)
  dsprites_df = dsprites_df[shapes + feature_keys + ['png']]  # reorder columns

  # setup for saving the pngs to disk
  if os.path.basename(target_path).endswith('.csv'):
    dataset_dir = os.path.dirname(target_path)
    dataframe_location = target_path
  else:
    dataset_dir = target_path
    dataframe_location = os.path.join(target_path, 'dsprites_df.csv')

  images_path = os.path.join(dataset_dir, 'images')
  tf.io.gfile.makedirs(images_path)  # creates any missing parent directories

  padding = len(str(num_examples))
  temp_index = pd.Series(range(num_examples))

  def create_image_paths(x):
    path_to_file = os.path.join(images_path, str(x).zfill(padding) + '.png')
    return path_to_file

  # create a col in the dataframe for the image file path
  dsprites_df['img_path'] = temp_index.apply(create_image_paths)

  # iterate through the dataframe and save each image to specified folder
  for i, x in dsprites_df.iterrows():
    img = tf.io.decode_image(x['png'])
    with tf.io.gfile.GFile(x['img_path'], 'wb') as f:
      tf.keras.preprocessing.image.save_img(f, img.numpy(), file_format='PNG')
    if i % 100 == 0:
      logging.info('%s of %s images processed', i + 1, num_examples)
  dsprites_df.drop(columns=['png'], inplace=True)
  logging.info('finished processing images')

  logging.info('conversion complete, saving...')
  with tf.io.gfile.GFile(dataframe_location, 'wb') as f:
    dsprites_df.to_csv(f, index=False)

  # also make a copy so if you screw up the original df you don't have to run
  # the entire generation process again
  _ = data_utils.make_backup(dataframe_location)
  return dataframe_location
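# A brief usage sketch for the function above. The output path is an
# illustrative assumption, and `data_utils.make_backup` must be available;
# note that this processes the full dsprites training split.
csv_path = build_dsprites_dataframe('/tmp/dsprites')
dsprites_df = pd.read_csv(csv_path)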
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

ds, ds_info = tfds.load('cnn_dailymail', split='train', shuffle_files=True,
                        with_info=True)
assert isinstance(ds, tf.data.Dataset)

df = tfds.as_dataframe(ds, ds_info)
df.head()
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

train = tfds.load("cnn_dailymail", split="train[0:20000]")
validation = tfds.load("cnn_dailymail", split="validation[0:3000]")
test = tfds.load("cnn_dailymail", split="test[0:5000]")

train_df = tfds.as_dataframe(train)
validation_df = tfds.as_dataframe(validation)
test_df = tfds.as_dataframe(test)


# Decoding bytes to string in pandas in both columns
def bytes_to_str(dataframe):
    '''
    Takes a dataframe and converts its 'article' and 'highlights' byte columns
    into proper strings
    '''
    dataframe['article'] = dataframe['article'].str.decode("utf-8")
    dataframe['highlights'] = dataframe['highlights'].str.decode("utf-8")
    return dataframe


if __name__ == "__main__":
    # Create a list of dataframes and convert byte data to string
    df_list = [train_df, test_df, validation_df]
    for dframe in df_list:
        bytes_to_str(dframe)
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt',
    'Sneaker', 'Bag', 'Ankle boot'
]


def get_name(id):
    return class_names[id]


num_train_examples = metadata.splits['train'].num_examples
num_test_examples = metadata.splits['test'].num_examples
print("Number of training examples: {}".format(num_train_examples))
print("Number of test examples: {}".format(num_test_examples))

tfds.as_dataframe(train_dataset.take(5), metadata)

# ### Normalize
#
# Preprocess the data
# The value of each pixel in the image data is an integer in the range `[0,255]`.
# For the model to work properly, these values need to be normalized to the range `[0,1]`.
# So here we create a normalization function, and then apply it to each image in the test and train datasets.


def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels
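# The application of `normalize` mentioned in the comment above is not shown
# in this snippet; a minimal sketch of that step, assuming `train_dataset` and
# `test_dataset` are the (image, label) datasets loaded earlier:
train_dataset = train_dataset.map(normalize)
test_dataset = test_dataset.map(normalize)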
# looking at just one sample of our data
pt = data_train.take(1)
# type(pt)

# can convert this TakeDataset object to a numpy array (can do this for the whole dataset too)
print("Features and label for first entry")
for features, label in tfds.as_numpy(pt):
    print(features, label)

# we want to load the dataset as a dictionary of tf.Tensors (can't transform tuples to a dataframe)
data_train_white = tfds.load('wine_quality/white', split='train')
data_train_red = tfds.load('wine_quality/red', split='train')

# transform dictionary to dataframe - combining red and white wine
df_white = tfds.as_dataframe(data_train_white)
df_red = tfds.as_dataframe(data_train_red)
df = pd.concat([df_white, df_red])
print('number of samples', len(df['quality']))

# what are our output possibilities?
print('possible wine quality ratings', df['quality'].unique())

# do we have any missing data (empty or NaN entries in features or labels)?
dataNans = df.isnull().values.any()
if not dataNans:
    print("all good!")

# it's helpful to separate our input features from our target feature (quality)
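# A minimal sketch of that separation, assuming the combined wine dataframe
# `df` from above, where every column other than 'quality' is treated as an
# input feature:
y = df['quality']
X = df.drop(columns=['quality'])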
print(tf.config.list_logical_devices())

from datetime import datetime

print(tf.__version__)

from typing import Dict, Text

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Ratings data.
ratings = tfds.load('movielens/100k-ratings', split="train")
# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split="train")

x = tfds.as_dataframe(ratings)
y = tfds.as_dataframe(movies)
print(x)
print(y)

# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_id": tf.strings.to_number(x["movie_id"]),
    "user_id": tf.strings.to_number(x["user_id"])
})
movies = movies.map(lambda x: tf.strings.to_number(x["movie_id"]))

x = tfds.as_dataframe(ratings)
y = tfds.as_dataframe(movies)
print(x)
print(y)
data_dir = 'D:\\Sandbox\\Github\\DATA_TFDS'

dataset, info = tfds.load(
    name="cifar10",
    data_dir=data_dir,
    with_info=True,
    as_supervised=True,  # returns (image, label) tuples instead of a feature dict
    shuffle_files=True,
    download=False)

print(info.features["label"].names)
print(info.features["label"].int2str(7))

train_dataset, test_dataset = dataset['train'], dataset['test']

tfds.as_dataframe(train_dataset.take(5), info)


def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels


# ### Prepare data for Model

BATCH_SIZE = 64
TRAIN_SIZE = len(train_dataset)  # memory max = 1000

# if repeat() is used, model.fit needs steps_per_epoch
# train_dataset = train_dataset.cache().repeat().shuffle(TRAIN_SIZE).batch(BATCH_SIZE).prefetch(TRAIN_SIZE)
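# A sketch of how the commented-out pipeline above could be enabled, assuming
# training with model.fit for a fixed number of steps per epoch; repeat() is
# omitted here and the buffer sizes are illustrative choices.
train_dataset = (train_dataset
                 .map(normalize)
                 .cache()
                 .shuffle(TRAIN_SIZE)
                 .batch(BATCH_SIZE)
                 .prefetch(tf.data.AUTOTUNE))
test_dataset = test_dataset.map(normalize).batch(BATCH_SIZE)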