Example No. 1
def train_ensemble(experiment_name, model_params, base_hyperparams):
    data_folder = 'data'
    dataset = 'lung_ct_dataset'

    SEED = 0

    result_save_path = experiment_name + '_' + 'submission.csv'

    tf.random.set_seed(SEED)
    gpu_setup()
    mixed_precision_setup()

    train_folds = tfds.load(
        name=dataset,
        split=[f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 20)],
        download=False,
        shuffle_files=False,
        as_supervised=True,
        data_dir=data_folder
    )
    valid_folds = tfds.load(
        name=dataset,
        split=[f'train[{k}%:{k+10}%]' for k in range(0, 100, 20)],
        download=False,
        shuffle_files=False,
        as_supervised=True,
        data_dir=data_folder
    )
    test_ds_raw, test_info_raw = tfds.load(
        name=dataset,
        split='test',
        download=False,
        shuffle_files=False,
        as_supervised=False,
        with_info=True,
        data_dir=data_folder
    )
    img_ids = tfds.as_dataframe(test_ds_raw, test_info_raw)

    train_ds = preprocess(
        train_folds[0], model_params, batch_size=base_hyperparams['train_batch_size'], ds_type='train')
    valid_ds = preprocess(
        valid_folds[0], model_params, batch_size=base_hyperparams['valid_batch_size'], ds_type='valid')
    test_ds = preprocess(test_ds_raw, model_params,
                         batch_size=base_hyperparams['test_batch_size'], ds_type='test')

    train_ds = train_ds.concatenate(valid_ds)  # train on entire dataset

    train_ds = preprocess_ensemble(train_ds, model_params)
    valid_ds = preprocess_ensemble(valid_ds, model_params)
    test_ds = preprocess_ensemble(test_ds, model_params)

    ensemble_model = ensemble_learn(
        experiment_name, train_ds, valid_ds, model_params, base_hyperparams)
    predicted_labels = evaluate(ensemble_model, test_ds)
    save_results(img_ids, predicted_labels, result_save_path)
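
# Hypothetical invocation of the trainer above. The model_params keys are
# assumptions (preprocess() and ensemble_learn() define what they actually expect);
# only the three batch-size entries of base_hyperparams are read by this function itself.
train_ensemble(
    experiment_name='lung_ct_ensemble_v1',              # illustrative name
    model_params={'img_size': 224, 'num_classes': 2},   # assumed keys
    base_hyperparams={'train_batch_size': 32,
                      'valid_batch_size': 32,
                      'test_batch_size': 32})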
Example No. 2
    def __init__(self):
        peng = tfds.load('penguins/simple', download=True, try_gcs=True)
        dataset_df = tfds.as_dataframe(peng['train'])

        # Drop rows whose 'sex' value is the unknown/NA class (encoded as 2)
        dataset_df = dataset_df.loc[dataset_df['sex'] != 2]

        records = dataset_df.to_dict(orient='records')
        for rec in records:
            ex = {
                'body_mass_g': rec['body_mass_g'],
                'culmen_depth_mm': rec['culmen_depth_mm'],
                'culmen_length_mm': rec['culmen_length_mm'],
                'flipper_length_mm': rec['flipper_length_mm'],
                'island': VOCABS['island'][rec['island']],
                'sex': VOCABS['sex'][rec['sex']],
                'species': VOCABS['species'][rec['species']]
            }
            self._examples.append(ex)
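
# The snippet above relies on a module-level VOCABS lookup that is not shown.
# A plausible sketch, indexed by the integer class codes from the dataframe;
# the exact ordering of the names is an assumption, not taken from the source.
VOCABS = {
    'island': ['Biscoe', 'Dream', 'Torgersen'],
    'sex': ['female', 'male'],
    'species': ['Adelie', 'Chinstrap', 'Gentoo'],
}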
Example No. 3
    Dense(256, activation="relu"),
    Dropout(0.2),
    Dense(256, activation="relu"),
    Dropout(0.2),
    Dense(2, activation=None)
]  # No activation, pure Q-values

learning_rate = 0.00025  # Learning rate
gamma = 0.0  # Discount factor
min_epsilon = 0.5  # Minimum (final) probability of choosing a random action
decay_episodes = episodes // 10  # Number of episodes to decay epsilon from 1.0 to `min_epsilon`

min_class = [1]  # Minority classes
maj_class = [0]  # Majority classes

df = tfds.as_dataframe(*tfds.load("titanic", split='train', with_info=True))
y = df.survived.values
df = df.drop(columns=[
    "survived", "features/boat", "features/cabin", "features/home.dest",
    "features/name", "features/ticket"
])
df = df.astype(np.float64)
df = (df - df.min()) / (
    df.max() - df.min()
)  # Note: strictly, the min/max statistics should be computed after the train/test split (see the sketch below)

X_train, X_test, y_train, y_test = train_test_split(df.to_numpy(),
                                                    y,
                                                    stratify=y,
                                                    test_size=0.2)
X_train, y_train, X_test, y_test, X_val, y_val = get_train_test_val(
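
# As the earlier comment notes, the min/max statistics should really be computed
# on the training split only and then reused for the held-out data. A minimal
# sketch of that order of operations, assuming df still holds the raw
# (un-normalized) features; this is not the original pipeline:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df.to_numpy(), y, stratify=y, test_size=0.2)
col_min, col_max = X_train_raw.min(axis=0), X_train_raw.max(axis=0)
X_train = (X_train_raw - col_min) / (col_max - col_min)
X_test = (X_test_raw - col_min) / (col_max - col_min)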
Example No. 4
def ds_to_texts_and_labels(ds):
    # decode to Python strings, which downstream tokenization needs (see the usage sketch below)
    df = tfds.as_dataframe(ds)
    sentences = df['text'].to_list()
    return [s.decode('utf-8') for s in sentences], np.array(df['label'])
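
# Hedged usage sketch: feed the decoded texts into a Keras TextVectorization layer.
# The dataset name, vocabulary size, and sequence length below are illustrative
# assumptions, not taken from the original script.
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

train_ds = tfds.load('imdb_reviews', split='train[:1%]')  # any dataset with 'text'/'label' features works
texts, labels = ds_to_texts_and_labels(train_ds)

vectorizer = tf.keras.layers.TextVectorization(max_tokens=20000, output_sequence_length=256)
vectorizer.adapt(np.array(texts))        # build the vocabulary from the decoded strings
token_ids = vectorizer(np.array(texts))  # int token matrix of shape (num_examples, 256)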
Example No. 5
def build_dsprites_dataframe(target_path):
    """Recreates the dsprites dataframe from the base tfds version.

    Each image is converted to PNG and written to the 'images' subfolder of the
    specified target_path.

    The dataframe contains the latent values and labels of each example, a
    one-hot encoding of its shape, and the path to the corresponding image.

    Args:
      target_path: Str, path to where the dataframe and images should be saved.

    Returns:
      Location where the dataframe was saved.
    """

    tfds_dataset, tfds_info = tfds.load('dsprites',
                                        split='train',
                                        with_info=True,
                                        shuffle_files=False)
    num_examples = tfds_info.splits['train'].num_examples

    # list the features we care about
    feature_keys = list(tfds_info.features.keys())
    feature_keys.remove('image')
    feature_keys.remove('value_shape')
    feature_keys.remove('label_shape')
    shapes = ['square', 'ellipse', 'heart']

    # helper function to modify how the data is stored in the tf dataset before
    # we convert it to a pandas dataframe
    def pandas_setup(x):
        # encoding the image as a png byte string turns out to be a convenient way
        # of temporarily storing the images until we can write them to disk.
        img = tf.io.encode_png(x['image'])
        latents = {k: x[k] for k in feature_keys}
        latents.update(
            {k: int(x['label_shape'] == i)
             for i, k in enumerate(shapes)})
        latents['png'] = img
        return latents

    temp_ds = tfds_dataset.map(pandas_setup)
    dsprites_df = tfds.as_dataframe(temp_ds)
    dsprites_df = dsprites_df[shapes + feature_keys +
                              ['png']]  # reorder columns

    # setup for saving the pngs to disk
    if os.path.basename(target_path).endswith('.csv'):
        dataset_dir = os.path.dirname(target_path)
        dataframe_location = target_path
    else:
        dataset_dir = target_path
        dataframe_location = os.path.join(target_path, 'dsprites_df.csv')

    images_path = os.path.join(dataset_dir, 'images')
    tf.io.gfile.makedirs(images_path)  # creates any missing parent directories

    padding = len(str(num_examples))
    temp_index = pd.Series(range(num_examples))

    def create_image_paths(x):
        path_to_file = os.path.join(images_path,
                                    str(x).zfill(padding) + '.png')
        return path_to_file

    # create a col in the dataframe for the image file path
    dsprites_df['img_path'] = temp_index.apply(create_image_paths)

    # iterate through the dataframe and save each image to specified folder
    for i, x in dsprites_df.iterrows():
        img = tf.io.decode_image(x['png'])
        with tf.io.gfile.GFile(x['img_path'], 'wb') as f:
            tf.keras.preprocessing.image.save_img(f,
                                                  img.numpy(),
                                                  file_format='PNG')
        if i % 100 == 0:
            logging.info('%s of %s images processed', i + 1, num_examples)

    dsprites_df.drop(columns=['png'], inplace=True)
    logging.info('finished processing images')

    logging.info('conversion complete, saving...')
    with tf.io.gfile.GFile(dataframe_location, 'wb') as f:
        dsprites_df.to_csv(f, index=False)

    # also make a copy so if you screw up the original df you don't have to run
    # the entire generation process again
    _ = data_utils.make_backup(dataframe_location)

    return dataframe_location
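
# Hedged usage sketch: the target directory below is arbitrary, and the call is
# expensive (dsprites has ~737k images), so point it at scratch space.
csv_path = build_dsprites_dataframe('/tmp/dsprites_export')
dsprites_df = pd.read_csv(csv_path)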
Example No. 6
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd


ds, ds_info = tfds.load('cnn_dailymail', split='train', shuffle_files=True, with_info=True)
assert isinstance(ds, tf.data.Dataset)


df = tfds.as_dataframe(ds, ds_info)

df.head()
Example No. 7
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

train = tfds.load("cnn_dailymail", split="train[0:20000]")
validation = tfds.load("cnn_dailymail", split="validation[0:3000]")
test = tfds.load("cnn_dailymail", split="test[0:5000]")

train_df = tfds.as_dataframe(train)
validation_df = tfds.as_dataframe(validation)
test_df = tfds.as_dataframe(test)

# Decode the byte-string 'article' and 'highlights' columns to regular strings


def bytes_to_str(dataframe):
    '''
    Decode the byte-string 'article' and 'highlights' columns of a dataframe
    into regular UTF-8 strings.
    '''
    dataframe['article'] = dataframe['article'].str.decode("utf-8")
    dataframe['highlights'] = dataframe['highlights'].str.decode("utf-8")
    return dataframe


if __name__ == "__main__":

    # Create a list of dataframes and convert byte data to string
    df_list = [train_df, test_df, validation_df]

    for dframe in df_list:
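        # The loop body is truncated in the source excerpt; a plausible continuation
        # is simply to run each dataframe through bytes_to_str, which decodes the
        # columns in place:
        bytes_to_str(dframe)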
Example No. 8
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt',
    'Sneaker', 'Bag', 'Ankle boot'
]


def get_name(id):
    return class_names[id]


num_train_examples = metadata.splits['train'].num_examples
num_test_examples = metadata.splits['test'].num_examples
print("Number of training examples: {}".format(num_train_examples))
print("Number of test examples:     {}".format(num_test_examples))

tfds.as_dataframe(train_dataset.take(5), metadata)

# ### Normalize

# # Preprocess the data

# The value of each pixel in the image data is an integer in the range `[0,255]`.
# For the model to work properly, these values need to be normalized to the range `[0,1]`.
# So here we create a normalization function and then apply it to each image in the train and test datasets (the map call follows the function below).


def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels
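
# The excerpt stops before the function is applied; a minimal sketch of the usual
# map call, assuming train_dataset and test_dataset are the supervised
# (image, label) datasets loaded earlier in this script:
train_dataset = train_dataset.map(normalize)
test_dataset = test_dataset.map(normalize)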
Example No. 9
#looking at just one sample of our data
pt = data_train.take(1)
# type(pt)

#can convert this TakeDataset object to a numpy array (can do this for the whole dataset too)
print("Features and label for first entry")
for features, label in tfds.as_numpy(pt):
    print(features,label)

#we want to load the dataset as a dictionary of tf.Tensors (tuples can't be converted to a dataframe)
data_train_white = tfds.load('wine_quality/white',split='train')
data_train_red = tfds.load('wine_quality/red',split='train')

#transform dictionary to dataframe - combining red and white wine
df_white = tfds.as_dataframe(data_train_white)
df_red = tfds.as_dataframe(data_train_red)
df = pd.concat([df_white,df_red])

print('number of samples',len(df['quality']))

#what are our output possibilities?
print('possible wine quality ratings',df['quality'].unique())


#do we have any missing data (empty or NaN entries in features or labels)?
dataNans = df.isnull().values.any()
if not dataNans:
    print("all good!")

#it's helpful to separate our input features from our target (quality), as sketched below
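# A minimal sketch of that separation; tfds.as_dataframe flattens the nested
# wine_quality features, so the remaining input columns are named 'features/...':
y = df['quality']
X = df.drop(columns=['quality'])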
Example No. 10
print(tf.config.list_logical_devices())

from datetime import datetime

print(tf.__version__)

from typing import Dict, Text
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Ratings data.
ratings = tfds.load('movielens/100k-ratings', split="train")
# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split="train")

x = tfds.as_dataframe(ratings)
y = tfds.as_dataframe(movies)
print(x)
print(y)

# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_id": tf.strings.to_number(x["movie_id"]),
    "user_id": tf.strings.to_number(x["user_id"])
})
movies = movies.map(lambda x: tf.strings.to_number(x["movie_id"]))

x = tfds.as_dataframe(ratings)
y = tfds.as_dataframe(movies)
print(x)
print(y)
Example No. 11
data_dir = 'D:\\Sandbox\\Github\\DATA_TFDS'

dataset, info = tfds.load(
    name="cifar10",
    data_dir=data_dir,
    with_info=True,
    as_supervised=True,  # return (image, label) tuples instead of a feature dict
    shuffle_files=True,
    download=False)

print(info.features["label"].names)
print(info.features["label"].int2str(7))

train_dataset, test_dataset = dataset['train'], dataset['test']

tfds.as_dataframe(train_dataset.take(5), info)


def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels


# ### Prepare data for Model

BATCH_SIZE = 64
TRAIN_SIZE = len(train_dataset)  # used as the shuffle buffer size; cap it (e.g. at 1000) if memory is limited
# if repeat() is used, model.fit() also needs steps_per_epoch; a sketch without repeat() follows
# train_dataset = #train_dataset.cache().repeat().shuffle(TRAIN_SIZE).batch(BATCH_SIZE).prefetch(TRAIN_SIZE)
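
# A hedged sketch of the pipeline the commented-out line hints at, written
# without repeat() so Keras can infer the epoch length on its own:
train_dataset = (train_dataset
                 .map(normalize)
                 .cache()
                 .shuffle(TRAIN_SIZE)
                 .batch(BATCH_SIZE)
                 .prefetch(tf.data.AUTOTUNE))
test_dataset = test_dataset.map(normalize).batch(BATCH_SIZE)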