Example #1
def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.0001, output='model', dataset=None):
    img_shape = (image_size, image_size, 3)

    info('Loading Data Set')
    # load dataset
    train, test, val, labels = load_dataset(data_path, dataset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels))))

    train_ds = train_ds.map(map_func=process_image, 
                            num_parallel_calls=5)

    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())

    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.ResNet50(input_shape=img_shape,
                                               include_top=False, 
                                               weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train)/batch_size)
    history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # save model
    info('Saving Model')
    
    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))

    # add time prefix folder
    #stamp = datetime.now().strftime('%y_%m_%d_%H_%M.h5')
    #stamped = str(Path(output).joinpath(stamp))
    file_output = str(Path(output).joinpath('latest.h5'))
    #print('Serializing model to:\n{}\n{}'.format(stamped, output)
    model.save(file_output)
Example #2
File: data.py Project: Danealor/denoiser
def apply_all(*sources):
    if len(sources) == 1:
        return func(sources[0])
    res = tuple(
        func(*source) if isinstance(source, tuple) else func(source)
        for source in sources)
    if all(isinstance(r, Dataset) for r in res):
        res = Dataset.zip(res)
    return res
Example #3
File: curve2svg.py Project: Jephthia/NNs
        def parse_ds(file_name):
            x_train = tfio.v0.IODataset.from_hdf5(file_name,
                                                  dataset='/x_train',
                                                  spec=tf.float32)
            y_train = tfio.v0.IODataset.from_hdf5(file_name,
                                                  dataset='/y_train',
                                                  spec=tf.float32)

            return Dataset.zip((x_train, y_train))
Example #4
    def load_data(self):
        img_ds = self.get_img_ds()
        svg_ds = self.get_svg_ds()

        # Zip the sequences together, then shuffle and prefetch
        ds = Dataset.zip((svg_ds, img_ds))
        ds = ds.shuffle(C.BUFFER_SIZE)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

        return ds
Example #5
File: adain.py Project: Jephthia/NNs
def get_dataset():
    def parse_img(file_name):
        img = tf.io.read_file(file_name)
        img = tf.io.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, [224, 224],
                              antialias=True,
                              method='nearest')
        img = tf.cast(img, tf.float32)
        img = vgg19_preprocess_input(img) / 255.0

        return img

    def build_ds(file_names):
        tmp_ds = Dataset.from_tensor_slices(file_names)
        tmp_ds = tmp_ds.shuffle(len(file_names))
        tmp_ds = tmp_ds.map(parse_img, num_parallel_calls=tf.data.AUTOTUNE)

        return tmp_ds

    content_ds = build_ds(
        glob.glob('/home/jephthia/datasets/mscoco/unlabeled2017/train/*')[:1])
    style_ds = build_ds(
        glob.glob('/home/jephthia/datasets/wikiart/train/*')[:1])
    val_content_ds = build_ds(
        glob.glob('/home/jephthia/datasets/mscoco/unlabeled2017/validate/*')
        [:2500])
    val_style_ds = build_ds(
        glob.glob('/home/jephthia/datasets/wikiart/validate/*')[:2500])

    # Train dataset
    ds = Dataset.zip((content_ds, style_ds))
    ds = ds.batch(BATCH_SIZE)
    #     ds = ds.prefetch(2)

    # Validation dataset
    val_ds = Dataset.zip((val_content_ds, val_style_ds))
    val_ds = val_ds.batch(BATCH_SIZE)
    #     val_ds = val_ds.cache()
    #     val_ds = val_ds.prefetch(2)

    return ds, val_ds
Example #6
def get_data(dataset):
    if dataset == 'cifar100':
        from tensorflow.keras.datasets import cifar100
        (x_tr, y_tr), (x_te, y_te) = cifar100.load_data()
    elif dataset == 'cifar10':
        from tensorflow.keras.datasets import cifar10
        (x_tr, y_tr), (x_te, y_te) = cifar10.load_data()

    preprocesses = ([todtype, normalize], [ohe])

    x_te, y_te = preprocess(x_te, y_te, preprocesses)
    x_tr, y_tr = preprocess(x_tr, y_tr, preprocesses)

    tr_ds_x = Dataset.from_tensor_slices(x_tr)
    tr_ds_y = Dataset.from_tensor_slices(y_tr)
    te_ds_x = Dataset.from_tensor_slices(x_te)
    te_ds_y = Dataset.from_tensor_slices(y_te)

    tr_ds = Dataset.zip((tr_ds_x, tr_ds_y)).shuffle(1000).batch(128)
    te_ds = Dataset.zip((te_ds_x, te_ds_y)).batch(128)

    return tr_ds, te_ds
Example #7
def generate_tf_data(enc_input: list, dec_input: list, batch_size: int,
                     train_size: int, val_size: int) -> [Dataset]:
    '''Generates a TensorFlow dataset and splits it into training and validation sets.

    Problem: feeding in three arrays of almost two million sequences each requires too much main memory.
    Solution: we use the TensorFlow Dataset API, so the model can be fed slices of the whole dataset.

    Also: shuffles the observations.

    Args:
        enc_input: encoder input ids, i.e. token ids for each word in each sentence.
        dec_input: used for teacher forcing; token ids for each word in each sentence of the target language.
            More specifically:
                - decoder input, token sequences (index 0 in dec_input)
                - decoder target output, token sequences shifted by one step for teacher forcing (index 1 in dec_input)
        batch_size: number of observations passed to the Seq2Seq model in each batch during training.
        train_size: fraction of all observations reserved for training the model.
        val_size: fraction of all observations reserved for evaluating the model during training.
    Returns:
        train_data: contains encoder_input, decoder_input, decoder_target_output for training the model.
        val_data: contains encoder_input, decoder_input, decoder_target_output for evaluating the model.
    '''

    assert train_size + val_size == 1, "Train and validation sizes don't sum to 1!"

    data_size = enc_input[0].shape[0]

    # Summarize the source language token ids and the decoder input as: model_input
    model_input = Dataset.from_tensor_slices((enc_input[0], dec_input[0]))
    #                                         enc_token_ids dec_token_ids

    # convert decoder_target_output to TF.Dataset
    decoder_target_output = Dataset.from_tensor_slices((dec_input[1]))
    #                                            dec_token_ids used as target output (shifted by one observation)

    # Combine the model_input and the decoder_target_output to a full TF.Dataset, shuffle it
    full_data = Dataset.zip(
        (model_input, decoder_target_output)).shuffle(data_size)

    # Train Val split
    train_size = int(train_size * data_size)
    val_size = int(val_size * data_size)

    train_data = full_data.take(train_size)
    val_data = full_data.skip(train_size)

    train_data = train_data.batch(batch_size, drop_remainder=True)
    val_data = val_data.batch(batch_size, drop_remainder=True)

    return train_data, val_data
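
# A minimal, hypothetical usage sketch of generate_tf_data above. The vocabulary
# size, sequence length and sample count below are illustrative assumptions, not
# values from the original project.
import numpy as np

enc_token_ids = np.random.randint(0, 8000, size=(1000, 40))   # encoder input ids
dec_token_ids = np.random.randint(0, 8000, size=(1000, 40))   # decoder input ids
dec_target_ids = np.roll(dec_token_ids, -1, axis=1)           # target: shifted by one token

train_data, val_data = generate_tf_data(enc_input=[enc_token_ids],
                                        dec_input=[dec_token_ids, dec_target_ids],
                                        batch_size=64,
                                        train_size=0.9,
                                        val_size=0.1)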
Example #8
def ds_rndm() -> Tuple[Dataset, Dataset, int, int, int]:
    # Hardcoded values taken from MNIST
    num_classes = 10
    m_train = 60000
    m_test = 10000
    # Random noise
    ds_image = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train, 28, 28, 1], maxval=255, dtype=tf.int32)))
    # maxval is exclusive, so num_classes gives labels in 0..num_classes-1
    ds_label = Dataset.from_tensor_slices((tf.random_uniform([m_train],
                                                             maxval=num_classes,
                                                             dtype=tf.int64)))
    ds_train = Dataset.zip((ds_image, ds_label))
    ds_test = ds_train.take(m_test)

    return ds_train, ds_test, num_classes, m_train, m_test
Example #9
    def run(self, n_iterations=1):
        for itr in range(n_iterations):
            samples = self.sampler.partial_rollout(max_steps=self.T)

            # Using zip instead of from_tensor_slices because the latter needs
            # all tensors to have the same type, and actions may be integers
            self.train.dataset = Dataset.zip((
                    Dataset.from_tensor_slices(samples['observations']),
                    Dataset.from_tensor_slices(samples['actions']),
                    Dataset.from_tensor_slices(samples['advantages']),
                    Dataset.from_tensor_slices(samples['value_targets'])
            )).batch(self.batch_size)

            print('Training...')
            self.train.run(self.n_epochs)
            self.run_callbacks('post-iteration')
            self.old_policy.net.set_weights(self.policy.net.get_weights())
            self.sampler.send('policy_params', self.policy.net.get_weights())
            self.sampler.send('value_fn_params', self.value_fn.get_weights())
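
# A minimal standalone sketch of the pattern used in run() above: Dataset.zip
# combines per-quantity datasets whose elements have different dtypes. The toy
# tensors below are illustrative, not taken from the original sampler output.
import tensorflow as tf

observations = tf.data.Dataset.from_tensor_slices(tf.zeros([8, 4], tf.float32))
actions = tf.data.Dataset.from_tensor_slices(tf.zeros([8], tf.int32))
advantages = tf.data.Dataset.from_tensor_slices(tf.zeros([8], tf.float32))

toy_ds = tf.data.Dataset.zip((observations, actions, advantages)).batch(4)
# each element is a (float32, int32, float32) tuple of batched tensors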
    def test_pipeline(self, num_threads):
        real_fname = os.path.join(self.dataset_path, 'test_real.txt')

        # extract directories
        real_dir, inst_dir = self.real_dir, self.inst_dir

        # count lines
        num_real = count_lines(real_fname)

        # dataset creation
        with tf.name_scope('dataset'):
            real = TextLineDataset(real_fname)

            # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
            #synt.apply(shuffle_and_repeat(buffer_size = num_synt)) #, count = 1))
            #real.apply(shuffle_and_repeat(buffer_size = num_real)) #, count = ceil(ratio)))

            real = real.shuffle(num_real) # no repetition! .repeat()

            # real data only
            augment = 0 # self.params.get('augment', 0)
            def name2real(name):
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                if augment:
                    src_dir = self.params.get('augment_src', 'best')
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                    pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                    if isinstance(src_dir, float):
                        pnts *= src_dir
                    self.params['augment_scale'] = 0.
                    real = random_crop(full, pnts, self.params)
                else:
                    real = read_image(os.path.join(real_dir, '160x160', 'gray', name.decode() + '.jpg'))
                return real, inst, name.decode()
            real = real.map(lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32, tf.string])), num_parallel_calls = num_threads)

            #dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
            dataset = Dataset.zip({ 'real': real })
            dataset = dataset.batch(self.batch_size, drop_remainder = True) # we need full batches!
            dataset = dataset.prefetch(self.batch_size * 2)
            return dataset
def input(dataset, mode, params, genre=None):
    uid_max = FLAGS.uid_max
    parse_py_fn_ = lambda line: parse_py_fn(line, mode)
    if mode.startswith('train'):
        dataset = dataset.map(
            lambda line: tf.py_func(parse_py_fn_, [line], tf.float32))
    else:
        dataset = dataset.map(lambda line: tf.py_func(
            parse_py_fn_, [line], [tf.float32, tf.float32]))

    if mode.endswith("genre"):
        dataset = dt.zip((dataset, genre))

    dataset = dataset.cache()
    if mode.startswith('train'):
        dataset = dataset.shuffle(buffer_size=100 * params['batch_size'])
    dataset = dataset.batch(params['batch_size'])
    if mode.startswith('train'):
        dataset = dataset.repeat(params['repeat_times'])
    else:
        dataset = dataset.repeat()
        pass
    iterator = dataset.make_one_shot_iterator()
    if mode.startswith('train'):
        if mode.endswith("genre"):
            mat, genre_mat = iterator.get_next()
            return mat, genre_mat
        else:
            mat = iterator.get_next()
            return mat
    else:
        if mode.endswith("genre"):
            mat, genre_mat = iterator.get_next()
            mat1, mat2 = mat
            return mat1, mat2, genre_mat
        else:
            mat1, mat2 = iterator.get_next()
            return mat1, mat2
Example #12
    def dataset(self):
        return Dataset.zip((self.images, self.labels))
    def pipeline(self, name, num_threads):
        if not self.params.get('training', 1):
            return None
        synt_fname = os.path.join(self.dataset_path, name + '_synt.txt')
        real_fname = os.path.join(self.dataset_path, name + '_real.txt')
        unsup_fname = os.path.join(self.dataset_path, 'train_unsup.txt')

        num_synt, num_real, num_unsup = [count_lines(fname) for fname in [synt_fname, real_fname, unsup_fname]]
        ratio = num_synt / float(num_real)

        # extract directories
        fake_dirs, real_dir, inst_dir = self.fake_dirs, self.real_dir, self.inst_dir

        # dataset creation
        with tf.name_scope('dataset'):
            synt, real, unsup = [TextLineDataset(name) for name in [synt_fname, real_fname, unsup_fname]]

            # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
            #synt.apply(shuffle_and_repeat(buffer_size = num_synt)) #, count = 1))
            #real.apply(shuffle_and_repeat(buffer_size = num_real)) #, count = ceil(ratio)))

            synt = synt.shuffle(num_synt).repeat()
            real = real.shuffle(num_real).repeat()
            unsup = unsup.shuffle(num_unsup).repeat()

            # map to corresponding files
            # synthetic data
            def name2synt(name):
                fakes = [
                    read_image(os.path.join(path, name.decode() + '.jpg'))
                    for path in fake_dirs.values()
                ]
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                return fakes + [inst]

            synt_types = [tf.float32 for _ in self.fakes] + [tf.int32]
            synt = synt.map(lambda name: tf.py_func(name2synt, [name], synt_types), num_parallel_calls = num_threads)

            # real data
            augment = self.params.get('augment', 1)
            def name2real(name):
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                if augment:
                    src_dir = self.params.get('augment_src', 'best')
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                    pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                    if isinstance(src_dir, float):
                        pnts *= src_dir
                    real = random_crop(full, pnts, self.params)
                    # TODO add mirror augmentation
                else:
                    real = read_image(os.path.join(real_dir, name.decode() + '.jpg'))
                return real, inst
            real = real.map(lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32])), num_parallel_calls = num_threads)

            # unsup data
            def name2unsup(name):
                if augment:
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    img = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'), False)
                    imsz = img.shape # y,x,c
                    # [TL, TR, BR, BL]
                    real = random_crop(img, 
                            np.array([[5,5],[imsz[1]-5,5],[imsz[1]-5,imsz[0]-5],[5,imsz[0]-5]], dtype = np.float32), self.params)
                else:
                    real = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'))
                return real

#             unsup = unsup.map(lambda name: tuple(tf.py_func(name2unsup, [name], [tf.float32])), num_parallel_calls = num_threads)

            # zip all, batch and prefetch
            #dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
            dataset = Dataset.zip({ 'synt': synt, 'real': real }) # , 'unsup': unsup
            dataset = dataset.batch(self.batch_size, drop_remainder = True) # we need full batches!
            dataset = dataset.prefetch(self.batch_size * 2)
            return dataset
Example #14
    ict = tf.reshape(ict, [79, 159, 5])
    return ict


source_data = tr_data.map(load_tensor)

# Same from the 6-hours ahead data
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred+6h/" +
                   "20CR2c/air.2m/training/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
n_steps = len(t2m_files)
tr_tfd = tf.constant(t2m_files)
tr_data = Dataset.from_tensor_slices(tr_tfd).repeat(n_epochs)
target_data = tr_data.map(load_tensor)

tr_data = Dataset.zip((source_data, target_data))
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)

# Same for the test dataset
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred/" +
                   "20CR2c/air.2m/test/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(t2m_files)
test_tfd = tf.constant(t2m_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_source = test_data.map(load_tensor)
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred+6h/" +
                   "20CR2c/air.2m/test/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(t2m_files)
test_tfd = tf.constant(t2m_files)
Example #15
# Source uses data from (window_size-1):(len-forecast_steps);
# each source dataset has length window_size.
# Target uses data from (forecast_steps+window_size-1):len;
# each target dataset has length 1.
# (A toy index check with small numbers follows this example.)
source_tfd = tf.constant(
    training_files[(window_size - 1):(len(training_files) - forecast_steps)])
source_data = Dataset.from_tensor_slices(source_tfd)
source_data = source_data.repeat(n_epochs)
source_data = source_data.map(load_tensor_window)
target_tfd = tf.constant(training_files[(forecast_steps + window_size -
                                         1):(len(training_files))])
target_data = Dataset.from_tensor_slices(target_tfd)
target_data = target_data.repeat(n_epochs)
target_data = target_data.map(load_tensor)
# Zip these together into (source,target) tuples for model fitting.
training_data = Dataset.zip((source_data, target_data))
training_data = training_data.batch(n_batch)

# Repeat the whole process with the test data
test_file_dir = (
    ("%s/Machine-Learning-experiments/datasets/" + "DWR/20CR2c/prmsl/test/") %
    os.getenv('SCRATCH'))
test_files = glob("%s/*.tfd" % test_file_dir)
test_steps = len(test_files) // n_batch
source2_tfd = tf.constant(test_files[(window_size - 1):(len(test_files) -
                                                        forecast_steps)])
source2_data = Dataset.from_tensor_slices(source2_tfd)
source2_data = source2_data.repeat(n_epochs)
source2_data = source2_data.map(load_tensor_window_test)
target2_tfd = tf.constant(test_files[(forecast_steps + window_size -
                                      1):(len(test_files))])
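
# A toy check of the source/target index alignment used above, with made-up
# values (window_size=3, forecast_steps=2, six files); not from the original script.
window_size_demo, forecast_steps_demo = 3, 2
files_demo = ['t0', 't1', 't2', 't3', 't4', 't5']

sources_demo = files_demo[(window_size_demo - 1):(len(files_demo) - forecast_steps_demo)]
targets_demo = files_demo[(forecast_steps_demo + window_size_demo - 1):len(files_demo)]

# each source window ending at time ti is paired with the file forecast_steps later
for s, t in zip(sources_demo, targets_demo):
    print(s, '->', t)    # t2 -> t4, t3 -> t5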
Example #16
    ax_test[i, 1].imshow(zebra)
plt.show()

# Putting all together
generator_G = get_resnet_generator(name='generator_G')
generator_F = get_resnet_generator(name='generator_F')

discriminator_X = get_discriminator(name='discriminator_X')
discriminator_Y = get_discriminator(name='discriminator_Y')

cycle_model = CycleGAN(generator_G=generator_G,
                       generator_F=generator_F,
                       discriminator_X=discriminator_X,
                       discriminator_Y=discriminator_Y)

cycle_model.compile(generator_G_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    generator_F_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    discriminator_X_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    discriminator_Y_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    generator_loss_fn=generator_loss_fn,
                    discriminator_loss_fn=discriminator_loss_fn)

plotter = GANMonitor(data=test_horses)
checkpoint_filepath = "./model_checkpoints/cyclegan_checkpoints.{epoch:03d}"
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath)

cycle_model.fit(
    Dataset.zip((train_horses, train_zebras)),
    epochs=90,
    callbacks=[plotter, model_checkpoint_callback],
)
    file_name = tf.strings.regex_replace(file_name, 'prmsl', 'air.2m')
    sict = tf.read_file(file_name)
    t2m = tf.parse_tensor(sict, numpy.float32)
    t2m = tf.reshape(t2m, [79, 159, 1])
    file_name = tf.strings.regex_replace(file_name, 'air.2m', 'z500')
    sict = tf.read_file(file_name)
    prate = tf.parse_tensor(sict, numpy.float32)
    prate = tf.reshape(prate, [79, 159, 1])
    ict = tf.concat([prmsl, t2m, prate], 2)  # Now [79,159,3]
    ict = tf.reshape(ict, [79, 159, 3])
    return ict


tr_data = tr_data.map(load_tensor)
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)
tr_data = Dataset.zip((tr_data, tr_data))

# Same for the test dataset
input_file_dir = (("%s/Machine-Learning-experiments/datasets/rotated_pole/" +
                   "20CR2c/prmsl/test/") % os.getenv('SCRATCH'))
prmsl_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(prmsl_files)
test_tfd = tf.constant(prmsl_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_data = test_data.map(load_tensor)
test_data = test_data.batch(batch_size)
test_data = Dataset.zip((test_data, test_data))


# reparameterization trick
# instead of sampling from Q(z|X), sample eps = N(0,I)
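# A minimal, TF2-style sketch of the reparameterization trick referenced in the
# two comment lines above; z_mean and z_log_var are illustrative placeholders,
# not tensors from the original model.
import tensorflow as tf

z_mean = tf.zeros([4, 2])       # placeholder encoder mean (batch 4, latent dim 2)
z_log_var = tf.zeros([4, 2])    # placeholder encoder log-variance

eps = tf.random.normal(shape=tf.shape(z_mean))    # eps ~ N(0, I)
z = z_mean + tf.exp(0.5 * z_log_var) * eps        # z ~ N(z_mean, exp(z_log_var))
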
TRAIN_SLICES = len(train_labels)
VAL_SLICES = len(val_labels)
TEST_SLICES = len(test_labels)
print('\nFinished splitting each image within each list into {} 1D slices.'
      .format(SLICES_PER_IMAGE))
# Note the slices are now normalised

train_slices_ds = Dataset.from_tensor_slices(train_slices)
val_slices_ds = Dataset.from_tensor_slices(val_slices)
test_slices_ds = Dataset.from_tensor_slices(test_slices)
print('\nCompleted slice datasets.')
train_labels_ds = Dataset.from_tensor_slices(train_labels)
val_labels_ds = Dataset.from_tensor_slices(val_labels)
test_labels_ds = Dataset.from_tensor_slices(test_labels)
print('Completed label datasets.')
train_ds = Dataset.zip((train_slices_ds, train_labels_ds))
val_ds = Dataset.zip((val_slices_ds, val_labels_ds))
test_ds = Dataset.zip((test_slices_ds, test_labels_ds))
print('\nCompleted datasets of labelled slices.')

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 256
train_ds = train_ds.shuffle(TRAIN_SLICES).batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds = val_ds.shuffle(VAL_SLICES).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE)
print('\nFinished batching and shuffling datasets.')


def build_CNN(train_ds, val_ds, test_ds):
    """Function to build a convolutional neural network with 2 convolutions and
       1 dense layer.
    mnth = tf.strings.substr(fdte,5,2)
    dy   = tf.strings.substr(fdte,8,2)
    dy = tf.cond(tf.math.equal(mnth+dy,'0229'),
                               lambda: tf.constant('28'),lambda: dy)
    file_name=(tf.strings.substr(file_name,0,tf.strings.length(file_name)-17)+
                '1969-'+mnth+'-'+dy+tf.strings.substr(fdte,tf.strings.length(fdte)-7,7))
    sict  = tf.read_file(file_name)
    insol = tf.parse_tensor(sict,numpy.float32)
    insol = tf.reshape(insol,[79,159,1])
    ict = tf.concat([t2m,prmsl,uwnd,vwnd,insol],2) # Now [79,159,5]
    ict = tf.reshape(ict,[79,159,5])
    return ict

tr_source = tr_data.map(load_tensor_w_insol)

tr_data = Dataset.zip((tr_source, tr_target))
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)

# Same for the test dataset
input_file_dir=(("%s/Machine-Learning-experiments/datasets/uk_centred/" +
                "20CR2c/air.2m/test/") %
                   os.getenv('SCRATCH'))
t2m_files=glob("%s/*.tfd" % input_file_dir)
test_steps=len(t2m_files)
test_tfd = tf.constant(t2m_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_target = test_data.map(load_tensor)
test_source = test_data.map(load_tensor_w_insol)
test_data = Dataset.zip((test_source, test_target))
test_data = test_data.batch(batch_size)
It is the most recent data and will be most analogous to making
future predictions, given we're not collecting more data.
Since we have millions of data points and most of them are not recent
(and thus possibly don't reflect current trends), taking this
relatively small slice should be fine.
"""
combine = 5  # should match param from preprocess
window = int(1440 / combine)
print(data.shape[0])
data = Dataset.from_tensor_slices(data).window(window, 1, combine, True)
data = data.flat_map(lambda x: x.batch(window, drop_remainder=True))
print(data)
labels = Dataset.from_tensor_slices(datalabels)
prices = Dataset.from_tensor_slices(dataprices)
ratios = Dataset.from_tensor_slices(dataratios)
ins = Dataset.zip((data, prices))
outs = Dataset.zip((ratios, prices))
data = Dataset.zip((ins, outs))
"""
Choose a window stride number for training that does not have a common factor
with minutes in a day (1440) so we can pinstripe through every day while going
through the whole timescale, but also still pick many per day. I picked 29.
Repeat is done before our pinstriping window so it will roll over the end and
pinstripe the year as well as long as it not a factor in our data size either
(which it is not for the bitstamp set nor the slightly truncated one due to
the first 1440 size window).

We'll split off the last month for testing and the two months before that for
validation.
"""
val_test_count = 8760 * 3
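
# A quick numeric check of the "pinstripe" reasoning in the docstring above:
# because gcd(29, 1440) == 1, repeatedly stepping by 29 minutes modulo a
# 1440-minute day visits every minute-of-day offset before repeating.
import math

stride, minutes_per_day = 29, 1440
assert math.gcd(stride, minutes_per_day) == 1

offsets = {(i * stride) % minutes_per_day for i in range(minutes_per_day)}
assert len(offsets) == minutes_per_day    # all 1440 offsets are covered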
Example #21
def run(dpath,
        img_size=160,
        epochs=10,
        batch_size=32,
        learning_rate=0.0001,
        output='model',
        dset=None):

    global g_image_size
    g_image_size = img_size
    img_shape = (img_size, img_size, 3)

    info('Loading Data Set')
    # load dataset
    train = load_dataset(dpath, dset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels)),
                            Dataset.from_tensor_slices(
                                [img_size] * len(train_data))))  # noqa: E501

    print(train_ds)
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)

    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())

    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
                                                   include_top=False,
                                                   weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    mlflow.tensorflow.autolog()
    model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # Log metric
    # TODO: calculate metric based on evaluation data.
    # accuracy = model.evaluate()
    accuracy = random()  # dummy score
    metric = {
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }
    metrics = {  # [doc] https://www.kubeflow.org/docs/pipelines/sdk/pipelines-metrics/  # noqa: E501
        'metrics': [metric]
    }

    # TODO: it would be nice to refactor the infra code below (logging, saving
    # files) out of this method so it just does the training and returns the
    # model along with the metrics.

    # Log to mlflow
    mlflow.log_metrics({"accuracy": accuracy})

    # Pipeline Metric
    info('Writing Pipeline Metric')
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)

    # save model
    info('Saving Model')

    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))
    print('Done!')

    # add time prefix folder
    file_output = str(Path(output).joinpath('latest.h5'))
    print('Serializing h5 model to:\n{}'.format(file_output))
    model.save(file_output)
    # mlflow.log_artifact(file_output)

    return generate_hash(file_output, 'kf_pipeline')
Example #22
def run(dpath,
        img_size=160,
        epochs=10,
        batch_size=32,
        learning_rate=0.0001,
        output='model',
        dset=None):

    global g_image_size
    g_image_size = img_size
    img_shape = (img_size, img_size, 3)

    info('Loading Data Set')
    train = load_dataset(dpath, dset)
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels)),
                            Dataset.from_tensor_slices(
                                [img_size] * len(train_data))))  # noqa: E501

    print(train_ds)
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)

    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    info('Creating Model')
    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
                                                   include_top=False,
                                                   weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    mlflow.tensorflow.autolog()
    model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # Log metric
    accuracy = random()  # dummy score
    metric = {
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }
    metrics = {'metrics': [metric]}

    mlflow.log_metrics({"accuracy": accuracy})
    info('Writing Pipeline Metric')
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)
    info('Saving Model')
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))
    print('Done!')

    file_output = str(Path(output).joinpath('latest.h5'))
    print('Serializing h5 model to:\n{}'.format(file_output))
    model.save(file_output)

    return generate_hash(file_output, 'kf_pipeline')
Example #23
    return ict


obs_data = Dataset.from_tensor_slices(train_tfd)
obs_data = obs_data.repeat(n_epochs)
obs_data = obs_data.map(load_observations)
obs_data = obs_data.batch(1)

# And the test observations
obs_test_data = Dataset.from_tensor_slices(test_tfd)
obs_test_data = obs_test_data.repeat(n_epochs)
obs_test_data = obs_test_data.map(load_observations)
obs_test_data = obs_test_data.batch(1)

# Zip the target and source together for training
training_data = Dataset.zip((obs_data, field_data))
test_data = Dataset.zip((obs_test_data, field_test_data))


# Need to resize data so its dimensions are a multiple of 8 (3*2-fold pool)
class ResizeLayer(tf.keras.layers.Layer):
    def __init__(self, newsize=None, **kwargs):
        super(ResizeLayer, self).__init__(**kwargs)
        self.resize_newsize = newsize

    def call(self, input):
        return tf.image.resize_images(input,
                                      self.resize_newsize,
                                      align_corners=True)

    def get_config(self):