예제 #1
0
def train_discrete_voxel_vae(config, kwargs, num_epochs=100):
    """Build and train the discrete voxel VAE.

    Returns a ``(trained model, checkpoint directory)`` pair.
    """
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    # The model consumes graphs only, so strip the images off each element.
    dataset = build_dataset().map(lambda graphs, images: (graphs, ))

    # Push a single batch through the model so its variable shapes get built.
    for first_batch in iter(dataset):
        train_one_epoch.model(*first_batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config alongside the checkpoints for reproducibility.
    os.makedirs(checkpoint_dir, exist_ok=True)
    config_path = os.path.join(checkpoint_dir, 'config.json')
    with open(config_path, 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          debug=False)
    return train_one_epoch.model, checkpoint_dir
예제 #2
0
def main(data_dir, config):
    """Build train/test datasets from ``data_dir`` and run the training loop.

    The distribution strategy is created first, before any other TF code,
    and the training step is built inside its scope.
    """
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=True,
                                         logical_per_physical_factor=1)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    # Sanity check: print the first graph of the test set, then stop iterating.
    for (graph, img, c) in iter(test_dataset):
        print(graph)
        break

    # Model/optimizer variables must be created inside the strategy scope.
    with strategy.scope():
        train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    # BUG FIX: create the checkpoint directory before writing into it
    # (matches the other training scripts in this file); without this,
    # open() raises FileNotFoundError when the directory does not exist yet.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=3,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #3
0
def main(data_dir, config, kwargs):
    """Train a model on train/test datasets built from ``data_dir``.

    ``kwargs`` is forwarded (together with ``config``) to ``build_training``;
    presumably it carries extra model-construction arguments — confirm
    against ``build_training``'s signature.
    """
    # Make strategy at the start of your main before any other tf code is run.
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1,
    #                                      memory_limit=None)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'),
                                  batch_size=4)
    test_dataset = build_dataset(os.path.join(data_dir, 'test'), batch_size=4)

    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)
    # Start with a high sampling temperature; any annealing schedule is
    # handled elsewhere — TODO confirm.
    train_one_epoch.model.set_temperature(10.)

    log_dir = build_log_dir('new_im_16_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('new_im_16_checkpointing', config)
    save_model_dir = os.path.join('new_im_16_saved_models')

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=100,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
예제 #4
0
def train_disc_img_vae(data_dir, config, kwargs):
    """Train the discrete image VAE from tfrecord files under ``data_dir``.

    Expects ``data_dir`` to contain ``train`` and ``test`` subdirectories of
    ``*.tfrecords`` files. ``config`` and ``kwargs`` are both unpacked into
    ``build_training``.
    """
    # strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)

    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    print(f'Number of training tfrecord files : {len(train_tfrecords)}')
    print(f'Number of test tfrecord files : {len(test_tfrecords)}')
    print(f'Total : {len(train_tfrecords) + len(test_tfrecords)}')

    train_dataset = build_dataset(train_tfrecords, batch_size=4)
    test_dataset = build_dataset(test_tfrecords, batch_size=4)

    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    # Fixed output directories (config-derived paths left disabled).
    # log_dir = build_log_dir('test_log_dir', config)
    # checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = 'test_log_dir'
    checkpoint_dir = 'test_checkpointing'
    save_dir = 'saved_model'

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # NOTE: debug=True here, unlike the other training scripts in this file.
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_dir,
                          debug=True)
예제 #5
0
def train_identify_medium(data_dir, config):
    """Train the identify-medium model from tfrecords under ``data_dir``."""
    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    num_train = len(train_tfrecords)
    num_test = len(test_tfrecords)
    print(f'Number of training tfrecord files : {num_train}')
    print(f'Number of test tfrecord files : {num_test}')
    print(f'Total : {num_train + num_test}')

    # Build the raw datasets, then batch them into same-size graph tuples.
    train_dataset = batch_dataset_set_graph_tuples(
        all_graphs_same_size=True,
        dataset=build_dataset(train_tfrecords),
        batch_size=32)
    test_dataset = batch_dataset_set_graph_tuples(
        all_graphs_same_size=True,
        dataset=build_dataset(test_tfrecords),
        batch_size=32)

    train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=20,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #6
0
def main(data_dir, config, kwargs):
    """Train a model under a distribution strategy on datasets from ``data_dir``.

    ``kwargs`` is forwarded (together with ``config``) to ``build_training``.
    """
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=True,
                                         logical_per_physical_factor=1)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'),
                                  batch_size=4)
    test_dataset = build_dataset(os.path.join(data_dir, 'test'), batch_size=4)

    # for (graph, positions) in iter(test_dataset):
    #     print(graph)
    #     break

    # Model/optimizer variables must be created inside the strategy scope.
    with strategy.scope():
        train_one_epoch = build_training(**config, **kwargs)
    # Disable the KL term (beta = 0); presumably this makes the VAE train as a
    # plain autoencoder — TODO confirm against the model's loss.
    train_one_epoch.model.set_beta(0.)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    save_model_dir = os.path.join('saved_models')

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
예제 #7
0
def train_auto_regressive_prior(config, kwargs, num_epochs=100):
    """Train the autoregressive prior while keeping the two pretrained VAEs frozen.

    ``kwargs`` must contain ``'discrete_image_vae'`` and ``'discrete_voxel_vae'``
    models whose variables are excluded from optimization.

    Returns a ``(trained model, checkpoint directory)`` pair.
    """
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    dataset = build_dataset()

    # run on first input to set variable shapes
    for batch in iter(dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # Freeze both pretrained VAEs: train only the variables that do not
    # belong to either of them (matched by variable name).
    exclude_variables = [variable.name for variable in kwargs['discrete_image_vae'].trainable_variables] \
                        + [variable.name for variable in kwargs['discrete_voxel_vae'].trainable_variables]
    trainable_variables = list(
        filter(lambda variable: (variable.name not in exclude_variables),
               train_one_epoch.model.trainable_variables))

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
    return train_one_epoch.model, checkpoint_dir
예제 #8
0
def main(data_dir, batch_size, config, kwargs):
    """Train a model on datasets from ``data_dir``, optionally distributed.

    With ``strategy`` hard-coded to ``None`` the non-distributed branches are
    taken; the distributed branches remain in place for re-enabling the
    strategy later.
    """
    # Make strategy at the start of your main before any other tf code is run.
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1,
    #                                      memory_limit=None)
    strategy = None

    # Distributed datasets need the strategy and a *global* batch size;
    # otherwise plain per-replica datasets are built.
    if strategy is not None:
        train_dataset = build_distributed_dataset(os.path.join(
            data_dir, 'train'),
                                                  global_batch_size=batch_size,
                                                  strategy=strategy)
        test_dataset = build_distributed_dataset(os.path.join(
            data_dir, 'test'),
                                                 global_batch_size=batch_size,
                                                 strategy=strategy)
    else:
        train_dataset = build_dataset(os.path.join(data_dir, 'train'),
                                      batch_size=batch_size)
        test_dataset = build_dataset(os.path.join(data_dir, 'test'),
                                     batch_size=batch_size)

    # for (graph, positions) in iter(test_dataset):
    #     print(graph)
    #     break

    # Same build_training call either way; the only difference is whether it
    # happens inside the strategy scope.
    if strategy is not None:
        with strategy.scope():
            train_one_epoch = build_training(**config,
                                             **kwargs,
                                             strategy=strategy)
    else:
        train_one_epoch = build_training(**config, **kwargs, strategy=strategy)

    # Initial temperature and beta values; any schedules are handled
    # elsewhere — TODO confirm.
    train_one_epoch.model.set_temperature(10.)
    train_one_epoch.model.set_beta(6.6)

    log_dir = build_log_dir('simple_complete_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('simple_complete_checkpointing',
                                          config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    save_model_dir = os.path.join('simple_complete_saved_models')

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=100,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
예제 #9
0
def train_VQVAE(data_dir):
    """Train a VQ-VAE on images extracted from tfrecords under ``data_dir``."""
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    train_dataset = build_dataset(train_tfrecords)
    test_dataset = build_dataset(test_tfrecords)

    # Keep only the image component of each (graph, img, c) element.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)

    # with strategy.scope():
    # embedding_dim is tied to the depth: 2 * 2**depth = 128 here.
    autoencoder_depth = 6
    model = VectorQuantizerVariationalAutoEncoder(embedding_dim=2 *
                                                  2**autoencoder_depth,
                                                  num_embeddings=1024,
                                                  kernel_size=4,
                                                  num_layers=autoencoder_depth,
                                                  num_residual_layers=2)

    learning_rate = 1e-6
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        """Reconstruction MSE (on a blurred target) plus the VQ commitment loss."""
        (img, ) = batch
        vq_loss, decoded_img = model_outputs
        print('im shape', img.shape)
        print('dec im shape', decoded_img.shape)
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        #         ))
        # Compare a 6x6-blurred target against the decoded image with a
        # 12-pixel border cropped off; presumably the decoder output is 24
        # pixels larger per spatial dim than the input — TODO confirm.
        reconstruction_loss = tf.reduce_mean(
            (gaussian_filter2d(img, filter_shape=[6, 6]) -
             decoded_img[:, 12:-12, 12:-12, :])**2)
        total_loss = reconstruction_loss + vq_loss
        return total_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)

    log_dir = 'vqvae2_log_dir'
    checkpoint_dir = 'vqvae2_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #10
0
def train_VQVAE(data_dir):
    """Train a VQ-VAE on images from datasets under ``data_dir``, logging
    reconstruction and VQ losses to TensorBoard summaries."""
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    # Keep only the image component of each (graph, img, c) element.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)

    # with strategy.scope():
    model = VectorQuantizerVariationalAutoEncoder(embedding_dim=32,
                                                  num_embeddings=1024,
                                                  kernel_size=4)

    learning_rate = 1e-4
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        """Reconstruction MSE plus VQ loss; logs both as scalar summaries."""
        (img, ) = batch
        vq_loss, decoded_img = model_outputs
        print('im shape', img.shape)
        print('dec im shape', decoded_img.shape)
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        #         ))
        # Full-frame MSE here (no blur, no border crop, unlike the other
        # VQ-VAE example in this file).
        reconstruction_loss = tf.reduce_mean(
            (img - decoded_img[:, :, :, :])**2)
        tf.summary.scalar('reconstruction loss',
                          reconstruction_loss,
                          step=model.step)
        tf.summary.scalar('vq_loss', vq_loss, step=model.step)
        total_loss = reconstruction_loss + vq_loss
        return total_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)

    log_dir = 'VQVAE_log_dir_16_1024'
    checkpoint_dir = 'VQVAE_checkpointing_16_1024'
    model_dir = 'trained_VAE_model_16_1024'

    # early_stop_patience == num_epochs, i.e. early stopping is effectively off.
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=10000,
                          early_stop_patience=10000,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False,
                          save_model_dir=model_dir)
예제 #11
0
def train_ae_3d(data_dir, config):
    """Train the 3D autoencoder from tfrecord files under ``data_dir``."""
    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    print(f'Number of training tfrecord files : {len(train_tfrecords)}')
    print(f'Number of test tfrecord files : {len(test_tfrecords)}')
    print(f'Total : {len(train_tfrecords) + len(test_tfrecords)}')

    train_dataset = build_dataset(train_tfrecords)
    test_dataset = build_dataset(test_tfrecords)

    train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # Disabled: restore latest checkpoint and dump per-example input/decoded
    # node properties to .npz files for offline evaluation.
    # checkpoint = tf.train.Checkpoint(module=train_one_epoch)
    # manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3,
    #                                      checkpoint_name=train_one_epoch.model.__class__.__name__)
    #
    # if manager.latest_checkpoint is not None:
    #     checkpoint.restore(manager.latest_checkpoint)
    #     print(f"Restored from {manager.latest_checkpoint}")
    # output_dir = './output_evaluations'
    # os.makedirs(output_dir, exist_ok=True)
    #
    # property_names = ['vx','vy','vz','rho','U','mass','smoothing_length']
    # for i, test_graph in enumerate(iter(test_dataset)):
    #     input_properties = test_graph.nodes[:,3:].numpy()
    #     reconstructed_graph = train_one_epoch.model(test_graph)
    #     decoded_properties = reconstructed_graph.nodes.numpy()
    #     positions = test_graph.nodes[:,:3].numpy()
    #     save_dict = dict(positions=positions)
    #     for j in range(len(property_names)):
    #         save_dict[f"prop_{property_names[j]}_input"] = input_properties[:, j]
    #         save_dict[f"prop_{property_names[j]}_decoded"] = decoded_properties[:, j]
    #     np.savez(os.path.join(output_dir,'test_example_{:04d}.npz'.format(i)), **save_dict)
    #     if i == 20:
    #         break

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=100,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #12
0
def train_variational_autoencoder(data_dir):
    """Train a VAE on images extracted from tfrecords under ``data_dir``."""
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    train_dataset = build_dataset(train_tfrecords)
    test_dataset = build_dataset(test_tfrecords)

    # Keep only the image component of each (graph, img, c) element.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)

    # with strategy.scope():
    model = VariationalAutoEncoder(n_latent=4, kernel_size=4)

    learning_rate = 1e-3
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        """Reconstruction MSE (blurred target) plus a KL divergence term."""
        (img, ) = batch
        mn, std, z, decoded_img = model_outputs
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        #         ))
        # Compare a 6x6-blurred target against the decoded image with a
        # 12-pixel border cropped off.
        reconstruction_loss = tf.reduce_mean(
            (gaussian_filter2d(img, filter_shape=[6, 6]) -
             decoded_img[:, 12:-12, 12:-12, :])**2)
        # NOTE(review): this is the standard Gaussian KL only if `std` is
        # actually the log-variance (the `tf.exp(std)` suggests so) — confirm
        # against VariationalAutoEncoder's encoder outputs.
        kl_loss = -0.5 * (1 + std - tf.square(mn) - tf.exp(std))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
        total_loss = reconstruction_loss + kl_loss
        print(f"recon_loss = {reconstruction_loss}")
        print(f"kl_loss = {kl_loss}")
        return total_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)

    log_dir = 'VAE_log_dir'
    checkpoint_dir = 'VAE_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #13
0
def train_auto_regressive_prior(data_dir, batch_size, config, kwargs, num_epochs=100):
    """Train the autoregressive prior on data from ``data_dir`` while keeping
    the two pretrained VAEs (passed in ``kwargs``) frozen."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    # dataset = build_example_dataset(1000, batch_size=2, num_blobs=5, num_nodes=64 ** 3, image_dim=256)
    dataset = build_dataset(data_dir, batch_size)

    # the model will call grid_graphs internally to learn the 3D autoencoder.
    # we show here what that produces from a batch of graphs.
    # for graphs, image in iter(dataset):
    #     assert image.numpy().shape == (2, 256, 256, 1)
    #     plt.imshow(image[0,...,0].numpy())
    #     plt.colorbar()
    #     plt.show()
    #     voxels = grid_graphs(graphs, 64)
    #     assert voxels.numpy().shape == (2, 64, 64, 64, 1)
    #     plt.imshow(tf.reduce_mean(voxels[0,...,0], axis=-1))
    #     plt.colorbar()
    #     plt.show()
    #     break

    # run on first input to set variable shapes
    for batch in iter(dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # Freeze both pretrained VAEs: train only the variables that do not
    # belong to either of them (matched by variable name).
    exclude_variables = [variable.name for variable in kwargs['discrete_image_vae'].trainable_variables] \
                        + [variable.name for variable in kwargs['discrete_voxel_vae'].trainable_variables]
    trainable_variables = list(filter(lambda variable: (variable.name not in exclude_variables),
                                      train_one_epoch.model.trainable_variables))

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
예제 #14
0
def train_discrete_voxel_vae(config, kwargs):
    """Smoke-train the discrete voxel VAE for one epoch on a small synthetic
    dataset, plotting one example image and its voxelization first."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    dataset = build_example_dataset(100,
                                    batch_size=2,
                                    num_blobs=3,
                                    num_nodes=64**3,
                                    image_dim=256)

    # the model will call grid_graphs internally to learn the 3D autoencoder.
    # we show here what that produces from a batch of graphs.
    for graphs, image in iter(dataset):
        assert image.numpy().shape == (2, 256, 256, 1)
        plt.imshow(image[0].numpy())
        plt.colorbar()
        plt.show()
        voxels = grid_graphs(graphs, 64)
        assert voxels.numpy().shape == (2, 64, 64, 64, 1)
        # Collapse one spatial axis by averaging to get a 2D projection.
        plt.imshow(tf.reduce_mean(voxels[0], axis=-2))
        plt.colorbar()
        plt.show()
        break

    # drop the image as the model expects only graphs
    dataset = dataset.map(lambda graphs, images: (graphs, ))

    # run on first input to set variable shapes
    for batch in iter(dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # num_epochs=1: this variant is a quick smoke run, not a full training.
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=1,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          debug=False)
예제 #15
0
def train_discrete_image_vae(data_dirs,
                             config,
                             model_kwargs,
                             batch_size=1,
                             num_epochs=100):
    """Train the discrete image VAE on images from ``data_dirs``.

    Returns a ``(trained model, checkpoint directory)`` pair.
    """
    print('\n')

    train_one_epoch = build_training(**config, **model_kwargs)
    train_dataset = build_dataset(data_dirs,
                                  batch_size=batch_size,
                                  train_test_dir='train')
    test_dataset = build_dataset(data_dirs,
                                 batch_size=batch_size,
                                 train_test_dir='test')

    print(f'Number of epochs: {num_epochs}')
    print('Training discrete image VAE\n')

    # drop the graph as the model expects only images
    train_dataset = train_dataset.map(lambda voxels, images: (images, ))
    test_dataset = test_dataset.map(lambda voxels, images: (images, ))

    # run on first input to set variable shapes
    for batch in iter(train_dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=20,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=train_one_epoch.model.trainable_variables,
                          debug=False)

    return train_one_epoch.model, checkpoint_dir
예제 #16
0
def train_auto_regressive_prior(data_dirs,
                                config,
                                model_kwargs,
                                batch_size=1,
                                num_epochs=100):
    """Train the autoregressive prior on datasets from ``data_dirs``.

    ``model_kwargs`` must contain ``'discrete_image_vae'`` and
    ``'discrete_voxel_vae'`` models whose variables are frozen.
    """
    print('\n')

    train_one_epoch = build_training(**config, **model_kwargs)
    train_dataset = build_dataset(data_dirs,
                                  batch_size=batch_size,
                                  train_test_dir='train')
    test_dataset = build_dataset(data_dirs,
                                 batch_size=batch_size,
                                 train_test_dir='test')

    print(f'Number of epochs: {num_epochs}')
    print('Training autoregressive prior\n')

    # run on first input to set variable shapes
    for batch in iter(train_dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # Freeze both pretrained VAEs: train only the variables that do not
    # belong to either of them (matched by variable name).
    exclude_variables = [variable.name for variable in model_kwargs['discrete_image_vae'].trainable_variables] + \
                        [variable.name for variable in model_kwargs['discrete_voxel_vae'].trainable_variables]
    trainable_variables = list(
        filter(lambda variable: (variable.name not in exclude_variables),
               train_one_epoch.model.trainable_variables))

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=40,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
예제 #17
0
def train_autoencoder(data_dir):
    """Train a plain autoencoder on images from datasets under ``data_dir``."""
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    # print(f'Number of training tfrecord files : {len(train_tfrecords)}')
    # print(f'Number of test tfrecord files : {len(test_tfrecords)}')
    # print(f'Total : {len(train_tfrecords) + len(test_tfrecords)}')
    #
    # train_dataset = build_dataset(train_tfrecords)
    # test_dataset = build_dataset(test_tfrecords)

    # Keep only the image component of each (graph, img, c) element.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(
        batch_size=32)

    # with strategy.scope():
    model = AutoEncoder()

    learning_rate = 1.0e-5

    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        """Scaled full-frame reconstruction MSE."""
        (img, ) = batch
        decoded_img = model_outputs
        # return tf.reduce_mean((gaussian_filter2d(img, filter_shape=[6, 6]) - decoded_img[:, :, :, :]) ** 2)
        # The factor 100 only rescales the loss/gradients; presumably chosen
        # to get usable gradient magnitudes at this learning rate — TODO confirm.
        return 100 * tf.reduce_mean((img - decoded_img[:, :, :, :])**2)

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)

    log_dir = 'autoencoder_log_dir'
    checkpoint_dir = 'autoencoder_checkpointing'

    # early_stop_patience == num_epochs, i.e. early stopping is effectively off.
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=1000,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #18
0
def train_discrete_image_vae(config, kwargs):
    """Smoke-train the discrete image VAE for one epoch on a tiny synthetic
    dataset, plotting one example image first."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    dataset = build_example_dataset(10,
                                    batch_size=2,
                                    num_blobs=3,
                                    num_nodes=64**3,
                                    image_dim=256)

    # show example of image
    for graphs, image in iter(dataset):
        assert image.numpy().shape == (2, 256, 256, 1)
        plt.imshow(image[0].numpy())
        plt.colorbar()
        plt.show()
        break

    # drop the graph as the model expects only images
    dataset = dataset.map(lambda graphs, images: (images, ))

    # run on first input to set variable shapes
    for batch in iter(dataset):
        train_one_epoch.model(*batch)
        break

    log_dir = build_log_dir('log_dir', config)
    checkpoint_dir = build_checkpoint_dir('checkpointing', config)

    # Persist the config next to the checkpoints for later reference.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # num_epochs=1: this variant is a quick smoke run, not a full training.
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=1,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=train_one_epoch.model.trainable_variables,
                          debug=False)
예제 #19
0
def test_vanillia_training_loop():
    """Smoke-test ``vanilla_training_loop`` on a tiny MLP regression task."""
    import sonnet as snt

    class Model(AbstractModule):
        # Minimal sonnet module: a two-layer MLP over the batch inputs.
        def __init__(self, name=None):
            super(Model, self).__init__(name=name)
            self.net = snt.nets.MLP([10, 1], activate_final=False)

        def _build(self, batch):
            (inputs, _) = batch
            return self.net(inputs)

    def loss(model_output, batch):
        # Mean squared error against the batch targets.
        (_, target) = batch
        return tf.reduce_mean((target - model_output)**2)

    # 100 random (input, target) pairs, batched by 10.
    dataset = tf.data.Dataset.from_tensor_slices((tf.random.normal(
        (100, 5)), tf.random.normal((100, 1)))).batch(10)

    training = TrainOneEpoch(Model(), loss, snt.optimizers.Adam(1e-4))
    # Bug fix: pass arguments by keyword. Every other call site in this file
    # uses train_one_epoch=.../training_dataset=..., so the original
    # positional call (dataset first, training object second) supplied the
    # two arguments in swapped positions.
    vanilla_training_loop(train_one_epoch=training,
                          training_dataset=dataset,
                          num_epochs=100,
                          debug=False)
예제 #20
0
def train_autoencoder(data_dir):
    """Train the image ``AutoEncoder`` on tfrecord data under *data_dir*."""
    train_tfrecords = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_tfrecords = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    print(f'Number of training tfrecord files : {len(train_tfrecords)}')
    print(f'Number of test tfrecord files : {len(test_tfrecords)}')
    print(f'Total : {len(train_tfrecords) + len(test_tfrecords)}')

    def images_only(tfrecords):
        # Keep only the image component of each example and batch by 32.
        ds = build_dataset(tfrecords)
        return ds.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    train_dataset = images_only(train_tfrecords)
    test_dataset = images_only(test_tfrecords)

    model = AutoEncoder(kernel_size=4)
    opt = snt.optimizers.Adam(1e-5)

    def loss(model_outputs, batch):
        # MSE between a Gaussian-blurred input image and the reconstruction,
        # with a 12-pixel border cropped from the decoder output
        # (presumably because the decoder output is larger than the input
        # — confirm against the AutoEncoder architecture).
        (img, ) = batch
        target = gaussian_filter2d(img, filter_shape=[6, 6])
        return tf.reduce_mean((target - model_outputs[:, 12:-12, 12:-12, :])**2)

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir='autoencoder_checkpointing',
                          log_dir='autoencoder_log_dir',
                          debug=False)
예제 #21
0
def train_identify_medium(data_dir, config):
    """Build train/test datasets and run the identify-medium training loop.

    Args:
        data_dir: directory containing 'train' and 'test' sub-directories of
            tfrecord files.
        config: dict of keyword arguments forwarded to ``build_training``;
            also serialised to ``config.json`` in the checkpoint directory.
    """
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=False,
                                         logical_per_physical_factor=1,
                                         memory_limit=11000)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    print('\nEXAMPLE FROM TEST DATASET:')
    # Bug fix: iterate the *test* dataset, which is what the message above
    # announces (the original looped over train_dataset here).
    for (graph, img, c) in iter(test_dataset):
        print(img)
        print('max: ', tf.math.reduce_max(img))
        break

    with strategy.scope():
        train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)

    # Create the checkpoint directory up front so config.json can be written
    # into it before the training loop runs.
    if checkpoint_dir is not None:  # originally from vanilla_training_loop
        os.makedirs(checkpoint_dir, exist_ok=True)

    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    print('\nvanilla training loop...')
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #22
0
def train_ae_3d(data_dir, config):
    """Train the 3-D auto-encoder, resuming from the latest checkpoint if any.

    Args:
        data_dir: directory containing 'train' and 'test' sub-directories.
        config: dict of keyword arguments forwarded to ``build_training``;
            also serialised to ``config.json`` in the checkpoint directory.
    """
    # Distribution strategy deliberately disabled in this example:
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    train_one_epoch = build_training(**config)

    log_dir = build_log_dir('new_test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('new_test_checkpointing', config)

    # Persist the configuration next to the checkpoints for reproducibility.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    # Restore the most recent checkpoint when one exists, so training resumes
    # rather than restarting from scratch.
    checkpoint = tf.train.Checkpoint(module=train_one_epoch)
    manager = tf.train.CheckpointManager(
        checkpoint,
        checkpoint_dir,
        max_to_keep=3,
        checkpoint_name=train_one_epoch.model.__class__.__name__)
    latest = manager.latest_checkpoint
    if latest is not None:
        checkpoint.restore(latest)
        print(f"Restored from {latest}")

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
예제 #23
0
def train_disc_graph_vae(data_dir, config):
    """Train the discrete graph VAE on tfrecord data under *data_dir*.

    Args:
        data_dir: directory containing 'train' and 'test' sub-directories.
        config: dict of keyword arguments forwarded to ``build_training``;
            also serialised to ``config.json`` in the checkpoint directory.
    """
    # Distribution strategy deliberately disabled in this example:
    # strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    def _graph_net_kwargs():
        # Encoder and decoder use identical debug-sized settings; return a
        # fresh dict each call so neither side can mutate the other's config.
        return dict(
            inter_graph_connect_prob=0.01,
            reducer=tf.math.unsorted_segment_mean,
            starting_global_size=4,
            node_size=4,  # 64
            edge_size=4,
            crossing_steps=1,
            name=None)

    train_one_epoch = build_training(
        model_parameters=dict(
            encoder_fn=EncoderNetwork3D,
            decode_fn=DecoderNetwork3D,
            embedding_dim=4,  # 64
            num_embedding=4,  # 64
            num_gaussian_components=4,  # 128
            num_token_samples=1,
            num_properties=8,
            temperature=50.,
            beta=1.,
            encoder_kwargs=_graph_net_kwargs(),
            decode_kwargs=_graph_net_kwargs(),
            name=None),
        **config)

    # Fixed output locations; the build_*_dir helpers were left disabled:
    # log_dir = build_log_dir('test_log_dir', config)
    # checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = 'dVAE_log_dir'
    checkpoint_dir = 'dVAE_checkpointing'
    model_dir = 'dVAE_model'

    # Persist the configuration next to the checkpoints for reproducibility.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=100,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=model_dir,
                          debug=False)