def main(argv=None):
    # Initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    KB.set_session(tf.Session(config=config))
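    # With visible_device_list pinned to the local rank, each process sees
    # only its own GPU as device 0, so concurrent workers do not collide.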

    # print('LOCAL RANK, OVERALL RANK: {}, {}'.format(hvd.local_rank(),
    #                                                 hvd.rank()))

    ngpus = hvd.size()

    main.__doc__ = __doc__
    # Note: list.extend() returns None, so concatenate to combine the args.
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = _parser(desc)

    num_devices_tfrecord = 1
    height, width = 224, 224  # Image dimensions. Gets resized if not match.
    distort_color = args.distort_color
    data_dir = args.datadir
    batch_size = args.batch_size  # * ngpus
    epochs = args.epochs
    imgs_per_epoch = args.imgs_per_epoch

    # Fit the model using data from the TFRecord data tensors.
    device_minibatches = RecordInputImagenetPreprocessor.device_minibatches
    images_tfrecord, labels_tfrecord, nrecords = device_minibatches(
        num_devices_tfrecord,
        data_dir,
        batch_size,
        height,
        width,
        distort_color,
        val=False)
    images_tfrecord = images_tfrecord[0]
    labels_tfrecord = labels_tfrecord[0]

    # CASTING FOR KERAS
    # labels[device_num] = tf.cast(labels_tfrecord, dtype)
    nclasses = 1000
    labels_tfrecord = tf.one_hot(labels_tfrecord, nclasses)
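    # A possible alternative (an assumption, not used in this script): keep
    # the integer labels and compile the model with
    # loss='sparse_categorical_crossentropy' to skip the one-hot conversion.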

    nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords
    steps_per_epoch = nimgs_to_use // batch_size // hvd.size()
    # steps_per_epoch = 100
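    # Dividing by hvd.size() gives each worker its own share of the batches,
    # so collectively the workers cover roughly one pass over the data.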

    # batch_shape = images_tfrecord.get_shape().as_list()
    # images = Input(tensor=images_tfrecord, batch_shape=x_batch_shape)
    images = Input(tensor=images_tfrecord)
    model = ResNet50(input_tensor=images, weights=None)
    if hvd.rank() == 0:
        model.summary()

        print('Num images: {}'.format(nrecords))

        if nimgs_to_use < nrecords:
            print('Using {} images per epoch'.format(nimgs_to_use))

        # print('IMAGES_TFRECORD: {}'.format(images_tfrecord))
        # print('LABELS_TFRECORD: {}'.format(labels_tfrecord))

    # Add Horovod Distributed Optimizer from nvcnn.py
    # momentum = 0.9
    # lr = 0.1
    # learning_rate = tf.train.exponential_decay(
    #             lr,
    #             self.global_step,
    #             decay_steps=FLAGS.lr_decay_epochs * nstep_per_epoch,
    #             decay_rate=FLAGS.lr_decay_rate,
    #             staircase=True)
    # opt = tf.train.MomentumOptimizer(self.learning_rate, momentum,
    #                                  use_nesterov=True)

    # lr = 0.001 * ngpus
    # opt = tf.train.AdamOptimizer()
    # opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    # opt = KO.TFOptimizer(opt)  # Required for tf.train based optimizers

    opt = KO.Adam()
    opt = hvd_keras.DistributedOptimizer(opt)
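    # A common Horovod convention (assumed here, not part of the original
    # script) is to scale the base learning rate by the worker count, e.g.:
    # opt = KO.Adam(lr=0.001 * hvd.size())
    # opt = hvd_keras.DistributedOptimizer(opt)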

    model.compile(
        loss='categorical_crossentropy',
        optimizer=opt,
        # metrics=['accuracy'],
        target_tensors=[labels_tfrecord])

    # Broadcast variables from rank 0 to all other processes.
    KB.get_session().run(hvd.broadcast_global_variables(0))
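    # Broadcasting guarantees all workers start from identical weights;
    # otherwise the averaged gradients would be applied to diverging models.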

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(ngpus * batch_size)]

    # RecordInput is a yield op which doesn't use queue runners or queues.
    # Start the queue runners.
    # sess = KB.get_session()

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(sess, coord)

    start_time = time.time()
    model.fit(steps_per_epoch=steps_per_epoch,
              epochs=epochs,
              callbacks=callbacks,
              verbose=1)
    # verbose=hvd.rank() == 0)
    elapsed_time = time.time() - start_time

    if hvd.rank() == 0:
        print('[{}] finished in {} s'.format('TRAINING',
                                             round(elapsed_time, 3)))
        # loss = model.evaluate(None, None, steps=steps_per_epoch_val)

        images_tfrecord_val, labels_tfrecord_val, nrecords_val = \
            device_minibatches(num_devices_tfrecord, data_dir, batch_size,
                               height, width, distort_color, val=True)
        images_tfrecord_val = images_tfrecord_val[0]
        labels_tfrecord_val = labels_tfrecord_val[0]
        labels_tfrecord_val = tf.one_hot(labels_tfrecord_val, nclasses)

        # print('IMAGES_TFRECORD_VAL: {}'.format(images_tfrecord_val))
        # print('labels_tfrecord_val: {}'.format(labels_tfrecord_val))

        steps_per_epoch_val = nrecords_val // batch_size

        images_val = Input(tensor=images_tfrecord_val)
        model_val = model
        model_val.layers[0] = KL.InputLayer(input_tensor=images_val)
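        # Note: swapping layers[0] in-place is a fragile hack; rebuilding the
        # model on the new input tensor would be the safer route.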
        model_val.compile(loss='categorical_crossentropy',
                          optimizer=opt,
                          metrics=['accuracy'],
                          target_tensors=[labels_tfrecord_val])
        # model.summary()
        metrics = model_val.evaluate(x=None, y=None, steps=steps_per_epoch_val)

        print('\nNum images evaluated, steps: {}, {}'.format(
            nrecords_val, steps_per_epoch_val))
        print('\nTest loss, acc: {}, {}'.format(*metrics))

    # Clean up the TF session.
    # coord.request_stop()
    # coord.join(threads)

    KB.clear_session()  # do this for Horovod
# Example 2
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        steps_per_epoch = train_samples // batch_size
    else:
        steps_per_epoch = int(
            round(float(train_samples) / batch_size / hvdsize + 0.5))
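    # With the imgspersec option each worker runs a full epoch worth of
    # steps (throughput benchmark); otherwise the steps are split across
    # workers so an epoch covers the data roughly once (epoch-time speedup).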

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()
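    # A hedged sketch (not in the original): the dataset could also be
    # sharded so each Horovod worker draws from a distinct subset, e.g.
    # dataset = dataset.shard(hvdsize, hvd.rank()) before shuffle/batch.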

    ldict = make_shared_layers_dict(img_chns, img_rows, img_cols, batch_size,
                                    filters, num_conv, intermediate_dim,
                                    latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)
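    # TFOptimizer wraps a native tf.train optimizer so Keras' training loop
    # can drive it; hvd.DistributedOptimizer averages the gradients across
    # workers via allreduce before they are applied.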

    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    if hvd.rank() == 0:
        x = Input(shape=original_img_size)
        vae_val = make_vae(ldict, x)
        vae_val.compile(optimizer=opt, loss=None)
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        x = Input(shape=original_img_size)
        z_mean, _ = get_encoded(ldict, x)
        encoder = Model(x, z_mean)
        # :  :type encoder: Model

        decoder_input = Input(shape=(latent_dim, ))
        x_decoded_mean_squash = get_decoded(ldict, decoder_input)
        generator = Model(decoder_input, x_decoded_mean_squash)
        # :  :type generator: Model

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
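                # The sample is tiled to a full batch because the decoder
                # graph was apparently built with a fixed batch_size.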
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    # (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    # with tf.device('/cpu:0'):
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    # model_init = partial(make_model, x_train.shape, num_classes,
    #                      filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks = [checkpoint]

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus
        lr = lr * ngpus
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init,
                              gpus_list,
                              usenccl=usenccl,
                              syncopt=syncopt,
                              enqueue=enqueue)
        # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
        #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        model.summary()  # summary() prints itself; print() would show None

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

        # Alternative: fit the model on batches from a custom generator.
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    model_init.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])
    metrics = model_init.evaluate(x=x_test, y=y_test, batch_size=batch_size)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
# Example 4
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_labels = tf.constant(y_test)  # already in proper shape
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, test_labels], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operations.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2,
                                                       upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            # batch_size=batch_size, # if using tensors
            capacity=capacity,
            # num_threads=8,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()
    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate the model per the API below; otherwise it
        # doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)
    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])
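    # target_tensors wires the label tensor from the queue directly into the
    # loss, which is why fit() below is called without x or y arguments.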

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.
    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag:  # no checkpoint callback saved weights during fit
        model.save_weights(weights_file)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Need to run x_test through per_image_standardization otherwise
        # results get messed up.
        x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
        # DEBUGGING
        # xdiff = np.abs(x_test - x_processed)
        # print('MAX XDIFF: {}'.format(np.max(xdiff)))
        # ydiff = np.abs(y_test - y_processed)
        # print('y_test: {}'.format(y_test[0:5, :]))
        # print('y_processed: {}'.format(y_processed[0:5, :]))
        # print('ydiff: {}'.format(ydiff[-10:, :]))
        # print('MAX YDIFF: {}'.format(np.max(np.sum(ydiff))))

        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    # # Demonstrate that the model works using TF pipeline directly.
    # # In tf.train.batch for test data change batch_size=batch_size
    # # instead of train_samples. Uncomment below and comment out above.
    # val_samples = x_test.shape[0]
    # steps_per_epoch_val = int(np.ceil(val_samples / float(batch_size)))
    # images_val = KL.Input(tensor=x_test_batch)
    # test_model = make_model(images_val, num_classes,
    #                         weights_file)
    # test_model = Model(inputs=[images_val], outputs=[test_model.output])
    # test_model.compile(
    #     loss='categorical_crossentropy',
    #     optimizer=opt,
    #     metrics=['accuracy'],
    #     target_tensors=[y_test_batch])
    # loss, acc = test_model.evaluate(x=None, y=None,
    #                                 steps=steps_per_epoch_val)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if reducing across workers at each iteration step. Using a
    # tf-queue or Dataset is preferable to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(original_img_size, img_chns,
                                                 img_rows, img_cols,
                                                 batch_size, filters, num_conv,
                                                 intermediate_dim, latent_dim,
                                                 epsilon_std)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
# Example 6
def main(argv=None):
    '''Multigpu example using Keras for Cifar10 training.'''
    argv = sys.argv if argv is None else sys.argv + argv
    # CLI parser
    args = parser_(main.__doc__)

    logdevp = args.logdevp

    gpu_options = tf.GPUOptions(allow_growth=True)
    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True,
            gpu_options=gpu_options)
        # config.gpu_options.allow_growth = True
        KB.set_session(tf.Session(config=config))
    else:
        config = tf.ConfigProto(gpu_options=gpu_options)
        KB.set_session(tf.Session(config=config))

    mgpu = 0 if args.mgpu is None else args.mgpu
    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    syncopt = args.syncopt

    checkpt = args.checkpt
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size * ngpus if ngpus > 1 else args.batch_size
    num_classes = 10
    epochs = args.epochs

    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print('Using real-time data augmentation.')
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        # x_train_input = KL.Input(shape=x_train.shape[1:])
        model_init = make_model(
            x_train.shape[1:], num_classes, filepath)
    else:
        print('USING TF DATASET API.')
        dataset = wrap_as_tfdataset(
            x_train, y_train, args.aug, batch_size)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)

        model_init_ = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init_.output

        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001
    if ngpus > 1:
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        lr = lr * ngpus

        # Data-Parallelize the model via function or class.
        if args.mgpu_type == 'kerasmgpu':
            gpus_list_int = get_available_gpus(
                ngpus, list_type=GPUListType.int_id)
            model = ModelKerasMGPU(model_init, gpus_list_int)
        else:
            model = ModelMGPU(
                serial_model=model_init, gdev_list=gpus_list)

        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)  # @IgnorePep8 pylint: disable=unexpected-keyword-arg

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        model.summary()  # summary() prints itself; print() would show None

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    callbacks = []
    if checkpt:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            callbacks=callbacks)

    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'
          .format('TRAINING', round(elapsed_time, 3)))

    test_model = model_init
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')
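        # Copying the trained weights into a freshly built model detaches
        # evaluation from the Dataset-bound input tensor, so plain numpy
        # arrays can be fed to evaluate().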

    test_model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'])

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    steps_per_epoch = int(round(float(train_samples) / batch_size + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(img_chns, img_rows, img_cols, batch_size,
                                    filters, num_conv, intermediate_dim,
                                    latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae_serial = make_vae(ldict, x)
    # :  :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    x = Input(shape=original_img_size)
    vae_val = make_vae(ldict, x)
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    x = Input(shape=original_img_size)
    z_mean, _ = get_encoded(ldict, x)
    encoder = Model(x, z_mean)
    # :  :type encoder: Model

    decoder_input = Input(shape=(latent_dim, ))
    x_decoded_mean_squash = get_decoded(ldict, decoder_input)
    generator = Model(decoder_input, x_decoded_mean_squash)
    # :  :type generator: Model

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian
    # To produce values of the latent variables z, since the prior of the
    # latent space is Gaussian
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
# Example 8
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # :  :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)
    # validation_data=(x_test, None) is not accurate for mgpu; use vae_val.

    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian
    # To produce values of the latent variables z, since the prior of the
    # latent space is Gaussian
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement)
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize
    # validations_steps = test_samples // batch_size
    print(train_samples, 'train samples')
    print(test_samples, 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])
        # label = tf.one_hot(label, num_classes)

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_labels = tf.constant(y_test)  # already in proper shape
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, test_labels], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operations.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2,
                                                       upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        x_test_batch, y_test_batch = tf.train.batch([test_image, test_label],
                                                    batch_size=test_samples,
                                                    capacity=capacity,
                                                    num_threads=1,
                                                    name='test_batch',
                                                    shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()

    model = Model(inputs=[x_train_input], outputs=[x_train_out])
    lr = 0.0001 * hvdsize
    # opt = RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)  # Required for tf.train based optimizers

    # The session must be obtained after the optimizer is created, so that
    # broadcast_global_variables below also covers the optimizer variables.
    sess = KB.get_session()

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    if hvd.rank() == 0:
        model.summary()

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    sess.run(hvd.broadcast_global_variables(0))  # horovod.tensorflow as hvd

    if checkpt_flag and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True)
        callbacks.append(checkpoint)

    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Start the queue runners that feed the train/test batch tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)
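    # The Coordinator is used after training to stop these queue-runner
    # threads cleanly via coord.request_stop() and coord.join(threads).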

    val_in_train = False  # Validation during fit() is not wired up here.
    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,
        # validation_steps=validation_steps if val_in_train else None,
        validation_steps=None,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks,
        # verbose is truthy only on rank 0 to avoid duplicate progress bars.
        verbose=hvd.rank() == 0)
    elapsed_time = time.time() - start_time

    if hvd.rank() == 0:
        print('[{}] finished in {} s'.format('TRAINING',
                                             round(elapsed_time, 3)))

    weights_file = checkptfile  # './saved_cifar10_wt.h5'
    if not checkpt_flag and hvd.rank() == 0:
        model.save_weights(checkptfile)

    # KB.clear_session()  # don't clear session just yet.

    if hvd.rank() == 0:
        # Second Session. Demonstrate that the model works
        # test_model = make_model(x_test.shape[1:], num_classes,
        #                         weights_file=weights_file)
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.load_weights(weights_file)
        test_model.compile(loss='categorical_crossentropy',
                           optimizer=opt,
                           metrics=['accuracy'])

        if data_augmentation:
            x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
            metrics = test_model.evaluate(x_processed, y_processed)
        else:
            metrics = test_model.evaluate(x_test, y_test)

        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()


def main(argv=None):
    '''Train a simple deep CNN on the CIFAR10 small images dataset on multigpu
    (and optionally multinode+multigpu) systems via Horovod implementation.
    '''
    # list.extend returns None, so extend sys.argv in place first.
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__
    # CLI parser
    # args = parser_(argv[1:], desc)
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
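    # E.g. nranks_per_gpu=2 maps local ranks 0,1 -> GPU 0 and local ranks
    # 2,3 -> GPU 1, oversubscribing each GPU with two processes.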
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to local rank. Typically one GPU per process unless
    # oversubscribing GPUs (experimental MPS). In model parallelism it's
    # possible to have multiple GPUs per process.
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(gpu_local_rank))
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = args.checkpt
    filepath = checkpt

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs

    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    if hvd.rank() == 0:
        # Download only on rank 0, i.e. in a single process.
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    # The allreduce doubles as a barrier: non-root ranks block here until
    # rank 0 has finished downloading the dataset.
    hvd_keras.allreduce([0], name="Barrier")
    if hvd.rank() != 0:
        # Data should be downloaded already so load in the other ranks.
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize
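    # Each rank consumes a 1/hvdsize share of the data per epoch; e.g. for
    # CIFAR10 with 50000 train samples, batch_size=32, and 4 ranks:
    # 50000 // 32 // 4 = 390 steps per rank per epoch.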

    print_rank0('{} train samples'.format(train_samples), hvd)
    print_rank0('{} test samples'.format(test_samples), hvd)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print_rank0('Using real-time data augmentation.', hvd)
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images horizontally
                horizontal_flip=True,
                # randomly flip images vertically
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)
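            # fit() is only required when featurewise_center,
            # featurewise_std_normalization, or zca_whitening is enabled;
            # with the flags above the computed statistics go unused.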

        model = make_model(x_train.shape[1:], num_classes, filepath)
    else:
        print_rank0('USING TF DATASET API.', hvd)
        dataset = wrap_as_tfdataset(x_train,
                                    y_train,
                                    args.aug,
                                    batch_size,
                                    gpu_local_rank,
                                    prefetch_to_device=True,
                                    comm=hvd_keras)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)
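        # Input(tensor=...) wires the Dataset pipeline directly into the
        # Keras graph, so fit() below needs no x array.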

        model_init = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init.output

        model = Model(inputs=[x_train_input], outputs=[x_train_out])

    # Let's train the model using RMSprop
    lr = 0.0001 * hvdsize

    # opt = KO.RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)

    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])
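    # With the Dataset API, target_tensors binds the label tensor as the
    # training target, so no y array is fed to fit() either.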

    if hvd.rank() == 0:
        model.summary()

    callbacks = []
    if checkpt and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='loss',
                                     mode='min',
                                     verbose=1,
                                     save_best_only=True)
        callbacks.append(checkpoint)

    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    KB.get_session().run(hvd.broadcast_global_variables(0))

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test) if hvd.rank() == 0 else None,
            verbose=hvd.rank() == 0,
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  verbose=hvd.rank() == 0,
                  callbacks=callbacks)

    if hvd.rank() != 0:
        return

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.compile(loss=keras_losses.categorical_crossentropy,
                           optimizer=opt,
                           metrics=['accuracy'])
        print('Setting weights for evaluation with the Dataset API...')
        test_model.set_weights(model.get_weights())

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))