def train(create_model, X, y, batch_size, epochs, gpu_count, parameter_server, method):
    """Build a (possibly data-parallel) model, fit it and print throughput.

    Args:
        create_model: callable invoked with no arguments; its result must
            provide ``count_params()`` and, on the single-GPU path,
            ``compile()`` and ``fit()``.
        X: training inputs, forwarded unchanged to ``model.fit``.
        y: training targets, forwarded unchanged to ``model.fit``.
        batch_size: batch size given to ``model.fit`` and ``SamplesPerSec``.
        epochs: number of epochs given to ``model.fit``.
        gpu_count: number of GPUs; a value above 1 selects a multi-GPU
            wrapper, anything else trains the serial model directly.
        parameter_server: ``'gpu'`` places the serial model on ``/gpu:0``,
            any other value on ``/cpu:0`` (multi-GPU path only).
        method: which multi-GPU implementation to use: ``'kuza55'``,
            ``'avolkov1'`` or ``'fchollet'``. Ignored when ``gpu_count <= 1``.

    Raises:
        ValueError: if ``gpu_count > 1`` and ``method`` is not supported.
    """
    supported_methods = ('kuza55', 'avolkov1', 'fchollet')

    if gpu_count > 1:
        # Fail fast: without this check an unknown method would leave
        # ``model`` unbound and only blow up later at ``model.compile``.
        if method not in supported_methods:
            raise ValueError(
                'Unknown multi-GPU method: {!r} (expected one of {})'.format(
                    method, ', '.join(supported_methods)))

        ps_device = '/gpu:0' if parameter_server == 'gpu' else '/cpu:0'

        with tf.device(ps_device):
            serial_model = create_model()

        if method == 'kuza55':
            from keras_tf_multigpu.kuza55 import make_parallel
            model = make_parallel(serial_model, gpu_count=gpu_count,
                                  ps_device=ps_device)
        elif method == 'avolkov1':
            from keras_tf_multigpu.avolkov1 import make_parallel, get_available_gpus
            gpus_list = get_available_gpus(gpu_count)
            model = make_parallel(serial_model, gdev_list=gpus_list,
                                  ps_device=ps_device)
        else:  # method == 'fchollet'
            # requires Keras (2.0.9?)
            from keras.utils import multi_gpu_model
            model = multi_gpu_model(serial_model, gpus=gpu_count)
    else:
        model = serial_model = create_model()

    # The parameter count is taken from the serial model, not the wrapper.
    print('Number of parameters:', serial_model.count_params())

    model.compile(loss='categorical_crossentropy', optimizer='sgd',
                  metrics=['accuracy'])

    gauge = SamplesPerSec(batch_size)
    model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=[gauge])
    gauge.print_results()
def main(argv=None):
    '''Train through a TF queue input pipeline, then evaluate the weights.

    The data returned by ``synthesize_imagenet_dataset`` is fed to the model
    through a queue-based TensorFlow input pipeline pinned to ``/cpu:0``.
    When more than one GPU device is reported the model is wrapped with
    ``make_parallel``. After training, the weights are loaded into a freshly
    built model which is evaluated on the test data.

    NOTE: on every call this docstring is replaced by the module docstring,
    which is then used as the CLI description.

    Args:
        argv: optional list of extra command-line arguments; when given they
            are appended to ``sys.argv`` before ``parser_`` is called.
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so its result must not be assigned; only
    # the side effect on sys.argv matters.
    if argv is not None:
        sys.argv.extend(argv)

    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = getattr(args, 'mgpu', None)
    if mgpu is None:
        mgpu = 0

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    # The global batch size scales with the number of devices.
    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 1000
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = \
        synthesize_imagenet_dataset(num_classes)
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    print('train_samples:', train_samples)
    print('batch_size:', batch_size)
    print('steps_per_epoch:', steps_per_epoch)
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        # NOTE: This bakes the whole dataset into the TF graph and for larger
        # datasets it fails on "ValueError: GraphDef cannot be larger than 2GB".
        # TODO: Load a large dataset via queue from RAM/disk.

        # Samples are flattened for the producer and reshaped back afterwards.
        input_images = tf.constant(x_train.reshape(train_samples, -1))
        print('train_samples', train_samples)
        print('input_images', input_images.shape)

        image, label = tf.train.slice_input_producer(
            [input_images, y_train], shuffle=True)
        # If using num_epochs=epochs, the local variables (and maybe also
        # the global variables) have to be initialized in the session first.
        image = tf.reshape(image, x_train.shape[1:])
        print('image', image.shape)

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(
                distorted_image, max_delta=63)
            distorted_image = tf.image.random_contrast(
                distorted_image, lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        print('x_train_batch:', x_train_batch.shape)

        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[image, label],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     num_threads=8)

        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            # TODO: shouldn't it be: batch_size=batch_size???
            batch_size=train_samples,
            capacity=capacity,
            num_threads=8,
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)
    print('x_train_input', x_train_input)

    gauge = SamplesPerSec(batch_size)
    callbacks = [gauge]

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes)
    x_train_out = model_init.output
    # model_init.summary()

    # The learning rate is scaled together with the global batch size.
    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop. The targets come from the input
    # pipeline, hence target_tensors instead of a `y` argument to fit().
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        # Append rather than replace, so the throughput gauge reported
        # after training stays registered.
        callbacks.append(checkpoint)

    # Start the queue runners.
    sess = KB.get_session()
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])
    tf.train.start_queue_runners(sess=sess)

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    gauge.print_results()

    # NOTE(review): `checkptfile` is not defined in this function; it is
    # presumably a module-level default weights path -- confirm.
    weights_file = checkptfile  # './saved_cifar10_wt.h5'
    if not checkpt_flag:  # empty list
        model.save_weights(checkptfile)

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Evaluate on a batch pulled through the (standardizing) test
        # pipeline so the inputs match what training saw.
        # NOTE(review): the right-hand sides were missing in the source and
        # are reconstructed from the otherwise unused test batch tensors.
        x_proccessed = sess.run(x_test_batch)
        y_proccessed = sess.run(y_test_batch)
        loss, acc = test_model.evaluate(x_proccessed, y_proccessed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
def main(argv=None):
    '''Train a CIFAR-10 classifier, optionally data-parallel on several GPUs.

    Loads CIFAR-10 (from ``datadir`` when given, otherwise through
    ``cifar10.load_data()``), builds the model with ``make_model``, wraps it
    with ``make_parallel`` when ``mgpu`` is greater than 1 or equal to -1,
    trains it with or without real-time augmentation and prints the
    throughput collected by ``SamplesPerSec``.

    NOTE: on every call this docstring is replaced by the module docstring,
    which is then used as the CLI description.

    Args:
        argv: optional list of extra command-line arguments; when given they
            are appended to ``sys.argv`` before ``parser_`` is called.
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so its result must not be assigned; only
    # the side effect on sys.argv matters.
    if argv is not None:
        sys.argv.extend(argv)

    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = getattr(args, 'mgpu', None)
    if mgpu is None:
        mgpu = 0
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    if datadir is not None:
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    else:
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    # Convert the inputs to float32 and divide by 255.
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')

    # with tf.device('/cpu:0'):
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks.append(checkpoint)

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        # The requested batch size is per device; scale it to the global one.
        batch_size = batch_size * ngpus  #
        # lr = lr * ngpus
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init, gpus_list, usenccl=usenccl,
                              syncopt=syncopt, enqueue=enqueue)
        # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
        #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)
    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        # summary() prints by itself and returns None, so it is not wrapped
        # in print().
        model.summary()

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    # The gauge is created after the batch size has reached its final value.
    gauge = SamplesPerSec(batch_size)
    callbacks.append(gauge)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

        # Alternative: fit the model on batches from a custom generator.
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # NOTE: call datagen.fit(x_train) here if feature-wise normalization
        # or ZCA whitening gets enabled above (it computes std, mean, and
        # principal components); with all of them off it is not needed.

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    gauge.print_results()
# Real-time augmentation set-up: only small random shifts and horizontal
# flips are active; every centering / normalization / whitening option is
# switched off.
augmentation_options = dict(
    # dataset-wide statistics
    featurewise_center=False,             # set input mean to 0 over the dataset
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    zca_whitening=False,                  # apply ZCA whitening
    # per-sample statistics
    samplewise_center=False,              # set each sample mean to 0
    samplewise_std_normalization=False,   # divide each input by its std
    # geometric transformations
    rotation_range=0,        # random rotation range (degrees, 0 to 180)
    width_shift_range=0.1,   # random horizontal shift (fraction of width)
    height_shift_range=0.1,  # random vertical shift (fraction of height)
    horizontal_flip=True,    # randomly flip images
    vertical_flip=False,     # randomly flip images
)
datagen = ImageDataGenerator(**augmentation_options)

# No quantities for feature-wise normalization (std, mean, principal
# components for ZCA whitening) are computed here; all of those options
# are disabled above.

# Train on the augmented batches produced by datagen.flow().
train_batches = datagen.flow(x_train, y_train, batch_size=batch_size)
model.fit_generator(
    train_batches,
    steps_per_epoch=x_train.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    workers=4,
    callbacks=[gauge],
)
gauge.print_results()