class DeepGalaxyTraining(object):
    """Training harness for the DeepGalaxy image classifier.

    Wraps the full workflow: data loading (through ``DataIO``), model
    construction (EfficientNet-B4 backbone with an optional Gaussian-noise
    augmentation layer), compilation, training, and evaluation.  Supports
    three execution modes:

    * single process / single GPU (default),
    * Horovod distributed training (``distributed_training = True``),
    * Keras multi-GPU data parallelism (``multi_gpu_training = True``).

    NOTE(review): several methods reference ``hvd``; ``initialize()`` only
    imports horovod *function-locally*, so those methods rely on a
    module-level ``import horovod...as hvd`` existing elsewhere in this
    file — confirm against the full source.
    """

    def __init__(self):
        self.data_io = DataIO()
        self.model = None
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None
        self.num_classes = 0
        self.epochs = 50
        self.batch_size = 8
        self.use_noise = True
        self.distributed_training = False
        self.multi_gpu_training = False
        self._multi_gpu_model = None
        self._n_gpus = 1
        self.callbacks = []
        self.logger = None
        self.log_level = logging.DEBUG
        # Placeholder only; load_data() overwrites this with the actual
        # per-sample shape of the loaded dataset.
        self.input_shape = (512, 512, 3)  # (256, 256, 3)
        self._t_start = 0
        self._t_end = 0

    def get_flops(self, model):
        """Return the total number of floating-point ops of ``model``.

        Uses the TF1-compat profiler (``tf.RunMetadata`` /
        ``tf.profiler`` do not work directly in TF2), profiling the
        Keras session graph.
        """
        run_meta = tf.compat.v1.RunMetadata()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
        # We use the Keras session graph in the call to the profiler.
        flops = tf.compat.v1.profiler.profile(
            graph=tf.compat.v1.keras.backend.get_session().graph,
            run_meta=run_meta, cmd='op', options=opts)
        return flops.total_float_ops

    def initialize(self):
        """Set up logging and (optionally) Horovod distributed training.

        When ``self.distributed_training`` is True: initializes Horovod,
        registers the broadcast / metric-averaging callbacks, and pins one
        CUDA device per MPI rank.  Falls back to single-process training
        (flag reset to False) if horovod cannot be imported.
        """
        self.logger = logging.getLogger('DeepGalaxyTrain')
        self.logger.setLevel(self.log_level)
        self.logger.addHandler(logging.FileHandler('train_log.txt'))
        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                if hvd.rank() == 0:
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))
                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d'
                      % (hvd.rank(), hvd.local_rank()))
                self.logger.info('hvd_rank = %d, hvd_local_rank = %d'
                                 % (hvd.rank(), hvd.local_rank()))
                # Bind a CUDA device to one MPI process (has no effect if
                # GPUs are not used).
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                # Allow GPU memory to grow on demand instead of grabbing
                # the whole device up front.
                gpus = tf.config.experimental.list_physical_devices('GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
            except ImportError:
                print('Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
        else:
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))

    def load_data(self, data_fn, test_size=0.3, random=True):
        """Load the dataset and optionally split it into train/test sets.

        Parameters
        ----------
        data_fn : dataset file name handed to ``DataIO``.
        test_size : fraction held out for testing; ``<= 0`` disables the
            split (everything becomes training data).
        random : currently unused — the split always uses a fixed seed
            (``random_state=42``); TODO confirm intent.

        Side effects: sets ``x_train``/``y_train`` (and ``x_test``/
        ``y_test`` if split), ``input_shape``, and ``num_classes``.
        """
        if not self.distributed_training:
            self.logger.info(
                'Loading the full dataset since distributed training is disabled ...'
            )
            X, Y = self.data_io.load_all(data_fn)
        else:
            self.logger.info(
                'Loading part of the dataset since distributed training is enabled ...'
            )
            # Each rank loads only its shard of the data.
            X, Y = self.data_io.load_partial(data_fn, hvd.size(), hvd.rank())
        self.logger.debug('Shape of X: %s' % str(X.shape))
        self.logger.debug('Shape of Y: %s' % str(Y.shape))
        # Update the input_shape setting according to the loaded data.
        self.input_shape = X.shape[1:]
        if test_size > 0:
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=test_size, random_state=42)
            self.x_train = x_train
            self.x_test = x_test
            self.y_train = y_train
            self.y_test = y_test
            print("shapes:", self.x_train.shape, self.x_test.shape,
                  self.y_train.shape, self.y_test.shape)
        else:
            self.x_train = X
            self.y_train = Y
            # BUG FIX: the original printed x_test/y_test shapes
            # unconditionally, which raised AttributeError (x_test is
            # None) whenever test_size <= 0.
            print("shapes:", self.x_train.shape, self.y_train.shape)
        self.num_classes = np.unique(Y).shape[0]
        self.logger.debug('Number of classes: %d' % self.num_classes)

    def load_model(self):
        """Build (or load a cached) EfficientNet-B4 model and compile it.

        The backbone is cached to ``efn_b4.h5`` by rank 0 so that all
        workers construct an identical graph.  A Lambda layer replicates
        the single input channel to 3 (``tf.repeat`` does not exist
        before TF 1.15, hence ``repeat_elements``); with ``use_noise``
        a GaussianNoise layer is inserted as augmentation.
        """
        if not os.path.isfile('efn_b4.h5'):
            base_model = efn.EfficientNetB4(
                weights=None, include_top=True,
                input_shape=(self.input_shape[0], self.input_shape[1], 3),
                classes=self.num_classes)
            # BUG FIX: the original tested `hvd.rank == 0` (the bound
            # function object vs. 0), which is always False, so the
            # backbone cache was never written.  Call the function.
            if self.distributed_training is True and hvd.rank() == 0:
                base_model.save('efn_b4.h5')
        else:
            base_model = tf.keras.models.load_model('efn_b4.h5', compile=False)
        print(base_model.summary())

        model = tf.keras.models.Sequential()
        # Replicate the single channel to 3 for the ImageNet-style backbone.
        model.add(tf.keras.layers.Lambda(
            lambda x: tf.keras.backend.repeat_elements(x, 3, axis=-1),
            input_shape=self.input_shape))
        if self.use_noise:
            model.add(tf.keras.layers.GaussianNoise(
                0.5, input_shape=self.input_shape))
        model.add(base_model)

        if self.distributed_training is True:
            # Scale the learning rate with the number of workers.
            opt = tf.keras.optimizers.Adadelta(1.0 * hvd.size())
            # Horovod: add Horovod Distributed Optimizer.
            opt = hvd.DistributedOptimizer(opt)
        else:
            opt = tf.keras.optimizers.Adam()

        if self.multi_gpu_training is True:
            # Probe the number of locally visible GPUs.
            from tensorflow.python.client import device_lib
            local_device_protos = device_lib.list_local_devices()
            gpu_list = [x.name for x in local_device_protos
                        if x.device_type == 'GPU']
            self._n_gpus = len(gpu_list)
            print('Parallalizing the model on %d GPUs...' % self._n_gpus)
            parallel_model = tf.keras.utils.multi_gpu_model(
                model, gpus=self._n_gpus)
            parallel_model.compile(
                loss=tf.keras.losses.sparse_categorical_crossentropy,
                optimizer=opt,
                metrics=['sparse_categorical_accuracy'])
            self._multi_gpu_model = parallel_model
            self.model = model
            print(parallel_model.summary())
        else:
            model.compile(
                loss=tf.keras.losses.sparse_categorical_crossentropy,
                optimizer=opt,
                metrics=['sparse_categorical_accuracy'])
            self.model = model
            if self.distributed_training is True:
                # Only rank 0 prints, to keep distributed logs readable.
                if hvd.rank() == 0:
                    print(model.summary())
            else:
                print(model.summary())

    def fit(self):
        """Train the model on the loaded data, timing the run.

        Distributed mode: trains with Horovod callbacks, only rank 0 is
        verbose, and memory usage is reported afterwards.  Local mode:
        trains either the multi-GPU wrapper (with the batch size scaled
        by the GPU count) or the plain model, then prints elapsed time
        and the model's FLOP count.  Ctrl+C interrupts cleanly.
        """
        if self.distributed_training is True:
            try:
                self._t_start = datetime.now()
                self.model.fit(self.x_train, self.y_train,
                               batch_size=self.batch_size,
                               epochs=self.epochs,
                               callbacks=self.callbacks,
                               verbose=1 if hvd.rank() == 0 else 0,
                               validation_data=(self.x_test, self.y_test))
                self._t_end = datetime.now()
            except KeyboardInterrupt:
                print('Terminating due to Ctrl+C...')
            finally:
                # memory_info()[0] is RSS in bytes; convert to GB.
                print("On hostname {0} - After training using {1} GB of memory".format(
                    socket.gethostname(),
                    psutil.Process(os.getpid()).memory_info()[0] / 1024 / 1024 / 1024))
                self._t_end = datetime.now()
                if hvd.rank() == 0:
                    self.logger.info(
                        "On hostname {0} - After training using {1} GB of memory\n".format(
                            socket.gethostname(),
                            psutil.Process(os.getpid()).memory_info()[0] / 1024 / 1024 / 1024))
                    self.logger.info('Time is now %s\n' % datetime.now())
        else:
            try:
                if self.multi_gpu_training is True:
                    self._t_start = datetime.now()
                    self._multi_gpu_model.fit(
                        self.x_train, self.y_train,
                        batch_size=self.batch_size * self._n_gpus,
                        epochs=self.epochs,
                        verbose=1,
                        validation_data=(self.x_test, self.y_test))
                    self._t_end = datetime.now()
                else:
                    self._t_start = datetime.now()
                    self.model.fit(
                        self.x_train, self.y_train,
                        batch_size=self.batch_size,
                        epochs=self.epochs,
                        verbose=1,
                        validation_data=(self.x_test, self.y_test))
                    self._t_end = datetime.now()
            except KeyboardInterrupt:
                pass
            finally:
                self._t_end = datetime.now()
                print('Elapsed time:', self._t_end - self._t_start)
                print('Saving model...')
                print(self.get_flops(self.model))

    def save_model(self):
        """Save the trained model; the file name encodes the run config.

        In distributed mode only rank 0 writes, and the worker count is
        embedded in the file name.
        """
        if self.distributed_training is True:
            if hvd.rank() == 0:
                if self.use_noise is True:
                    self.model.save('model_hvd_bw_%d_B0_with_noise_n_p_%d.h5'
                                    % (self.input_shape[0], hvd.size()))
                else:
                    self.model.save('model_hvd_bw_%d_B0_no_noise_%d_nodes.h5'
                                    % (self.input_shape[0], hvd.size()))
        else:
            if self.use_noise is True:
                self.model.save('model_bw_%d_B0_with_noise.h5'
                                % (self.input_shape[0]))
            else:
                self.model.save('model_bw_%d_B0_no_noise.h5'
                                % (self.input_shape[0]))

    def validate(self):
        """Print precision/recall/F1 and the confusion matrix on the test set."""
        y_pred = self.model.predict(self.x_test)
        print(precision_recall_fscore_support(self.y_test,
                                              np.argmax(y_pred, axis=1)))
        print(confusion_matrix(self.y_test, np.argmax(y_pred, axis=1)))

    def finalize(self):
        """Hook for cleanup; currently a no-op."""
        pass