def calculate_speed(self, t0, t_after_deltas, t_after_update, num_replicas,
                    verbose=False):
    effective_batch_size = self.get_effective_batch_size(num_replicas)
    t_calculate = t_after_deltas - t0
    t_sync = t_after_update - t_after_deltas
    t_tot = t_after_update - t0

    examples_per_sec = effective_batch_size / t_tot
    frac_calculate = t_calculate / t_tot
    frac_sync = t_sync / t_tot

    print_str = (
        '{:.2E} Examples/sec | {:.2E} sec/batch '.format(
            examples_per_sec, t_tot)
        + '[{:.1%} calc., {:.1%} sync.]'.format(frac_calculate, frac_sync))
    print_str += '[batch = {} = {}*{}] [lr = {:.2E} = {:.2E}*{}]'.format(
        effective_batch_size, self.batch_size, num_replicas,
        self.get_effective_lr(num_replicas), self.lr, num_replicas)
    if verbose:
        g.write_unique(print_str)
    return print_str
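
# Hedged, standalone sketch of the timing decomposition reported by
# calculate_speed() above: each step is split into a gradient-calculation
# interval (t0 -> t_after_deltas) and a weight-synchronization interval
# (t_after_deltas -> t_after_update). The helper name and all numbers below
# are hypothetical, for illustration only.
def _example_speed_breakdown(effective_batch_size=2048,
                             t0=0.0, t_after_deltas=0.8, t_after_update=1.0):
    t_tot = t_after_update - t0
    examples_per_sec = effective_batch_size / t_tot            # throughput
    frac_calculate = (t_after_deltas - t0) / t_tot             # time computing deltas
    frac_sync = (t_after_update - t_after_deltas) / t_tot      # time averaging weights
    return examples_per_sec, frac_calculate, frac_sync         # e.g. (2048.0, 0.8, 0.2)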
def get_effective_lr(self, num_replicas):
    effective_lr = self.lr * num_replicas
    if effective_lr > self.max_lr:
        g.write_unique(
            'Warning: effective learning rate set to {}, '.format(
                effective_lr)
            + 'larger than maximum {}. Clipping.'.format(self.max_lr))
        effective_lr = self.max_lr
    return effective_lr
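
# Minimal sketch of the linear learning-rate scaling rule implemented by
# get_effective_lr() above: the base lr is multiplied by the number of
# data-parallel replicas and clipped at max_lr. The helper name and values
# are hypothetical placeholders.
def _example_effective_lr(lr=2.0e-5, num_replicas=32, max_lr=1.0e-3):
    return min(lr * num_replicas, max_lr)   # 6.4e-4 here; clipped only if above max_lr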
def mpi_train(conf, shot_list_train, shot_list_validate, loader,
              callbacks_list=None, shot_list_test=None):
    loader.set_inference_mode(False)

    # TODO(KGF): this is not defined in conf.yaml, but added to processed dict
    # for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py"
        # except: "Rate should be set to `rate = 1 - keep_prob`"
        # Also suppresses warnings from "keras/optimizers.py"
        # Does NOT suppress warnings from "/tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF>v1.13.0 (esp. v1.14.0), the next line prompts a ton of
    # deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different than C-based info diagnostics, e.g.:
    # 2019-11-06 18:27:31.698908: I ... dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top level __init__.py

    # load the latest epoch we did. Returns 0 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)
    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model, optimizer, g.comm, batch_generator,
                         batch_size, lr=lr, warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum, conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)

    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1, write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        fr = open('model_architecture.log', 'a')
        ori = sys.stdout
        sys.stdout = fr
        mpi_model.model.summary()
        sys.stdout = ori
        fr.close()
        mpi_model.model.summary()

    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()

    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/ MPI barrier?
        # g.flush_all_inorder()
        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        if ('monitor_test' in conf['callbacks'].keys()
                and conf['callbacks']['monitor_test']):
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join(
                ['val_roc_{} = {}'.format(t, roc)
                 for t, roc in zip(times, areas)]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join(
                    ['test_roc_{} = {}'.format(t, roc)
                     for t, roc in zip(times, areas)]) + '\n')

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                if ('monitor_test' in conf['callbacks'].keys()
                        and conf['callbacks']['monitor_test']):
                    print("No improvement, saving model weights anyways")
                else:
                    print("Not saving model weights")
                    specific_builder.delete_model_weights(
                        train_model, int(round(e)))

            # tensorboard
            val_generator = partial(loader.training_batch_generator,
                                    shot_list=shot_list_validate)()
            val_steps = 1
            tensorboard.on_epoch_end(val_generator, val_steps,
                                     int(round(e)), epoch_logs)

        stop_training = g.comm.bcast(stop_training, root=0)
        g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
            e, num_epochs))
        # TODO(KGF): compare to old diagnostic:
        # g.write_unique("end epoch {}".format(e_old))
        if stop_training:
            g.write_unique("Stopping training due to early stopping")
            break

    if g.task_index == 0:
        callbacks.on_train_end()
        tensorboard.on_train_end()

    mpi_model.close()
def train_epoch(self):
    '''
    Perform distributed mini-batch SGD for one epoch.

    It takes the batch iterator function and a NN model from the MPIModel
    object, fetches mini-batches in a while-loop until the number of samples
    seen by the ensemble of workers (num_so_far) exceeds the training dataset
    size (num_total).

    NOTE: "sample" = "an entire shot" within this description

    During each iteration, the gradient updates (deltas) and the loss are
    calculated for each model replica in the ensemble, weights are averaged
    over the ensemble, and the new weights are set.

    It performs calls to the MPIModel.get_deltas and
    MPIModel.set_new_weights methods.

    Argument list: Empty

    Returns:
      - step: final iteration number
      - ave_loss: model loss averaged over iterations within this epoch
      - curr_loss: training loss averaged over replicas at the final iteration
      - num_so_far: the cumulative number of samples seen by the ensemble of
        replicas up to the end of the final iteration (step) of this epoch
      - effective_epochs: cumulative (possibly fractional) number of epochs
        completed, i.e. num_so_far / num_total

    Intermediate outputs and logging: debug printout of task_index (MPI),
    epoch number, number of samples seen to a current epoch, average
    training loss
    '''
    verbose = False
    first_run = True
    step = 0
    loss_averager = Averager()
    t_start = time.time()

    timeline_prof = False
    if self.conf is not None and self.conf['training']['timeline_prof']:
        timeline_prof = True
    step_limit = 0
    if self.conf is not None and self.conf['training']['step_limit'] > 0:
        step_limit = self.conf['training']['step_limit']

    batch_iterator_func = self.batch_iterator_func
    num_total = 1
    ave_loss = -1
    curr_loss = -1
    t0 = 0
    t1 = 0
    t2 = 0

    while ((self.num_so_far - self.epoch * num_total) < num_total
           or step < self.num_batches_minimum):
        if step_limit > 0 and step > step_limit:
            print('reached step limit')
            break
        try:
            (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
             num_total, is_warmup_period) = next(batch_iterator_func)
        except StopIteration:
            g.print_unique("Resetting batch iterator.")
            self.num_so_far_accum = self.num_so_far_indiv
            self.set_batch_iterator_func()
            batch_iterator_func = self.batch_iterator_func
            (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
             num_total, is_warmup_period) = next(batch_iterator_func)
        self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr

        # if batches_to_reset:
        #     self.model.reset_states(batches_to_reset)

        warmup_phase = (step < self.warmup_steps and self.epoch == 0)
        num_replicas = 1 if warmup_phase else self.num_replicas

        self.num_so_far = self.mpi_sum_scalars(self.num_so_far_indiv,
                                               num_replicas)

        # run the model once to force compilation. Don't actually use these
        # values.
        if first_run:
            first_run = False
            t0_comp = time.time()
            # print('input_dimension:', batch_xs.shape)
            # print('output_dimension:', batch_ys.shape)
            _, _ = self.train_on_batch_and_get_deltas(
                batch_xs, batch_ys, verbose)
            self.comm.Barrier()
            sys.stdout.flush()
            # TODO(KGF): check line feed/carriage returns around this
            g.print_unique('\nCompilation finished in {:.2f}s'.format(
                time.time() - t0_comp))
            t_start = time.time()
            sys.stdout.flush()

        if np.any(batches_to_reset):
            reset_states(self.model, batches_to_reset)
        if ('noise' in self.conf['training'].keys()
                and self.conf['training']['noise'] is not False):
            batch_xs = self.add_noise(batch_xs)

        t0 = time.time()
        deltas, loss = self.train_on_batch_and_get_deltas(
            batch_xs, batch_ys, verbose)
        t1 = time.time()
        if not is_warmup_period:
            self.set_new_weights(deltas, num_replicas)
            t2 = time.time()
            write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)

            curr_loss = self.mpi_average_scalars(1.0 * loss, num_replicas)
            # g.print_unique(self.model.get_weights()[0][0][:4])
            loss_averager.add_val(curr_loss)
            ave_loss = loss_averager.get_ave()
            eta = self.estimate_remaining_time(
                t0 - t_start, self.num_so_far - self.epoch * num_total,
                num_total)
            write_str = (
                '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
                    self.task_index, step, eta, 1.0 * self.num_so_far,
                    num_total)
                + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
                + 'walltime: {:.4f} | '.format(
                    time.time() - self.start_time))
            g.write_unique(write_str + write_str_0)

            if timeline_prof:
                # dump profile
                tl = timeline.Timeline(self.run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                # dump file per iteration
                with open('timeline_%s.json' % step, 'w') as f:
                    f.write(ctf)

            step += 1
        else:
            g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
                self.task_index, self.num_so_far))

    effective_epochs = 1.0 * self.num_so_far / num_total
    epoch_previous = self.epoch
    self.epoch = effective_epochs
    g.write_unique(
        # TODO(KGF): "a total of X epochs within this session"?
        '\nFinished training epoch {:.2f} '.format(self.epoch)
        # TODO(KGF): "precisely/exactly X epochs just passed"?
        + 'during this session ({:.2f} epochs passed)'.format(
            self.epoch - epoch_previous)
        # '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
        #     1.0 * self.epoch, self.epoch - epoch_previous)
        + ' in {:.2f} seconds\n'.format(t2 - t_start))
    return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
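
# Hedged sketch (not the class's actual implementation) of the
# "average over the ensemble" step that the docstring above describes: each
# replica contributes its local weight deltas, the deltas are summed with an
# MPI Allreduce, and every replica applies the same averaged update. The
# function and its arguments are illustrative stand-ins.
def _example_average_deltas(comm, local_deltas, num_replicas):
    import numpy as np
    from mpi4py import MPI
    averaged = []
    for delta in local_deltas:
        total = np.zeros_like(delta)
        comm.Allreduce(delta, total, op=MPI.SUM)   # sum this delta across all replicas
        averaged.append(total / num_replicas)      # divide to get the ensemble average
    return averaged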
def train_epoch(self):
    '''
    Perform distributed mini-batch SGD for one epoch.

    It takes the batch iterator function and a NN model from the MPIModel
    object, fetches mini-batches in a while-loop until the number of samples
    seen by the ensemble of workers (num_so_far) exceeds the training dataset
    size (num_total).

    During each iteration, the gradient updates (deltas) and the loss are
    calculated for each model replica in the ensemble, weights are averaged
    over the ensemble, and the new weights are set.

    It performs calls to the MPIModel.get_deltas and
    MPIModel.set_new_weights methods.

    Argument list: Empty

    Returns:
      - step: final iteration number within this epoch
      - ave_loss: training loss averaged over iterations within this epoch
      - curr_loss: training loss averaged over replicas at the final iteration
      - num_so_far: the cumulative number of samples seen by the ensemble of
        replicas up to the end of this epoch
      - effective_epochs: cumulative (possibly fractional) number of epochs
        completed, i.e. num_so_far / num_total

    Intermediate outputs and logging: debug printout of task_index (MPI),
    epoch number, number of samples seen to a current epoch, average
    training loss
    '''
    verbose = False
    first_run = True
    step = 0
    loss_averager = Averager()
    t_start = time.time()

    batch_iterator_func = self.batch_iterator_func
    num_total = 1
    ave_loss = -1
    curr_loss = -1
    t0 = 0
    t1 = 0
    t2 = 0

    while ((self.num_so_far - self.epoch * num_total) < num_total
           or step < self.num_batches_minimum):
        try:
            (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
             num_total, is_warmup_period) = next(batch_iterator_func)
        except StopIteration:
            g.print_unique("Resetting batch iterator.")
            self.num_so_far_accum = self.num_so_far_indiv
            self.set_batch_iterator_func()
            batch_iterator_func = self.batch_iterator_func
            (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
             num_total, is_warmup_period) = next(batch_iterator_func)
        self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr

        # if batches_to_reset:
        #     self.model.reset_states(batches_to_reset)

        warmup_phase = (step < self.warmup_steps and self.epoch == 0)
        num_replicas = 1 if warmup_phase else self.num_replicas

        self.num_so_far = self.mpi_sum_scalars(self.num_so_far_indiv,
                                               num_replicas)

        # run the model once to force compilation. Don't actually use these
        # values.
        if first_run:
            first_run = False
            t0_comp = time.time()
            _, _ = self.train_on_batch_and_get_deltas(
                batch_xs, batch_ys, verbose)
            self.comm.Barrier()
            sys.stdout.flush()
            # TODO(KGF): check line feed/carriage returns around this
            g.print_unique('\nCompilation finished in {:.2f}s'.format(
                time.time() - t0_comp))
            t_start = time.time()
            sys.stdout.flush()

        if np.any(batches_to_reset):
            reset_states(self.model, batches_to_reset)

        t0 = time.time()
        deltas, loss = self.train_on_batch_and_get_deltas(
            batch_xs, batch_ys, verbose)
        t1 = time.time()
        if not is_warmup_period:
            self.set_new_weights(deltas, num_replicas)
            t2 = time.time()
            write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)

            curr_loss = self.mpi_average_scalars(1.0 * loss, num_replicas)
            # g.print_unique(self.model.get_weights()[0][0][:4])
            loss_averager.add_val(curr_loss)
            ave_loss = loss_averager.get_ave()
            eta = self.estimate_remaining_time(
                t0 - t_start, self.num_so_far - self.epoch * num_total,
                num_total)
            write_str = (
                '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
                    self.task_index, step, eta, 1.0 * self.num_so_far,
                    num_total)
                + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
                + 'walltime: {:.4f} | '.format(
                    time.time() - self.start_time))
            g.write_unique(write_str + write_str_0)
            step += 1
        else:
            g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
                self.task_index, self.num_so_far))

    effective_epochs = 1.0 * self.num_so_far / num_total
    epoch_previous = self.epoch
    self.epoch = effective_epochs
    g.write_unique(
        # TODO(KGF): "a total of X epochs within this session"?
        '\nFinished training epoch {:.2f} '.format(self.epoch)
        # TODO(KGF): "precisely/exactly X epochs just passed"?
        + 'during this session ({:.2f} epochs passed)'.format(
            self.epoch - epoch_previous)
        # '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
        #     1.0 * self.epoch, self.epoch - epoch_previous)
        + ' in {:.2f} seconds\n'.format(t2 - t_start))
    return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
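
# The ETA printed in the progress line above comes from
# self.estimate_remaining_time(), whose body is not shown in this excerpt.
# Below is a plausible linear-extrapolation sketch under that assumption;
# the helper name and arguments are hypothetical, not the class's method.
def _example_estimate_remaining_time(time_so_far, examples_so_far, num_total):
    eps = 1e-6  # guard against division by zero early in the epoch
    rate = examples_so_far / max(time_so_far, eps)          # examples per second so far
    return (num_total - examples_so_far) / max(rate, eps)   # seconds left at that rate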