def __init__(self, loss, metrics, batch_size):
    # Global batch size across all ranks vs. the per-rank batch size.
    self.batch_size = batch_size * mc.get_nranks()
    self.local_batch_size = batch_size
    # Tensors fetched each step: training loss and the accuracy metric.
    self.both = [loss, metrics['accuracy'][0]]
    self.samps = 0
    self.perf = 0
    self.step = 0
    self.loss = np.asarray([0.], dtype='float32')
    self.acc = np.asarray([0.], dtype='float32')
    self.start_time = None

def __init__(self, loss, metrics, log_freq, batch_size, lr, epoch):
    self.log_freq = log_freq
    self.lr = lr
    self.epoch_true = 0
    self.epoch = epoch
    # Global batch size across all ranks.
    self.batch_size = batch_size * mc.get_nranks()
    # Tensors fetched each step: loss, accuracy metric, learning rate, and epoch counter.
    self.both = [loss, metrics['accuracy'][0], lr, epoch]
    self.samps = 0.
    self.perf = 0.
    self.step = 0
    self.sums = 0
    self.start_time = None

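
# Illustrative helper (not part of the hook classes above; the function name and
# signature are hypothetical): how the global batch size and samples/sec figures that
# these logging hooks accumulate are typically derived in a data-parallel run.
def estimate_throughput(local_batch_size, num_ranks, steps, elapsed_seconds):
    """Return (global_batch_size, samples_per_second) for `steps` completed steps."""
    global_batch_size = local_batch_size * num_ranks
    samples_per_second = global_batch_size * steps / float(elapsed_seconds)
    return global_batch_size, samples_per_second
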
def learning_rate_fn(global_step):
    global_step = tf.cast(global_step, tf.float32)
    #cstep = global_step*num_images/eff_batch_size
    epoch = (d_steps + w_steps) / (global_step + 1)
    total_epochs = decay_epochs + warmup_epochs
    tf.Print(epoch, [epoch], "Epoch: ")
    #current_step = tf.Print(cstep, [cstep], "Current train steps so far: ")

    if (mlcomm == 1):
        current_lr = learning_rate_0 * math.pow(
            1.0 - (decay_steps - warmup_steps) / decay_steps, 2)
        if (mc.get_rank() == 0):
            print("Using Cray learning_rate_warmup_poly_decay(): ")
            print(" -> effective batch size: ", eff_batch_size)
            print(" -> batches per epoch: ", batches_per_epoch)
            print(" -> initial learning rate: ", learning_rate_0)
            print(" -> learning rate base: ", learning_rate_base)
            print(" -> starting global learning rate at first epoch: ", current_lr)
            print(" -> decay after ", decay_epochs, " epochs")
            print(" -> decay steps: ", decay_steps)
            print(" -> warmup with ", warmup_epochs, " epochs")
            print(" -> warmup steps: ", warmup_steps)
            print(" -> number workers: ", mc.get_nranks())
            #print(" -> Finished Epoch: ", tf.get_session_tensor(epoch), "/", total_epochs)

    #global_step = tf.cast(global_step, tf.float32)
    def lr_warmup():
        return (lr_0 + global_step * (lr_base - lr_0) / w_steps)

    def lr_poly():
        return (lr_base * math_ops.pow(
            (1 - (global_step - w_steps) / d_steps), 2))

    return tf.cond(tf.less(global_step, warmup_steps),
                   lambda: lr_warmup(), lambda: lr_poly())

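
# Illustrative sketch of the schedule learning_rate_fn() implements, written in plain
# Python so the math can be checked without building a TF graph. The arguments mirror
# the closure variables used above (lr_0, lr_base, w_steps, d_steps); the function
# name is hypothetical and not used elsewhere.
def warmup_poly_decay(step, lr_0, lr_base, warmup_steps, decay_steps):
    """Linear warmup from lr_0 to lr_base, then quadratic polynomial decay toward 0."""
    if step < warmup_steps:
        return lr_0 + step * (lr_base - lr_0) / float(warmup_steps)
    return lr_base * (1.0 - (step - warmup_steps) / float(decay_steps)) ** 2
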
def resnet_main(flags, model_function, input_function, num_train_samps,
                num_eval_samps, shape=None):
    """Shared main loop for ResNet Models.

    Args:
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
        ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with all
        the relevant flags for running and passed to estimator.
      num_train_samps: number of samples in the training set.
      num_eval_samps: number of samples in the evaluation set.
      shape: list of ints representing the shape of the images used for
        training. This is only used if flags.export_dir is passed.
    """
    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    myrank = 0
    numworkers = 1
    if (flags.enable_ml_comm == 1):
        # Initialize the Cray PE ML Plugin and configure the thread team
        # (correcting the number of epochs for the effective batch size).
        #totsize = sum([reduce(lambda x, y: x*y, v.get_shape().as_list()) for v in tf.trainable_variables()])
        totsize = 25551401  # parameter count specific to ResNet50-v2
        mc.init(2, 1, totsize, "tensorflow")
        myrank = mc.get_rank()
        numworkers = mc.get_nranks()
        if (myrank == 0):
            print("ResNet with {:9d} parameters".format(totsize))

        max_steps_train = int(
            math.ceil(flags.train_epochs * (num_train_samps + num_eval_samps) /
                      (mc.get_nranks() * flags.batch_size)))
        # config_team(0, 0, num_steps_before_going_nonblock, max_steps_train, verbose=1, how_often_to_print=100)
        mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

        # Only rank 0 writes checkpoints, benchmark logs, and exported models.
        flags.model_dir = flags.model_dir if mc.get_rank() == 0 else None
        flags.benchmark_log_dir = (flags.benchmark_log_dir
                                   if mc.get_rank() == 0 else None)
        flags.export_dir = flags.export_dir if mc.get_rank() == 0 else None
    else:
        rank_id = myrank

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=500, session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'train_epochs': flags.train_epochs,
            'version': flags.version,
            'loss_scale': flags.loss_scale,
            'dtype': flags.dtype,
            'mlcomm': flags.enable_ml_comm,
            'log_freq': flags.global_perf_log_freq,
            'weight_decay': flags.weight_decay,
            'init_lr': flags.init_lr,
            'base_lr': flags.base_lr,
            'warmup_epochs': flags.warmup_epochs,
        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        if (myrank == 0):
            print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        # Per-worker training steps for this evaluation cycle.
        tsteps = int(math.ceil(
            float(flags.epochs_between_evals * num_train_samps) /
            (numworkers * flags.batch_size)))
        classifier.train(input_fn=input_fn_train,
                         steps=tsteps,
                         max_steps=flags.max_train_steps)

        if (myrank == 0):
            print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 3,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        esteps = int(math.ceil(
            float(num_eval_samps) / (numworkers * flags.batch_size)))
        eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=esteps)
        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

    if (flags.enable_ml_comm == 1):
        mc.finalize()

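
# Illustrative sanity check (the helper name and example values are made up): the
# per-worker step counts passed to classifier.train() and classifier.evaluate() above
# follow the same ceil(samples * epochs / (workers * batch)) arithmetic.
def steps_per_cycle(num_samples, epochs_per_cycle, num_workers, batch_size):
    return int(math.ceil(float(epochs_per_cycle * num_samples) /
                         (num_workers * batch_size)))

# e.g. 1,281,167 training samples, 4 epochs per cycle, 64 workers, batch size 128:
# steps_per_cycle(1281167, 4, 64, 128) -> 626
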
def size():
    """Total number of workers (ranks) in the job."""
    return mc.get_nranks()

def multiplier(epoch):
    # Adjust epoch to produce round numbers at the end of each epoch, so that
    # TensorBoard learning rate graphs look better.
    epoch += 1. / self.steps_per_epoch
    return 1. / mc.get_nranks() * (
        epoch * (mc.get_nranks() - 1) / warmup_epochs + 1)

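
# Illustrative check (not used by the training code): the multiplier above ramps the
# learning rate linearly from 1/N of its scaled value at epoch 0 up to the full rate
# once `warmup_epochs` epochs have passed, the usual gradual-warmup recipe for
# large-batch data parallelism. num_ranks and warmup_epochs below stand in for
# mc.get_nranks() and the closure variable used above.
def warmup_multiplier(epoch, num_ranks, warmup_epochs):
    return 1. / num_ranks * (epoch * (num_ranks - 1) / float(warmup_epochs) + 1)

# warmup_multiplier(0, 8, 5) -> 0.125 (= 1/8)
# warmup_multiplier(5, 8, 5) -> 1.0   (full learning rate)
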
def train(self):
    train_step, loss, lossL1Train, train_true, train_predict = self.optimize()
    lossL1Val, val_true, val_predict = self.validation_loss()
    lossL1Test, test_true, test_predict = self.test_loss()

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    # Thread settings taken from the MKL benchmarks.
    config.allow_soft_placement = True
    config.intra_op_parallelism_threads = 1  # default
    config.inter_op_parallelism_threads = 2  # default

    # Used to save the model.
    saver = tf.train.Saver()

    global best_validation_accuracy
    global last_improvement
    global total_iterations
    best_validation_accuracy = 1.0  # Best (lowest) validation loss seen so far.
    last_improvement = 0            # Iteration number of the last improvement to the validation loss.
    require_improvement = hp.RUNPARAM['require_improvement']  # Stop optimization if no improvement is found in this many iterations.
    total_iterations = 0            # Counter for the total number of iterations performed so far.

    # Initialize the CPE ML Plugin with one team (single thread for now) and the model size.
    totsize = sum([reduce(lambda x, y: x * y, v.get_shape().as_list())
                   for v in tf.trainable_variables()])
    mc.init(1, 1, totsize, "tensorflow")

    # Integer division keeps the per-rank step counts usable with range().
    hp.RUNPARAM['batch_per_epoch'] = hp.RUNPARAM['batch_per_epoch'] // mc.get_nranks()
    hp.RUNPARAM['batch_per_epoch_val'] = hp.RUNPARAM['batch_per_epoch_val'] // mc.get_nranks()
    totsteps = hp.RUNPARAM['num_epoch'] * hp.RUNPARAM['batch_per_epoch']
    mc.config_team(0, 0, totsteps, totsteps, 2, 50)

    if (mc.get_rank() == 0):
        print("+------------------------------+")
        print("|          CosmoFlow           |")
        print("| # Ranks      = {:5d}         |".format(mc.get_nranks()))
        print("| Global Batch = {:6d}        |".format(mc.get_nranks() * hp.Input['BATCH_SIZE']))
        print("| # Parameters = {:9d}     |".format(totsize))
        print("+------------------------------+")

    # Use the CPE ML Plugin to broadcast initial model parameter values from rank 0.
    new_vars = mc.broadcast(tf.trainable_variables(), 0)
    bcast = tf.group(*[tf.assign(v, new_vars[k])
                       for k, v in enumerate(tf.trainable_variables())])

    if self.is_train:
        with tf.Session(config=config) as sess:
            losses_train = []
            losses_val = []
            losses = []
            val_accuracys = []
            data_accuracys = []

            # Do all parameter initializations.
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(bcast)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            elapsed_time = 0.
            for epoch in range(hp.RUNPARAM['num_epoch']):
                save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
                total_iterations += 1
                start_time = time.time()
                loss_per_epoch_val = 0
                loss_per_epoch_train = 0

                for i in range(hp.RUNPARAM['batch_per_epoch']):
                    step_start_time = time.time()
                    _, lossTrain, lossL1Train_, train_true_, train_predict_ = sess.run(
                        [train_step, loss, lossL1Train, train_true, train_predict])
                    step_finish_time = time.time()
                    elapsed_time += (step_finish_time - step_start_time)
                    # Cumulative global throughput: all samples processed so far across
                    # all ranks, divided by the accumulated step time on this rank.
                    samps_per_sec = mc.get_nranks() * (
                        epoch * hp.RUNPARAM['batch_per_epoch'] * hp.Input['BATCH_SIZE'] +
                        (i + 1) * hp.Input['BATCH_SIZE']) / elapsed_time
                    if (mc.get_rank() == 0):
                        print("Train Step: " + str(i) + ", Samples/Sec = " +
                              str(samps_per_sec) + ", Loss = " + str(lossTrain))
                    loss_per_epoch_train += lossL1Train_

                # Average the per-epoch training loss across ranks.
                global_loss = np.array([loss_per_epoch_train], dtype=np.float32)
                mc.average(global_loss)
                loss_per_epoch_train = global_loss / hp.RUNPARAM['batch_per_epoch']
                losses.append(loss_per_epoch_train)
                losses_train.append(loss_per_epoch_train)

                for i in range(hp.RUNPARAM['batch_per_epoch_val']):
                    if (mc.get_rank() == 0):
                        print("Val Step = " + str(i))
                    loss_, val_true_, val_predict_ = sess.run(
                        [lossL1Val, val_true, val_predict])
                    loss_per_epoch_val += loss_

                # Average the per-epoch validation loss across ranks.
                global_loss = np.array([loss_per_epoch_val], dtype=np.float32)
                mc.average(global_loss)
                loss_per_epoch_val = global_loss / hp.RUNPARAM['batch_per_epoch_val']
                losses_val.append(loss_per_epoch_val)

                if (loss_per_epoch_val < best_validation_accuracy):
                    best_validation_accuracy = loss_per_epoch_val
                    last_improvement = total_iterations
                    if (mc.get_rank() == 0):
                        saver.save(sess=sess, save_path=save_path)

                if (mc.get_rank() == 0):
                    print("Epoch {} took {:.3f}s".format(epoch, time.time() - start_time))
                    print("  training loss: %.3f" % loss_per_epoch_train)
                    print("  validation loss: %.3f" % loss_per_epoch_val)
                    print("  best loss: %.3f" % best_validation_accuracy)
                    np.savetxt(os.path.join(hp.Path['train_result'], 'loss_train.txt'), losses_train)
                    np.savetxt(os.path.join(hp.Path['val_result'], 'loss_val.txt'), losses_val)
                    np.savetxt(os.path.join(hp.Path['train_result'], 'losses.txt'), losses)
                    #np.savetxt(os.path.join(hp.Path['train_result'],'train_pred'+str(epoch)+'.txt'),np.c_[train_true_,train_predict_])
                    #np.savetxt(os.path.join(hp.Path['val_result'],'val_pred'+str(epoch)+'.txt'),np.c_[val_true_,val_predict_])

                if (total_iterations - last_improvement > require_improvement):
                    if (mc.get_rank() == 0):
                        print("No improvement found in a while, stopping optimization.")
                    break

            coord.request_stop()
            coord.join(threads)

    if self.is_test and mc.get_rank() == 0:
        save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
        if self.save_path is not None:
            save_path = self.save_path
        with tf.Session() as sess:
            saver.restore(sess=sess, save_path=save_path)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            loss_test = []
            for i in range(0, hp.RUNPARAM['iter_test']):
                start_time = time.time()
                lossL1Test_, test_true_, test_predict_ = sess.run(
                    [lossL1Test, test_true, test_predict])
                loss_test.append(lossL1Test_)
                print("Box {} took {:.3f}s".format(i, time.time() - start_time))
                print("  test loss: %.3f" % lossL1Test_)
                np.savetxt(os.path.join(hp.Path['test_result'], 'test_batch_' + str(i) + '.txt'),
                           np.c_[test_true_, test_predict_])
            np.savetxt(os.path.join(hp.Path['test_result'], 'loss_test.txt'), loss_test)
            coord.request_stop()
            coord.join(threads)

    # Clean up the CPE ML Plugin.
    mc.finalize()
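
# Illustrative, self-contained sketch (the helper name and example values are made up):
# what the mc.average() calls in train() compute for the per-epoch losses. Each rank
# contributes its local loss sum, the plugin averages that value across ranks, and the
# result is then normalized by the per-rank number of batches.
def averaged_epoch_loss(local_loss_sums, batches_per_epoch):
    """Emulate averaging one scalar per rank, then normalizing per step."""
    global_loss = np.mean(np.asarray(local_loss_sums, dtype=np.float32))
    return global_loss / batches_per_epoch

# e.g. four ranks: averaged_epoch_loss([12.0, 11.5, 12.3, 11.8], 50) -> ~0.238
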