Example #1
def __init__(self, loss, metrics, batch_size):
    # global batch size is the per-rank batch size times the number of ranks
    self.batch_size = batch_size * mc.get_nranks()
    self.local_batch_size = batch_size
    # tensors to fetch each step: loss and the accuracy metric
    self.both = [loss, metrics['accuracy'][0]]
    self.samps = 0
    self.perf = 0
    self.step = 0
    self.loss = np.asarray([0.], dtype='float32')
    self.acc = np.asarray([0.], dtype='float32')
    self.start_time = None
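
These logging-hook constructors derive the global batch size by multiplying the per-rank batch size by mc.get_nranks(). A minimal sketch of that relationship, assuming mc is the initialized Cray PE ML Plugin module (as in the later examples) and using made-up numbers:

import time

local_batch_size = 32                                   # hypothetical per-rank batch size
global_batch_size = local_batch_size * mc.get_nranks()  # samples processed per step across all ranks

start = time.time()
# ... run one training step here ...
samples_per_sec = global_batch_size / (time.time() - start)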
Example #2
def __init__(self, loss, metrics, log_freq, batch_size, lr, epoch):
    self.log_freq = log_freq
    self.lr = lr
    self.epoch_true = 0
    self.epoch = epoch
    # global batch size across all ranks
    self.batch_size = batch_size * mc.get_nranks()
    # tensors to fetch each step: loss, accuracy, learning rate, and epoch
    self.both = [loss, metrics['accuracy'][0], lr, epoch]
    self.samps = 0.
    self.perf = 0.
    self.step = 0
    self.sums = 0
    self.start_time = None
Example #3
    def learning_rate_fn(global_step):

        global_step = tf.cast(global_step, tf.float32)
        #cstep = global_step*num_images/eff_batch_size
        # current epoch from the number of steps completed so far
        # (assumes batches_per_epoch is defined in the enclosing scope)
        epoch = (global_step + 1) / batches_per_epoch

        total_epochs = decay_epochs + warmup_epochs

        # tf.Print only fires when its returned tensor is evaluated
        epoch = tf.Print(epoch, [epoch], "Epoch: ")
        #current_step = tf.Print(cstep, [cstep], "Current train steps so far: ")
        if mlcomm == 1:
            current_lr = learning_rate_0 * math.pow(
                1.0 - (decay_steps - warmup_steps) / decay_steps, 2)
            if mc.get_rank() == 0:
                print("Using Cray learning_rate_warmup_poly_decay(): ")
                print(" -> effective batch size: ", eff_batch_size)
                print(" -> batches per epoch: ", batches_per_epoch)

                print(" -> initial learning rate: ", learning_rate_0)
                print(" -> learning rate base: ", learning_rate_base)
                print(" -> starting global learning rate at first epoch: ",
                      current_lr)
                print(" -> decay after ", decay_epochs, " epochs")
                print("     -> decay steps: ", decay_steps)
                print(" -> warmup with ", warmup_epochs, " epochs")
                print("     -> warmup steps: ", warmup_steps)
                print(" -> number workers: ", mc.get_nranks())
                #print(" -> Finished Epoch: ", tf.get_session_tensor(epoch), "/", total_epochs)

        #global_step = tf.cast(global_step, tf.float32)
        # linear warmup from lr_0 to lr_base over the warmup steps
        def lr_warmup():
            return lr_0 + global_step * (lr_base - lr_0) / w_steps

        # quadratic polynomial decay from lr_base after warmup
        def lr_poly():
            return lr_base * math_ops.pow(
                1 - (global_step - w_steps) / d_steps, 2)

        return tf.cond(tf.less(global_step, warmup_steps), lr_warmup, lr_poly)
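
The returned schedule warms the learning rate up linearly and then applies a quadratic polynomial decay. Below is a pure-Python sketch (not part of the original code) of the same shape, with made-up values standing in for the closure variables lr_0, lr_base, w_steps, and d_steps:

lr_0, lr_base = 0.1, 0.8       # hypothetical initial and peak learning rates
w_steps, d_steps = 100., 900.  # hypothetical warmup and decay step counts

def schedule(step):
    if step < w_steps:
        # linear warmup from lr_0 to lr_base
        return lr_0 + step * (lr_base - lr_0) / w_steps
    # quadratic polynomial decay back toward zero
    return lr_base * (1.0 - (step - w_steps) / d_steps) ** 2

print([round(schedule(s), 3) for s in (0, 50, 100, 550, 1000)])
# -> [0.1, 0.45, 0.8, 0.2, 0.0]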
Example #4
def resnet_main(flags,
                model_function,
                input_function,
                num_train_samps,
                num_eval_samps,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.

    myrank = 0
    numworkers = 1
    if (flags.enable_ml_comm == 1):

        # initialize the Cray PE ML Plugin
        # config the thread team (correcting the number of epochs for the effective batch size)
        #totsize = sum([reduce(lambda x, y: x*y, v.get_shape().as_list()) for v in tf.trainable_variables()])

        totsize = 25551401  #Specific size for resnet50-v2
        mc.init(2, 1, totsize, "tensorflow")
        myrank = mc.get_rank()
        numworkers = mc.get_nranks()
        if (myrank == 0):
            print("ResNet with {:9d} parameters".format(totsize))

        max_steps_train = int(
            math.ceil(flags.train_epochs * (num_train_samps + num_eval_samps) /
                      (mc.get_nranks() * flags.batch_size)))
        #(0,0,num_steps_before_going_nonblock, max_steps_train, verbose=1, how_often_to_print=100)
        mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

        flags.model_dir = flags.model_dir if mc.get_rank() == 0 else None
        flags.benchmark_log_dir = (flags.benchmark_log_dir
                                   if mc.get_rank() == 0 else None)
        flags.export_dir = flags.export_dir if mc.get_rank() == 0 else None

    else:
        rank_id = myrank

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=500, session_config=session_config)

    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'multi_gpu': flags.multi_gpu,
                                            'train_epochs': flags.train_epochs,
                                            'version': flags.version,
                                            'loss_scale': flags.loss_scale,
                                            'dtype': flags.dtype,
                                            'mlcomm': flags.enable_ml_comm,
                                            'log_freq': flags.global_perf_log_freq,
                                            'weight_decay': flags.weight_decay,
                                            'init_lr': flags.init_lr,
                                            'base_lr': flags.base_lr,
                                            'warmup_epochs': flags.warmup_epochs,
                                        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)
        if (myrank == 0):
            print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        tsteps = math.ceil(
            float(flags.epochs_between_evals * num_train_samps) /
            (numworkers * flags.batch_size))
        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         steps=tsteps,
                         max_steps=flags.max_train_steps)

        if (myrank == 0):
            print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 3,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        # Evaluation input pipelines (especially synthetic data) can iterate
        # forever, so an explicit step count is passed. Note that eval runs for
        # esteps batches each loop, regardless of the global_step count.
        esteps = math.ceil(
            float(num_eval_samps) / (numworkers * flags.batch_size))
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=esteps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

    if (flags.enable_ml_comm == 1):
        mc.finalize()
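
As a rough worked example (not from the source) of the step arithmetic in resnet_main, with assumed sample counts and run configuration:

import math

num_train_samps, num_eval_samps = 1281167, 50000  # hypothetical ImageNet-like sizes
batch_size, nranks, train_epochs = 32, 64, 90     # hypothetical run configuration

max_steps_train = int(math.ceil(
    float(train_epochs) * (num_train_samps + num_eval_samps) / (nranks * batch_size)))
steps_per_eval = int(math.ceil(float(num_eval_samps) / (nranks * batch_size)))
print(max_steps_train, steps_per_eval)  # -> 58499 25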
Example #5
def size():
    # total number of ranks participating in training
    return mc.get_nranks()
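
A hypothetical usage sketch: exposing the rank count this way mirrors the Horovod-style convention, e.g. for linear learning-rate scaling (base_lr is an assumption, not from the source):

base_lr = 0.1                 # hypothetical single-worker learning rate
scaled_lr = base_lr * size()  # scale linearly with the number of ranks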
Example #6
def multiplier(epoch):
    # Adjust epoch to produce round numbers at the end of each epoch, so that
    # TensorBoard learning rate graphs look better.
    epoch += 1. / self.steps_per_epoch
    # scale linearly from 1/nranks up to 1 over the warmup epochs
    return 1. / mc.get_nranks() * (epoch * (mc.get_nranks() - 1) / warmup_epochs + 1)
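
A quick numeric check of the warmup multiplier with made-up values (nranks and warmup_epochs are assumptions): it ramps from roughly 1/nranks at epoch 0 to 1.0 once epoch reaches warmup_epochs.

nranks, warmup_epochs = 8, 5.0

def multiplier(epoch):
    return 1. / nranks * (epoch * (nranks - 1) / warmup_epochs + 1)

print([round(multiplier(e), 3) for e in (0, 1, 5)])  # -> [0.125, 0.3, 1.0]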
Example #7
    def train(self):
        train_step, loss, lossL1Train, train_true, train_predict = self.optimize()
        lossL1Val, val_true, val_predict = self.validation_loss()
        lossL1Test, test_true, test_predict = self.test_loss()

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4

        ### taking config from the MKL benchmarks.
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 1  ## default
        config.inter_op_parallelism_threads = 2  ## default

        # used to save the model
        saver = tf.train.Saver()
        global best_validation_accuracy
        global last_improvement
        global total_iterations
        best_validation_accuracy = 1.0  # best validation loss seen so far
        last_improvement = 0            # iteration number of the last improvement to the validation loss
        require_improvement = hp.RUNPARAM['require_improvement']  # stop optimization if no improvement in this many iterations
        total_iterations = 0            # counter for the total number of iterations performed so far

        # initialize the CPE ML Plugin with one team (single thread for now) and the model size
        totsize = sum([reduce(lambda x, y: x * y, v.get_shape().as_list())
                       for v in tf.trainable_variables()])
        mc.init(1, 1, totsize, "tensorflow")
        # each rank now only processes its share of the batches per epoch
        hp.RUNPARAM['batch_per_epoch'] = hp.RUNPARAM['batch_per_epoch'] // mc.get_nranks()
        hp.RUNPARAM['batch_per_epoch_val'] = hp.RUNPARAM['batch_per_epoch_val'] // mc.get_nranks()
        totsteps = hp.RUNPARAM['num_epoch'] * hp.RUNPARAM['batch_per_epoch']
        mc.config_team(0, 0, totsteps, totsteps, 2, 50)

        if mc.get_rank() == 0:
            print("+------------------------------+")
            print("| CosmoFlow                    |")
            print("| # Ranks = {:5d}              |".format(mc.get_nranks()))
            print("| Global Batch = {:6d}        |".format(mc.get_nranks() * hp.Input['BATCH_SIZE']))
            print("| # Parameters = {:9d}     |".format(totsize))
            print("+------------------------------+")

        # use the CPE ML Plugin to broadcast initial model parameter values from rank 0
        new_vars = mc.broadcast(tf.trainable_variables(), 0)
        bcast = tf.group(*[tf.assign(v, new_vars[k]) for k, v in enumerate(tf.trainable_variables())])

        if self.is_train:
            with tf.Session(config=config) as sess:
                losses_train = []
                losses_val = []
                losses = []
                val_accuracys = []
                data_accuracys = []

                # do all parameter initializations
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                sess.run(bcast)

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)

                elapsed_time = 0.
                for epoch in range(hp.RUNPARAM['num_epoch']):
                    save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
                    total_iterations += 1
                    start_time = time.time()
                    loss_per_epoch_val = 0
                    loss_per_epoch_train = 0
                    for i in range(hp.RUNPARAM['batch_per_epoch']):
                        step_start_time = time.time()
                        _, lossTrain, lossL1Train_, train_true_, train_predict_ = sess.run(
                            [train_step, loss, lossL1Train, train_true, train_predict])
                        step_finish_time = time.time()

                        elapsed_time += (step_finish_time - step_start_time)
                        samps_per_sec = mc.get_nranks() * (epoch * hp.RUNPARAM['batch_per_epoch'] * hp.Input['BATCH_SIZE'] + (i + 1) * hp.Input['BATCH_SIZE']) / elapsed_time
                        if mc.get_rank() == 0:
                            print("Train Step: " + str(i) + ", Samples/Sec = " + str(samps_per_sec) + ", Loss = " + str(lossTrain))

                        loss_per_epoch_train += lossL1Train_

                    # average the per-rank training loss across all workers
                    global_loss = np.array([loss_per_epoch_train], dtype=np.float32)
                    mc.average(global_loss)
                    loss_per_epoch_train = global_loss / hp.RUNPARAM['batch_per_epoch']
                    losses.append(loss_per_epoch_train)
                    losses_train.append(loss_per_epoch_train)

                    for i in range(hp.RUNPARAM['batch_per_epoch_val']):
                        if mc.get_rank() == 0:
                            print("Val Step = " + str(i))
                        loss_, val_true_, val_predict_ = sess.run([lossL1Val, val_true, val_predict])
                        loss_per_epoch_val += loss_

                    # average the per-rank validation loss across all workers
                    global_loss = np.array([loss_per_epoch_val], dtype=np.float32)
                    mc.average(global_loss)
                    loss_per_epoch_val = global_loss / hp.RUNPARAM['batch_per_epoch_val']
                    losses_val.append(loss_per_epoch_val)

                    if loss_per_epoch_val < best_validation_accuracy:
                        best_validation_accuracy = loss_per_epoch_val
                        last_improvement = total_iterations
                        if mc.get_rank() == 0:
                            saver.save(sess=sess, save_path=save_path)

                    if mc.get_rank() == 0:
                        print("Epoch {} took {:.3f}s".format(epoch, time.time() - start_time))
                        print("  training loss: %.3f" % loss_per_epoch_train)
                        print("  validation loss: %.3f" % loss_per_epoch_val)
                        print("  best loss: %.3f" % best_validation_accuracy)
                        np.savetxt(os.path.join(hp.Path['train_result'], 'loss_train.txt'), losses_train)
                        np.savetxt(os.path.join(hp.Path['val_result'], 'loss_val.txt'), losses_val)
                        np.savetxt(os.path.join(hp.Path['train_result'], 'losses.txt'), losses)
                        #np.savetxt(os.path.join(hp.Path['train_result'],'train_pred'+str(epoch)+'.txt'),np.c_[train_true_,train_predict_])
                        #np.savetxt(os.path.join(hp.Path['val_result'],'val_pred'+str(epoch)+'.txt'),np.c_[val_true_,val_predict_])
                    if total_iterations - last_improvement > require_improvement:
                        if mc.get_rank() == 0:
                            print("No improvement found in a while, stopping optimization.")
                        break

                coord.request_stop()
                coord.join(threads)

        if self.is_test and mc.get_rank() == 0:

            save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
            if self.save_path is not None:
                save_path = self.save_path

            with tf.Session() as sess:
                saver.restore(sess=sess, save_path=save_path)
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)
                loss_test = []
                for i in range(0, hp.RUNPARAM['iter_test']):
                    start_time = time.time()
                    lossL1Test_, test_true_, test_predict_ = sess.run([lossL1Test, test_true, test_predict])
                    loss_test.append(lossL1Test_)
                    print("Box {} took {:.3f}s".format(i, time.time() - start_time))
                    print("  test loss: %.3f" % lossL1Test_)
                    np.savetxt(os.path.join(hp.Path['test_result'], 'test_batch_' + str(i) + '.txt'), np.c_[test_true_, test_predict_])
                np.savetxt(os.path.join(hp.Path['test_result'], 'loss_test.txt'), loss_test)
                coord.request_stop()
                coord.join(threads)

        # cleanup the CPE ML Plugin
        mc.finalize()
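
Taken together, the examples above follow one CPE ML Plugin lifecycle: init, config_team, broadcast of the initial weights, averaging of metrics, and finalize. A condensed sketch of that sequence, with placeholder step counts and an assumed import name for the plugin module:

from functools import reduce   # needed on Python 3 for the parameter count below
import numpy as np
import tensorflow as tf
import ml_comm as mc           # assumed import name for the Cray PE ML Plugin

totsize = sum([reduce(lambda x, y: x * y, v.get_shape().as_list())
               for v in tf.trainable_variables()])       # total number of model parameters
mc.init(1, 1, totsize, "tensorflow")                      # one team, single thread
total_steps = 1000                                        # hypothetical step budget
mc.config_team(0, 0, total_steps, total_steps, 2, 50)

# broadcast rank 0's initial weights so every worker starts identically
new_vars = mc.broadcast(tf.trainable_variables(), 0)
bcast = tf.group(*[tf.assign(v, new_vars[k])
                   for k, v in enumerate(tf.trainable_variables())])

# ... training loop: per-epoch metrics can be averaged across ranks in place ...
local_loss = np.array([0.0], dtype=np.float32)
mc.average(local_loss)

mc.finalize()                                             # shut the plugin down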