def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same rank, different dimension
        tf.set_random_seed(1234)
        dims = [17 + rank] * 3
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))

        # Same number of elements, different rank
        tf.set_random_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def test_horovod_allreduce_cpu(self):
    """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    with self.test_session() as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce produces incorrect results")
def test_horovod_allreduce_grad(self):
    """Test the correctness of the allreduce gradient."""
    hvd.init()
    size = hvd.size()
    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [5] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)

            grad_ys = tf.ones([5] * dim)
            grad = tf.gradients(summed, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            expected = np.ones([5] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try to
    perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
    one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
    gpu_config = tf.ConfigProto(gpu_options=one_gpu)
    with self.test_session(config=gpu_config) as session:
        with tf.device(device):
            # Same rank, different dimension
            dims = [17] * 3
            tensor = tf.ones(dims, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
def allreduce(value, name=None, average=True):
    """
    Perform an allreduce on a tensor-compatible value.

    Arguments:
        value: A tensor-compatible value to reduce.
               The shape of the input must be identical across all ranks.
        name: Optional name for the constants created by this operation.
        average: If True, computes the average over all ranks.
                 Otherwise, computes the sum over all ranks.
    """
    allreduce_op = hvd.allreduce(tf.constant(value, name=name),
                                 average=average)
    return K.get_session().run(allreduce_op)
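# A hedged usage sketch for the wrapper above (not part of the original
# snippet): it assumes hvd.init() has already been called, that a Keras
# session exists, and that every rank passes a value of the same shape.
# The metric name 'val_loss_worker' is illustrative only.
import horovod.tensorflow as hvd

hvd.init()

# Each worker computes some local scalar metric ...
local_metric = 0.25 * (hvd.rank() + 1)   # placeholder per-rank value

# ... and the wrapper averages it across all ranks. It returns a concrete
# (numpy) value because it runs the allreduce op in the Keras session.
global_metric = allreduce(local_metric, name='val_loss_worker', average=True)
print('rank %d sees averaged metric %s' % (hvd.rank(), global_metric))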
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu * 2
        self.predictors = [self._build_coco_predictor(k % num_gpu)
                           for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(shard=k,
                                            num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine.
        # Alternatively, can eval on all ranks and use allgather,
        # but allgather sometimes hangs.
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.local_rank(),
                                              num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    two_gpus = tf.GPUOptions(visible_device_list=(
        '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
    gpu_config = tf.ConfigProto(gpu_options=two_gpus)
    with self.test_session(config=gpu_config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce on GPU produces incorrect results")
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same rank, different dimension
        dims = [17] * 3
        tensor = tf.ones(dims,
                         dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def test_horovod_allreduce_gpu_fused(self):
    """Test that the allreduce works on GPUs with Tensor Fusion.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            test = max_difference <= threshold
            tests.append(test)
        self.assertTrue(session.run(tf.reduce_all(tests)),
                        "hvd.allreduce produces incorrect results")
def get_apply_grads_op(self, loss, var_list): """ :param tf.Tensor loss: :param list[tf.Variable] var_list: :return: op with all variable updates combined, using the optimizer :rtype: tf.Operation """ # The following code is basically extended self.optimizer.minimize(), to optionally modify gradients. from Util import make_hashable if not var_list: return tf.no_op(name="no_grad_vars_no_op") grads_and_vars = self._compute_gradients(loss, var_list=var_list) if self.config.is_true("use_horovod") and self.config.value("horovod_reduce_type", "") == "grad": # noinspection PyPackageRequirements,PyUnresolvedReferences import horovod.tensorflow as hvd grads_and_vars = [ (hvd.allreduce(grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var) for (grad, var) in grads_and_vars] var_grads = {var: grad for (grad, var) in grads_and_vars if grad is not None} if not var_grads: raise Exception("no single variable to train") global_info = self._GetGlobalInfo(optimizer=self, all_vars=var_list, var_grads=var_grads) if self.config.bool_or_other("debug_grad_summaries", False): tf.summary.scalar("global_grad_norm", global_info.get_global_grad_norm()) grads_per_apply_grad_opts = {} # dict apply_grad_opts -> list of (grad, var) for grad, var in grads_and_vars: assert var in var_list if grad is None: continue new_grad, apply_grad_opts = self._post_process_grad(grad=grad, var=var, global_info=global_info) grads_per_apply_grad_opts.setdefault(make_hashable(apply_grad_opts), []).append((new_grad, var)) all_apply_grads = [] assert grads_per_apply_grad_opts for apply_grad_opts, grads_and_vars_per_opts in grads_per_apply_grad_opts.items(): all_apply_grads.append(self._apply_gradients(grads_and_vars_per_opts, **apply_grad_opts)) if len(all_apply_grads) == 1: return all_apply_grads[0] return tf.group(*all_apply_grads)
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.

    See Optimizer.compute_gradients() for more info.

    In DistributedOptimizer, compute_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)
    from horovod.common import size
    from horovod.tensorflow import allreduce
    if size() > 1:
        averaged_gradients = []
        with tf.name_scope(self._name + "_Allreduce"):
            for grad, var in gradients:
                if grad is not None:
                    avg_grad = allreduce(grad,
                                         device_dense=self._device_dense,
                                         device_sparse=self._device_sparse)
                    averaged_gradients.append((avg_grad, var))
                else:
                    averaged_gradients.append((None, var))
        return averaged_gradients
    else:
        return gradients
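# A minimal sketch (not from the original source) of how a wrapper like the
# one above is normally consumed: the stock Horovod TF1 pattern wraps a
# regular optimizer in hvd.DistributedOptimizer, scales the learning rate by
# hvd.size(), and broadcasts the initial variables from rank 0. The loss and
# base learning rate below are placeholders.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

x = tf.random_uniform([32, 10])
w = tf.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

# Scale the (assumed) base learning rate of 0.01 by the number of workers.
opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)   # allreduces gradients as shown above
train_op = opt.minimize(loss)

hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # keep ranks in sync at start
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    for _ in range(10):
        sess.run(train_op)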
def get_gradients(self, loss, params):
    """
    Compute gradients of all trainable variables.

    See Optimizer.get_gradients() for more info.

    In DistributedOptimizer, get_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    gradients = super(self.__class__, self).get_gradients(loss, params)
    if hvd.size() > 1:
        averaged_gradients = []
        with tf.name_scope(self._name + "_Allreduce"):
            for grad in gradients:
                if grad is not None:
                    avg_grad = hvd.allreduce(grad,
                                             device_dense=self._device_dense,
                                             device_sparse=self._device_sparse)
                    averaged_gradients.append(avg_grad)
                else:
                    averaged_gradients.append(None)
        return averaged_gradients
    else:
        return gradients
def allreduce(model, opt, gradient_accumulator, loss, mlm_loss, mlm_acc,
              sop_loss, sop_acc):
    grads = gradient_accumulator.gradients
    # Converting IndexedSlices to dense tensors (equivalent to
    # sparse_as_dense=True) gives a mild 2% speedup, from 0.62 it/s to
    # 0.63 it/s, on BERT-large multinode.
    grads = [
        tf.convert_to_tensor(grad)
        if grad is not None and isinstance(grad, tf.IndexedSlices)
        else grad
        for grad in grads
    ]
    # TODO: Does placing this clip before or after allreduce affect accuracy?
    # Placing it before has a regularization effect: no single example can
    # contribute as much. Placing it before also gives a 20% speedup when
    # training BERT-large, probably because the gradient operations can be
    # fused by XLA.
    (grads, grad_norm) = tf.clip_by_global_norm(grads,
                                                clip_norm=args.max_grad_norm)
    grads = [
        hvd.allreduce(grad, compression=hvd.Compression.fp16)
        if grad is not None else None
        for grad in grads
    ]
    opt.apply_gradients([
        (grad, var)
        for (grad, var) in zip(grads, model.trainable_variables)
        if grad is not None
    ])
    # Clear the gradient accumulator
    gradient_accumulator.reset()

    loss = hvd.allreduce(loss)
    mlm_loss = hvd.allreduce(mlm_loss)
    mlm_acc = hvd.allreduce(mlm_acc)
    sop_loss = hvd.allreduce(sop_loss)
    sop_acc = hvd.allreduce(sop_acc)

    return loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm
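# A standalone sketch (assumptions only, not the original training loop) of
# the two tricks used above: densifying IndexedSlices before the allreduce so
# Horovod uses allreduce rather than allgather, and compressing the wire
# format to fp16. All tensors here are synthetic placeholders.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

dense_grad = tf.random.uniform([1024, 64])
sparse_grad = tf.IndexedSlices(values=tf.random.uniform([4, 64]),
                               indices=tf.constant([1, 5, 9, 13]),
                               dense_shape=tf.constant([1024, 64]))
grads = [dense_grad, sparse_grad, None]

# Densify sparse gradients (same effect as sparse_as_dense=True).
grads = [tf.convert_to_tensor(g) if isinstance(g, tf.IndexedSlices) else g
         for g in grads]

# Allreduce (averaging) with fp16 compression on the wire.
reduced = [hvd.allreduce(g, compression=hvd.Compression.fp16)
           if g is not None else None
           for g in grads]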
def main(): hvd.init() gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') tf.config.threading.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads tf.config.threading.inter_op_parallelism_threads = max( 2, 40 // hvd.size() - 2) os.environ['TF_CUDNN_DETERMINISTIC'] = '1' cmdline = add_cli_args() FLAGS, unknown_args = cmdline.parse_known_args() if FLAGS.fine_tune: raise NotImplementedError('fine tuning functionality not available') if not FLAGS.xla_off: tf.config.optimizer.set_jit(True) if not FLAGS.fp32: tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": True}) preprocessing_type = 'resnet' if FLAGS.model == 'resnet50v1_b': model = resnet.ResNet50V1_b(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'resnet50v1_c': model = resnet.ResNet50V1_c(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'resnet50v1_d': model = resnet.ResNet50V1_d(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'resnet101v1_b': model = resnet.ResNet101V1_b(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'resnet101v1_c': model = resnet.ResNet101V1_c(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'resnet101v1_d': model = resnet.ResNet101V1_d(weights=None, weight_decay=FLAGS.l2_weight_decay, classes=FLAGS.num_classes) elif FLAGS.model == 'darknet53': model = darknet.Darknet(weight_decay=FLAGS.l2_weight_decay) elif FLAGS.model in ['hrnet_w18c', 'hrnet_w32c']: preprocessing_type = 'imagenet' model = hrnet.build_hrnet(FLAGS.model) model._set_inputs(tf.keras.Input(shape=(None, None, 3))) else: raise NotImplementedError('Model {} not implemented'.format( FLAGS.model)) model.summary() # scale learning rate linearly, base learning rate for batch size of 256 is specified through args BASE_LR = FLAGS.learning_rate learning_rate = (BASE_LR * hvd.size() * FLAGS.batch_size) / 256 steps_per_epoch = int( (FLAGS.train_dataset_size / (FLAGS.batch_size * hvd.size()))) # 5 epochs are for warmup if FLAGS.schedule == 'piecewise_short': scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay( boundaries=[ steps_per_epoch * 25, steps_per_epoch * 55, steps_per_epoch * 75, step_per_epoch * 100 ], values=[ learning_rate, learning_rate * 0.1, learning_rate * 0.01, learning_rate * 0.001, learning_rate * 0.0001 ]) elif FLAGS.schedule == 'piecewise_long': scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay( boundaries=[ steps_per_epoch * 55, steps_per_epoch * 115, steps_per_epoch * 175 ], values=[ learning_rate, learning_rate * 0.1, learning_rate * 0.01, learning_rate * 0.001 ]) elif FLAGS.schedule == 'cosine': scheduler = tf.keras.experimental.CosineDecayRestarts( initial_learning_rate=learning_rate, first_decay_steps=FLAGS.num_epochs * steps_per_epoch, t_mul=1, m_mul=1) else: print('No schedule specified') scheduler = WarmupScheduler(optimizer=scheduler, initial_learning_rate=learning_rate / hvd.size(), warmup_steps=steps_per_epoch * 5) #TODO support optimizers choice via config # opt = tf.keras.optimizers.SGD(learning_rate=scheduler, momentum=FLAGS.momentum, nesterov=True) # needs momentum correction term opt = MomentumOptimizer(learning_rate=scheduler, momentum=FLAGS.momentum, nesterov=True) if not 
FLAGS.fp32: opt = tf.train.experimental.enable_mixed_precision_graph_rewrite( opt, loss_scale=128.) loss_func = tf.keras.losses.CategoricalCrossentropy( from_logits=True, label_smoothing=FLAGS.label_smoothing, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE) if hvd.rank() == 0: if FLAGS.resume_from: model = tf.keras.models.load_model(FLAGS.resume_from) print('loaded model from', FLAGS.resume_from) model_dir = os.path.join( FLAGS.model + datetime.datetime.now().strftime("_%Y-%m-%d_%H-%M-%S")) path_logs = os.path.join(os.getcwd(), model_dir, 'log.csv') os.mkdir(model_dir) logging.basicConfig( filename=path_logs, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', datefmt='%H:%M:%S', level=logging.DEBUG) logging.info("Training Logs") logger = logging.getLogger('logger') logger.info('Training options: %s', FLAGS) # barrier hvd.allreduce(tf.constant(0)) start_time = time() curr_step = tf.Variable(initial_value=0, dtype=tf.int32) best_validation_accuracy = 0.7 # only save 0.7 or higher checkpoints data = create_dataset(FLAGS.train_data_dir, FLAGS.batch_size, preprocessing=preprocessing_type, validation=False) validation_data = create_dataset(FLAGS.validation_data_dir, FLAGS.batch_size, preprocessing=preprocessing_type, validation=True) for epoch in range(FLAGS.num_epochs): if hvd.rank() == 0: print('Starting training Epoch %d/%d' % (epoch, FLAGS.num_epochs)) training_score = 0 for batch, (images, labels) in enumerate(tqdm(data)): # momentum correction (V2 SGD absorbs LR into the update term) # prev_lr = opt._optimizer.learning_rate(curr_step-1) # curr_lr = opt._optimizer.learning_rate(curr_step) # momentum_correction_factor = curr_lr / prev_lr # opt._optimizer.momentum = opt._optimizer.momentum * momentum_correction_factor loss, score = train_step(model, opt, loss_func, images, labels, batch == 0 and epoch == 0, batch_size=FLAGS.batch_size, mixup_alpha=FLAGS.mixup_alpha, fp32=FLAGS.fp32) # # restore momentum # opt._optimizer.momentum = FLAGS.momentum training_score += score.numpy() curr_step.assign_add(1) training_accuracy = training_score / (FLAGS.batch_size * (batch + 1)) average_training_accuracy = hvd.allreduce( tf.constant(training_accuracy)) average_training_loss = hvd.allreduce(tf.constant(loss)) if hvd.rank() == 0: print('Starting validation Epoch %d/%d' % (epoch, FLAGS.num_epochs)) validation_score = 0 counter = 0 for images, labels in tqdm(validation_data): loss, score = validation_step(images, labels, model, loss_func) validation_score += score.numpy() counter += 1 validation_accuracy = validation_score / (FLAGS.batch_size * counter) average_validation_accuracy = hvd.allreduce( tf.constant(validation_accuracy)) average_validation_loss = hvd.allreduce(tf.constant(loss)) if hvd.rank() == 0: info_str = 'Epoch: %d, Train Accuracy: %f, Train Loss: %f, Validation Accuracy: %f, Validation Loss: %f LR:%f' % ( epoch, average_training_accuracy, average_training_loss, average_validation_accuracy, average_validation_loss, scheduler(curr_step)) print(info_str) logger.info(info_str) if average_validation_accuracy > best_validation_accuracy: logger.info("Found new best accuracy, saving checkpoint ...") best_validation_accuracy = average_validation_accuracy model.save('{}/{}'.format(FLAGS.model_dir, FLAGS.model)) if hvd.rank() == 0: logger.info('Total Training Time: %f' % (time() - start_time))
def _make_variable(self, metric, value):
    with tf.name_scope('MetricAverageCallback'):
        var = tf.Variable(value, name=metric)
        K.get_session().run(var.initializer)
        allreduce_op = hvd.allreduce(var, device_dense=self.device)
        return var, allreduce_op
def __init__(self, league_mgr_addr, model_pool_addrs, learner_ports, rm_size, batch_size, ob_space, ac_space, policy, gpu_id, policy_config={}, ent_coef=1e-2, distill_coef=1e-2, vf_coef=0.5, max_grad_norm=0.5, rwd_shape=False, pub_interval=500, log_interval=100, save_interval=0, total_timesteps=5e7, burn_in_timesteps=0, learner_id='', batch_worker_num=4, pull_worker_num=2, unroll_length=32, rollout_length=1, use_mixed_precision=False, use_sparse_as_dense=True, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-5, data_type=PGData, data_server_version='v1', decode=False, log_infos_interval=20, **kwargs): super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs, learner_ports, learner_id) self.LR = tf.placeholder(tf.float32, []) """Learning Rate""" self.CLIPRANGE = tf.placeholder(tf.float32, []) """Learning Rate Clip Range""" self.ep_loss_coef = {} """Coefficients for those losses from the endpoints. Override it in derived class.""" # TODO(pengsun): fix the policy_config default value self._init_const(total_timesteps, burn_in_timesteps, batch_size, unroll_length, rwd_shape, ent_coef, vf_coef, pub_interval, log_interval, save_interval, policy, distill_coef, policy_config, rollout_length) # allow_soft_placement=True can fix issue when some op cannot be defined on # GPUs for tf-1.8.0; tf-1.13.1 does not have this issue config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(gpu_id) self.sess = tf.Session(config=config) self.rank = hvd.rank() if has_hvd else 0 # Prepare dataset ds = data_type(ob_space, ac_space, self.n_v, use_lstm=self.rnn, hs_len=self.hs_len, distillation=self.distillation, version='v2') self._data_server = DataServer(self._pull_data, rm_size, unroll_length, batch_size, ds, gpu_id_list=(0, ), batch_worker_num=batch_worker_num, pull_worker_num=pull_worker_num, rollout_length=rollout_length, prefetch_buffer_size=2, version=data_server_version, decode=decode, log_infos_interval=log_infos_interval) # prepare net config net_config = policy.net_config_cls(ob_space, ac_space, **policy_config) net_config.clip_range = self.CLIPRANGE if rwd_shape: # make net_config.reward-shaping-weights a tf.placeholder so as to change # it during training. # NOTE: Assume there is reward_weights_shape in net_config # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking? reward_weights_shape = net_config.reward_weights_shape self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape) net_config.reward_weights = self.rwd_weights if hasattr(net_config, 'lam'): # make net_config.lambda-for-td-lambda a tf.placeholder so as to change it # during training. # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking? 
self.LAM = tf.placeholder(tf.float32, []) net_config.lam = self.LAM else: self.LAM = None # build the policy net with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope: pass def create_policy(inputs, nc): return policy.net_build_fun(inputs=inputs, nc=nc, scope=model_scope) device = '/gpu:{}'.format(0) with tf.device(device): input_data = self._data_server.input_datas[0] if 'use_xla' in policy_config and policy_config['use_xla']: try: # Use tensorflow's accerlated linear algebra compile method with tf.xla.experimental.jit_scope(True): model = create_policy(input_data, net_config) except: logger.log( "WARNING: using tf.xla requires tf version>=1.15.") model = create_policy(input_data, net_config) else: model = create_policy(input_data, net_config) loss, vf_loss, losses = self.build_loss(model, input_data) if has_hvd: self.losses = [hvd.allreduce(loss) for loss in losses] else: self.losses = list(losses) self.params = tf.trainable_variables(scope='model') self.params_vf = tf.trainable_variables(scope='model/vf') self.param_norm = tf.global_norm(self.params) self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR, beta1=adam_beta1, beta2=adam_beta2, epsilon=adam_eps) self.burn_in_trainer = tf.train.AdamOptimizer( learning_rate=self.LR, epsilon=1e-5) # same as default and IL if use_mixed_precision: try: self.trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( self.trainer) self.burn_in_trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( self.burn_in_trainer) except: logger.warn( "using tf mixed_precision requires tf version>=1.15.") if has_hvd: self.trainer = hvd.DistributedOptimizer( self.trainer, sparse_as_dense=use_sparse_as_dense) self.burn_in_trainer = hvd.DistributedOptimizer( self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense) grads_and_vars = self.trainer.compute_gradients(loss, self.params) grads_and_vars_vf = self.burn_in_trainer.compute_gradients( vf_loss, self.params_vf) clip_vars = model.vars.lstm_vars grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = self.clip_grads_vars( grads_and_vars, clip_vars, max_grad_norm) grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = self.clip_grads_vars( grads_and_vars_vf, clip_vars, max_grad_norm) self._train_batch = self.trainer.apply_gradients(grads_and_vars) self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf) self.loss_endpoints_names = model.loss.loss_endpoints.keys() self._build_ops() if has_hvd: barrier_op = hvd.allreduce(tf.Variable(0.)) broadcast_op = hvd.broadcast_global_variables(0) tf.global_variables_initializer().run(session=self.sess) self.sess.graph.finalize() self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None self.broadcast = lambda: self.sess.run(broadcast_op ) if has_hvd else None self.broadcast() # logging stuff format_strs = (['stdout', 'log', 'tensorboard', 'csv'] if self.rank == 0 else ['stdout', 'log', 'tensorboard', 'csv']) logger.configure(dir='training_log/{}rank{}'.format( self._learner_id, self.rank), format_strs=format_strs)
def run_barrier(config):
    barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
    print(tf.Session(config=config).run(barrier))
    print("============")
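# A hedged eager-mode counterpart (an assumption, not part of the original
# snippet): allreducing any small tensor acts as a barrier, because the
# collective cannot complete until every rank has reached it.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
_ = hvd.allreduce(tf.constant(0.0))   # returns only once all ranks arrive
print("rank %d passed the barrier" % hvd.rank())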
def common_minimize_trainable(self, base_opt, test_opt, name): from tensorflow.python.framework.errors_impl import NotFoundError # TODO(rhdong): Recover the testing, if the horovod import error is fixed on macOS+TF2.7+. try: import horovod.tensorflow as hvd except NotFoundError: self.skipTest( "Skip the test for horovod import error with Tensorflow-2.7.0 on MacOS-12." ) tf.config.set_soft_device_placement(True) hvd.init() base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True) for dtype, run_step, dim in itertools.product([dtypes.float32], [1], [10]): x = tf.random.uniform(shape=[32, dim]) y = tf.zeros([32, 1]) global_step = training_util.create_global_step() base_weight = tf.compat.v1.get_variable(name="base_weights", initializer=tf.ones( [10, 1])) base_logits = tf.nn.relu(math_ops.matmul(x, base_weight)) base_loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=y, logits=base_logits) base_opt_op = base_opt.minimize(base_loss, global_step, var_list=[base_weight]) test_weight = tf.compat.v1.get_variable(name="test_weights", initializer=tf.ones( [10, 1])) test_logits = tf.nn.relu(math_ops.matmul(x, test_weight)) test_loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=y, logits=test_logits) grads_and_vars = test_opt.compute_gradients(test_loss, var_list=[test_weight]) var_list = [] aggregated_grad = [] for grad, var in grads_and_vars: var_list.append(var) aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars, global_step) with monitored_session.MonitoredTrainingSession( is_chief=True, config=default_config) as sess: for _ in range(run_step): sess.run(base_opt_op) sess.run(test_opt_op) self.assertAllCloseAccordingToType( sess.run(base_weight), sess.run(test_weight), msg="Cond:{},{},{}".format(dtype, run_step, dim), )
def _make_variable(self, metric, value):
    with tf.name_scope('MetricAverageCallback'):
        var = tf.Variable(value, name=metric)
        allreduce_op = hvd.allreduce(var, device_dense=self.device)
        return var, allreduce_op
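# A hedged sketch of how a (variable, allreduce_op) pair like the one returned
# above is typically driven in a metric-averaging callback: each worker loads
# its local metric into the variable, then runs the allreduce op to get the
# cross-rank average. Names and values below are illustrative only.
import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.keras import backend as K

hvd.init()

with tf.name_scope('MetricAverageCallback'):
    metric_var = tf.Variable(0.0, name='val_loss')
    avg_op = hvd.allreduce(metric_var)          # average=True by default

sess = K.get_session()
sess.run(metric_var.initializer)

local_value = 0.5 + 0.1 * hvd.rank()            # placeholder per-rank metric
sess.run(metric_var.assign(local_value))
averaged = sess.run(avg_op)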
def __init__(self, ports, gpu_id, replay_filelist, batch_size, min_train_sample_num, min_val_sample_num, rm_size, learning_rate, print_interval, checkpoint_interval, num_val_batches, replay_converter_type, policy, policy_config, converter_config=None, policy_config_type=None, model_pool_addrs=None, rollout_length=1, checkpoints_dir=None, restore_checkpoint_path=None, train_generator_worker_num=4, val_generator_worker_num=2, pull_worker_num=2, num_sgd_updates=int(1e30), repeat_training_task=False, unroll_length=32, pub_interval=50, max_clip_grad_norm=1, after_loading_init_scope=None, use_mixed_precision=False, use_sparse_as_dense=False, enable_validation=True, post_process_data=None): assert len(ports) == 2 self.use_hvd = has_hvd and hvd.size() > 1 self.rank = 0 if not self.use_hvd else hvd.rank() self.model_key = 'IL-model' self.pub_interval = pub_interval self.rnn = (False if 'use_lstm' not in policy_config else policy_config['use_lstm']) self.hs_len = None # overwrite it using the batch_size for training policy_config['batch_size'] = batch_size if self.rnn: assert model_pool_addrs is not None self._model_pool_apis = ModelPoolAPIs(model_pool_addrs) self._model_pool_apis.check_server_set_up() policy_config['rollout_len'] = rollout_length # infer hidden state length (size) if 'hs_len' in policy_config: self.hs_len = policy_config['hs_len'] elif 'nlstm' in policy_config: self.hs_len = 2 * policy_config['nlstm'] else: self.hs_len = 128 self.should_push_model = (self.rnn and self.rank == 0) use_gpu = (gpu_id >= 0) converter_config = {} if converter_config is None else converter_config train_replay_filelist, val_replay_filelist = _get_local_replays( replay_filelist) replay_converter = replay_converter_type(**converter_config) ob_space, ac_space = replay_converter.space.spaces if post_process_data is not None: ob_space, ac_space = post_process_data(ob_space, ac_space) self.data_pool = ImDataServer( ports=ports, train_replay_filelist=train_replay_filelist, val_replay_filelist=val_replay_filelist, batch_size=batch_size, min_train_sample_num=min_train_sample_num, min_val_sample_num=min_val_sample_num, ob_space=ob_space, ac_space=ac_space, train_generator_worker_num=train_generator_worker_num, val_generator_worker_num=val_generator_worker_num, pull_worker_num=pull_worker_num, rm_size=rm_size, repeat_training_task=repeat_training_task, unroll_length=unroll_length, rollout_length=rollout_length, lstm=self.rnn, hs_len=self.hs_len, use_gpu=use_gpu) self._enable_validation = enable_validation config = tf.ConfigProto(allow_soft_placement=True) if use_gpu: config.gpu_options.visible_device_list = str(gpu_id) config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) net_config = policy_config_type(ob_space, ac_space, **policy_config) net_config_val = deepcopy(net_config) with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope: pass def create_policy(inputs, nc): return policy(inputs=inputs, nc=nc, scope=model_scope) if hasattr(net_config, 'endpoints_verbosity'): # intentionally disables endpoints during training net_config.endpoints_verbosity = 0 device = '/gpu:0' if use_gpu else '/cpu:0' with tf.device(device): if 'use_xla' in policy_config and policy_config['use_xla']: try: # Use tensorflow's accerlated linear algebra compile method with tf.xla.experimental.jit_scope(True): model = create_policy(self.data_pool.train_batch_input, net_config) except: logger.log( "WARNING: using tf.xla requires tf version>=1.15.") model = create_policy(self.data_pool.train_batch_input, 
net_config) else: model = create_policy(self.data_pool.train_batch_input, net_config) model_val = create_policy(self.data_pool.val_batch_input, net_config_val) params = tf.trainable_variables(scope='model') param_norm = tf.global_norm(params) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-5) if use_mixed_precision: try: optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) except: logger.warn( "using tf mixed_precision requires tf version>=1.15.") if self.use_hvd: optimizer = hvd.DistributedOptimizer( optimizer, sparse_as_dense=use_sparse_as_dense) barrier_op = hvd.allreduce(tf.Variable(0.)) self.barrier = lambda: self._sess.run(barrier_op) train_loss = tf.reduce_mean(model.loss.total_il_loss * self.data_pool.train_batch_weight) val_loss = tf.reduce_mean(model_val.loss.total_il_loss * self.data_pool.val_batch_weight) if hasattr(net_config, 'weight_decay') and not net_config.weight_decay: # None or 0.0 total_loss = train_loss else: total_loss = train_loss + model.loss.total_reg_loss grads_and_vars = optimizer.compute_gradients(total_loss, params) clip_vars = model.vars.lstm_vars clip_grads = [grad for grad, var in grads_and_vars if var in clip_vars] nonclip_grads_and_vars = [(grad, var) for grad, var in grads_and_vars if var not in clip_vars] if max_clip_grad_norm > 0: clip_grads, clip_grad_norm = tf.clip_by_global_norm( clip_grads, max_clip_grad_norm) else: clip_grad_norm = tf.global_norm(clip_grads) clip_grads_and_var = list(zip(clip_grads, clip_vars)) grads_and_vars = clip_grads_and_var + nonclip_grads_and_vars grad_norm = tf.global_norm(list(zip(*grads_and_vars))[0]) train_op = optimizer.apply_gradients(grads_and_vars) tf.global_variables_initializer().run(session=self._sess) self.new_params = [ tf.placeholder(p.dtype, shape=p.get_shape()) for p in params ] self.param_assign_ops = [ p.assign(new_p) for p, new_p in zip(params, self.new_params) ] opt_params = optimizer.variables() self.new_opt_params = [ tf.placeholder(p.dtype, shape=p.get_shape()) for p in opt_params ] self.opt_param_assign_ops = [ p.assign(new_p) for p, new_p in zip(opt_params, self.new_opt_params) ] def read_params(): return self._sess.run(params) def read_opt_params(): return self._sess.run(opt_params) def load_model(np_new_params): self._sess.run( self.param_assign_ops, feed_dict={ p: np_p for p, np_p in zip(self.new_params, np_new_params) }) def restore_optimizer(np_new_opt_params): self._sess.run( self.opt_param_assign_ops, feed_dict={ p: np_p for p, np_p in zip(self.new_opt_params, np_new_opt_params) }) def _train_step(): return self._sess.run([ train_loss_aggregated, *train_other_losses_aggregated, grad_norm, clip_grad_norm, param_norm, train_op ], {})[:-1] def _val_step(): # maximal_feat = [tf.reduce_max(tf.cast(x, tf.float32)) # for x in self.data_pool.val_batch_input.X] # print(self._sess.run(maximal_feat, {})) return self._sess.run([ val_loss_aggregated, *val_other_losses_aggregated, *endpoints_aggregated ], {}) self._saver = ChkptsFromSelf(read_params, load_model, self.model_key) if restore_checkpoint_path is not None: self._saver._restore_model_checkpoint(restore_checkpoint_path) if after_loading_init_scope is not None: var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=after_loading_init_scope) logger.log('perform after loading init for vars') for v in var_list: logger.log(v) tf.variables_initializer(var_list).run(session=self._sess) if self.use_hvd: hvd.broadcast_global_variables(0).run(session=self._sess) _allreduce = 
lambda x: x if not self.use_hvd else hvd.allreduce(x) train_loss_aggregated = _allreduce(train_loss) train_other_loss_names = model.loss.loss_endpoints.keys() train_other_losses_aggregated = [ _allreduce(tf.reduce_mean(l * self.data_pool.train_batch_weight)) for l in model.loss.loss_endpoints.values() ] val_loss_aggregated = _allreduce(val_loss) val_other_loss_names = model_val.loss.loss_endpoints.keys() val_other_losses_aggregated = [ _allreduce(tf.reduce_mean(l * self.data_pool.val_batch_weight)) for l in model_val.loss.loss_endpoints.values() ] endpoints_names = model_val.endpoints.keys() endpoints_aggregated = [ _allreduce(tf.reduce_mean(l)) for l in model_val.endpoints.values() ] self._sess.graph.finalize() self._total_samples = lambda: [ self.data_pool._num_train_samples, self.data_pool._num_val_samples ] self._train_log_names = (['loss'] + list(train_other_loss_names) + ['grad_norm', 'clip_grad_norm', 'param_norm']) self._val_log_names = (['loss'] + list(val_other_loss_names) + list(endpoints_names)) self._batch_size = batch_size self._train_step = _train_step self._val_step = _val_step self._print_interval = print_interval self._checkpoint_interval = checkpoint_interval self._num_val_batches = num_val_batches self._checkpoints_dir = checkpoints_dir if self.rank == 0 else None self._num_sgd_updates = num_sgd_updates self.load_model = load_model self.restore_optimizer = restore_optimizer self.read_params = read_params self.read_opt_params = read_opt_params format_strs = ['log', 'tensorboard', 'csv'] logger.configure(dir='training_log/rank{}'.format(self.rank), format_strs=['stdout'] + format_strs) with logger.scoped_configure(dir='validation_log/rank{}'.format( self.rank), format_strs=['stderr'] + format_strs): self.val_logger = logger.Logger.CURRENT
def _setup_graph(self):
    self._placeholder = tf.placeholder(tf.float32, shape=[2],
                                       name='to_be_reduced')
    self._reduced = hvd.allreduce(self._placeholder, average=False)
def get_larc_optimizer(optimizer, loss, global_step, steps_per_epoch, use_horovod): #get learning rate learning_rate = get_learning_rate(optimizer, global_step, steps_per_epoch) #get LARC stuff LARC_mode = get_dict_default(optimizer, "LARC_mode", "clip") LARC_eta = get_dict_default(optimizer, "LARC_eta", 0.002) LARC_epsilon = get_dict_default(optimizer, "LARC_epsilon", 1. / 16000.) #lag gradient_lag = get_dict_default(optimizer, "gradient_lag", 0) #set up optimizers opt_type = get_dict_default(optimizer, "opt_type", "LARC-Adam") #set up optimizers if opt_type == "LARC-Adam": beta1 = get_dict_default(optimizer, "beta1", 0.9) beta2 = get_dict_default(optimizer, "beta2", 0.999) optim = tf.train.AdamOptimizer(learning_rate=learning_rate) # optim = tf.train.experimental.enable_mixed_precision_graph_rewrite(optim) elif opt_type == "LARC-RMSProp": optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate) elif opt_type == "LARC-SGD": momentum = get_dict_default(optimizer, "momentum", 0.) optim = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) else: raise ValueError("Error, optimizer {} unsupported.".format(opt_type)) # instead of using the horovod wrapper, we do the allreduce ourselves below #compute gradients grads_and_vars = optim.compute_gradients(loss) lag_ops = [] for idx, (g, v) in enumerate(grads_and_vars): if g is not None: if gradient_lag > 0: g_lag = tf.Variable(initial_value=tf.zeros(g.shape, g.dtype), trainable=False, name=v.name.replace(":", "_") + '_lag') g_next = g g = g_lag if use_horovod and (hvd.size() > 1): # if we ask for an average, it does a scalar divide, but # we can bake that into the scaling below g = hvd.allreduce(g, average=False) g_scale = 1. / hvd.size() else: g_scale = 1 v_norm = linalg_ops.norm(tensor=v, ord=2) g_norm = linalg_ops.norm(tensor=g, ord=2) larc_local_lr = control_flow_ops.cond( pred=math_ops.logical_and( math_ops.not_equal(v_norm, tf.constant(0.0)), math_ops.not_equal(g_norm, tf.constant(0.0))), true_fn=lambda: (LARC_eta / g_scale) * v_norm / g_norm, false_fn=lambda: LARC_epsilon) if LARC_mode == "scale": effective_lr = larc_local_lr else: # DEBUG #effective_lr = math_ops.minimum(larc_local_lr, 1.0) #we need to see which LR to take and then divide out the LR because otherwise it will be multiplied in #again when we apply the gradients effective_lr = math_ops.minimum(larc_local_lr, learning_rate) / learning_rate # DEBUG # rescale gradients effective_lr *= g_scale #multiply gradients g_scaled = math_ops.scalar_mul(effective_lr, g) grads_and_vars[idx] = (g_scaled, v) if gradient_lag > 0: # once we've computed g_scaled, it's safe to overwrite g_lag with tf.control_dependencies([g_scaled]): lag_ops.append(g_lag.assign(g_next)) #apply gradients, making sure to complete the forward pass first with tf.control_dependencies([loss]): grad_updates = optim.apply_gradients(grads_and_vars, global_step=global_step) if gradient_lag > 0: grad_updates = tf.group([grad_updates] + lag_ops) return grad_updates, learning_rate
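# A minimal, self-contained sketch (assumptions, not the original code) of the
# LARC trust-ratio computation used above for a single (gradient, variable)
# pair: local_lr = eta * ||v|| / ||g||, clipped to the global learning rate in
# "clip" mode, with the 1/hvd.size() scale folded back in because the
# allreduce was performed as a sum. All constants are illustrative.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

eta = 0.002
epsilon = 1.0 / 16000.0
learning_rate = tf.constant(0.1)

v = tf.random_normal([256, 256])                  # a model variable (as a tensor)
g = hvd.allreduce(tf.random_normal([256, 256]), average=False)  # summed grad
g_scale = 1.0 / hvd.size()                        # finish the average ourselves

v_norm = tf.norm(v, ord=2)
g_norm = tf.norm(g, ord=2)

larc_local_lr = tf.cond(
    tf.logical_and(tf.not_equal(v_norm, 0.0), tf.not_equal(g_norm, 0.0)),
    true_fn=lambda: (eta / g_scale) * v_norm / g_norm,
    false_fn=lambda: tf.constant(epsilon))

# "clip" mode: never exceed the global LR; divide the LR back out because the
# optimizer will multiply it in again when applying the gradient.
effective_lr = tf.minimum(larc_local_lr, learning_rate) / learning_rate
effective_lr *= g_scale
g_scaled = effective_lr * g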
def allreduce(backend, value, name, average, prescale_factor,
              postscale_factor, op, compression):
    return _eval(backend, hvd.allreduce(tf.constant(value, name=name),
                                        average=average,
                                        prescale_factor=prescale_factor,
                                        postscale_factor=postscale_factor,
                                        op=op,
                                        compression=compression))
def ppo2_loss(neglogp, oldneglogp, vpred, R, mask=None, adv_normalize=True,
              clip_range=0.1, sync_statistics=None):
    """PPO2 loss.

    The PPO implementation where the values are computed at the learner's
    end, i.e., there is no `oldvpred` term. This treatment is suitable for
    PPO with a large Replay Memory, where the off-policy effect is
    significant.

    It accepts the shape layout (b1, b2, ...), which is taken as the
    batch_size dimension. For example, `neglogp`
    * can be (batch_size,)
    * can be (T, B), where T=rollout_len, B=n_rollout

    Args:
        neglogp: neglogp of pi, in shape (b1, b2, ...)
        oldneglogp: neglogp of pi_old, in shape (b1, b2, ...)
        vpred: predicted v, in shape (b1, b2, ...)
        R: (lambda) return, in shape (b1, b2, ...)
        mask: action logits mask in 0/1 value, in shape (b1, b2, ...), the
            same shape as `neglogp` and `oldneglogp`
        adv_normalize: whether to normalize the advantage
        clip_range: clip range, scalar
        sync_statistics: whether to synchronize statistics across multiple
            GPUs (if any)

    Returns:
        pg_loss: the PPO policy loss, a scalar in shape ()
    """
    # Note the negative sign; we want ratio = pi / pi_{old}
    ratio = oldneglogp - neglogp
    if mask is not None:
        ratio = mask * ratio
    ratio = tf.exp(ratio)

    # make the advantage
    # Note: DO the stop_gradient stuff AT THE CALLER'S END when necessary
    # R = tf.stop_gradient(R)
    # vpred = tf.stop_gradient(vpred)
    adv = R - vpred

    # normalize the advantage
    batch_mean = tf.reduce_mean(adv)
    batch_mean_square = tf.reduce_mean(tf.square(adv))
    if sync_statistics == 'horovod':
        # https://github.com/tensorpack/tensorpack/blob/07783edb998cec3ec91c4312b39bd754cf9ececa/tensorpack/models/batch_norm.py#L226-L231
        import horovod.tensorflow as hvd
        batch_mean = hvd.allreduce(batch_mean, average=True)
        batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
    adv = adv - batch_mean
    if adv_normalize:
        adv = adv / tf.sqrt(batch_mean_square + 1e-8)

    # the ppo policy gradient loss
    pg_losses1 = -adv * ratio
    pg_losses2 = -adv * tf.clip_by_value(ratio, 1.0 - clip_range,
                                         1.0 + clip_range)
    # pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
    pg_loss = tf.reduce_mean(
        tf.where(tf.greater(adv, 0),
                 tf.maximum(pg_losses1, pg_losses2),
                 pg_losses2))
    return pg_loss
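# A small sketch (assumptions only) isolating the cross-rank statistics trick
# used in ppo2_loss above: each rank computes E[x] and E[x^2] over its local
# batch, both are averaged with hvd.allreduce, and the global variance follows
# as E[x^2] - E[x]^2. This requires every rank to run the same ops with the
# same local batch size.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

x = tf.random_normal([1024])                     # local batch of values
local_mean = tf.reduce_mean(x)
local_mean_sq = tf.reduce_mean(tf.square(x))

global_mean = hvd.allreduce(local_mean, average=True)
global_mean_sq = hvd.allreduce(local_mean_sq, average=True)
global_var = global_mean_sq - tf.square(global_mean)

x_normalized = (x - global_mean) / tf.sqrt(global_var + 1e-8)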
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients): """ :param tf.Tensor loss: :param list[tf.Variable] trainable_vars_for_gradients: :return: op with all variable updates combined, using the optimizer :rtype: tf.Operation """ if not trainable_vars_for_gradients: return tf.no_op(name="no_grad_vars_no_op") # AccumulateN might not be deterministic but should be faster and should require less memory. # We might want to make this configurable. if self.config.is_true("deterministic_train"): aggregation_method = tf.AggregationMethod.ADD_N else: aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N accum_grad_multiple_num_steps = self.config.int( "accum_grad_multiple_step", 0) grad_noise = self.config.float("gradient_noise", 0.0) grad_clip = self.config.float("gradient_clip", 0.0) grad_clip_norm = self.config.float("gradient_clip_norm", 0.0) grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0) grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0) # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm # Extended self.optimizer.minimize() to optionally modify gradients. grads_and_vars = self.optimizer.compute_gradients( loss, var_list=trainable_vars_for_gradients, aggregation_method=aggregation_method) if self.config.is_true("use_horovod") and self.config.value( "horovod_reduce_type", "") == "grad": import horovod.tensorflow as hvd grads_and_vars = [(hvd.allreduce( grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var) for (grad, var) in grads_and_vars] var_grads = { var: grad for (grad, var) in grads_and_vars if grad is not None } if not var_grads: raise Exception("no single variable to train") if self.config.float("maximize_grad_norm", 0): f = self.config.float("maximize_grad_norm", 0) grad_norm = tf.add_n( [tf.nn.l2_loss(g) for g in var_grads.values()], name="grad_norm_half") * 2.0 loss_ext = grad_norm * (-f) grads_and_vars_ext = self.optimizer.compute_gradients( loss_ext, var_list=list(var_grads.keys()), aggregation_method=aggregation_method) var_grads_ext = { var: grad for (grad, var) in grads_and_vars_ext if grad is not None } grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var) for (grad, var) in grads_and_vars] if accum_grad_multiple_num_steps >= 1: grads_and_vars = [(accum_grad_multiple_step( grad, var, train_step=self.network.global_train_step, num_accum_steps=accum_grad_multiple_num_steps), var) for (grad, var) in grads_and_vars] if self.config.bool("debug_grad_summaries", False): from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor for grad, var in grads_and_vars: with reuse_name_scope_of_tensor(grad, prefix="grads/"): variable_summaries(grad, name="grad_of_%s" % get_base_name(var)) with reuse_name_scope_of_tensor(var, prefix="vars/"): variable_summaries(var, name=get_base_name(var)) # Also see tf.contrib.layers.optimizers.optimize_loss() for reference. 
if self.config.bool("gradient_nan_inf_filter", False): from TFUtil import nan_to_num grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars] if grad_noise: assert grad_noise > 0 from TFUtil import add_scaled_noise_to_gradients with tf.name_scope("grad_noise"): grads_and_vars = add_scaled_noise_to_gradients( grads_and_vars, grad_noise) if grad_clip: assert grad_clip > 0 with tf.name_scope("grad_clip"): grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars] if grad_clip_norm: assert grad_clip_norm > 0 with tf.name_scope("grad_clip_norm"): grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var) for grad, var in grads_and_vars] if grad_clip_avg_norm: assert grad_clip_avg_norm > 0 with tf.name_scope("grad_clip_avg_norm"): grads_and_vars = [ (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var) for grad, var in grads_and_vars ] if grad_clip_global_norm: assert grad_clip_global_norm > 0 with tf.name_scope("grad_clip_global_norm"): grads_clipped, _ = tf.clip_by_global_norm( [grad for (grad, _) in grads_and_vars], grad_clip_global_norm) grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars]) if accum_grad_multiple_num_steps >= 1: apply_grads = tf.cond( tf.equal( tf.mod(self.network.global_train_step, accum_grad_multiple_num_steps), accum_grad_multiple_num_steps - 1), true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars), false_fn=lambda: tf.no_op(), name="apply_grads/accum_grad_multiple_step") else: apply_grads = self.optimizer.apply_gradients(grads_and_vars) return apply_grads
data = tf.get_variable("data", shape=[batch_size, 224, 224, 3],
                       initializer=tf.random_normal_initializer(),
                       trainable=False)
target = tf.get_variable("target", shape=[batch_size, 1],
                         initializer=tf.random_normal_initializer(),
                         trainable=False)
target = tf.cast(target, tf.int64)
network = Network(batch_size)
loss = network.cal_train_loss([data, target])
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
grads1 = [hvd.allreduce(grad, average=False) for grad in grads]
gradvars = list(zip(grads1, params))
opt = tf.train.GradientDescentOptimizer(0.001)
train_opt = opt.apply_gradients(gradvars)
init = tf.compat.v1.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(config=config) as session:
    session.run(init)
    session.run(bcast_op)
    for x in range(num_iters):
        session.run(train_opt)
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', ema_update='default', sync_statistics=None, internal_update=None): """ A more powerful version of `tf.layers.batch_normalization`. It differs from the offical one in the following aspects: 1. Accepts an alternative ``data_format`` option when ``axis`` is None. For 2D input, this argument will be ignored. 2. Default value for ``momentum`` and ``epsilon`` is different. 3. Default value for ``training`` is automatically obtained from tensorpack's ``TowerContext``. User-provided value can overwrite this behavior. 4. Support the ``ema_update`` option, which covers broader use cases than the standard EMA update. 5. Support the ``sync_statistics`` option, which implements "SyncBN" and is very useful in small-batch models. Args: training (bool): if True, use per-batch statistics to normalize. Otherwise, use stored EMA to normalize. By default, it is equal to `get_current_tower_context().is_training`. This is not a good argument name, but it is what the Tensorflow layer uses. ema_update (str): Only effective when ``training=True``. It has the following options: * "default": same as "collection". Because this is the default behavior in tensorflow. * "skip": do not update EMA. This can be useful when you reuse a batch norm layer in several places but do not want them to all update your EMA. * "collection": Add EMA update ops to collection `tf.GraphKeys.UPDATE_OPS`. The ops in the collection will be run automatically by the callback :class:`RunUpdateOps`, along with your training iterations. This can waste compute if your training iterations do not always depend on the BatchNorm layer. * "internal": EMA is updated inside this layer itself by control dependencies. In common cases, it has similar speed to "collection". But it covers more cases, e.g.: 1. BatchNorm is used inside dynamic control flow. The collection-based update does not support dynamic control flows. 2. BatchNorm layer is sometimes unused (e.g., in GANs you have two networks to train alternatively). Putting all update ops into a single collection will waste a lot of compute. 3. Other part of the model relies on the "updated" EMA. The collection-based method does not update EMA immediately. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 sync_statistics (str or None): one of None, "nccl", or "horovod". It determines how to compute the "per-batch statistics" when ``training==True``. * None: it uses statistics of the input tensor to normalize during training. This is the standard way BatchNorm was implemented in most frameworks. * "nccl": this layer must be used under tensorpack's multi-GPU trainers. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. * "horovod": this layer must be used under tensorpack's :class:`HorovodTrainer`. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. Note that on single machine this is significantly slower than the "nccl" implementation. When not None, each GPU computes its own E[x] and E[x^2], which are then averaged among all GPUs to compute global mean & variance. Therefore each GPU needs to have the same batch size. The synchronization is based on the current variable scope + the name of the layer (`BatchNorm('name', input)`). Therefore, you need to make sure that: 1. 
The BatchNorm layer on different GPUs needs to have the same name, so that statistics can be synchronized. If names do not match, this layer will hang. 2. A BatchNorm layer cannot be reused within one tower. 3. A BatchNorm layer needs to be executed for the same number of times by all GPUs. If different GPUs execute one BatchNorm layer for different number of times (e.g., if some GPUs do not execute it), this layer may hang. This option is also known as "SyncBN" or "Cross-GPU BatchNorm" as mentioned in: `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222. When `sync_statistics` is enabled, `ema_update` is set to "internal" automatically. This is to avoid running `UPDATE_OPS`, which requires synchronization. internal_update: deprecated option. Don't use. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: This layer is more flexible than the standard "BatchNorm" layer and provides more features: 1. No matter whether you're doing training or not, you can set the ``training`` argument to use batch statistics or EMA statistics. i.e., you can use batch statistics during inference, or use EMA statistics during training. Using EMA statistics in training is useful when you load a pre-trained BN and don't want to update it. 2. As long as `training=True`, `sync_statistics` and `ema_update` option will take effect. """ # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) # parse shapes data_format = get_data_format(data_format, keras_mode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if sync_statistics is not None: sync_statistics = sync_statistics.lower() assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics assert ema_update in ["default", "collection", "internal", "skip"] if internal_update is not None: log_deprecated("BatchNorm(internal_update=)", "Use ema_update='internal' instead!", "2020-01-01") assert ema_update == 'default', \ "Do not use internal_update and ema_update together! internal_update is deprecated" ema_update = "internal" if internal_update else "collection" if ema_update == "default": ema_update = "collection" # Logic: # 1. EMA update is possible only when we compute batch statistics (training=True) # 2. We know that in training, non-main training tower does not need EMA update # We don't know about what to do in prediction context, so be conservative and do the update. # 3. User can explicit disable update by "skip". do_ema_update = training and \ (ctx.is_main_training_tower or not ctx.is_training) \ and (ema_update != "skip") if axis is None: if ndims == 2: axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 assert axis in [1, 3], axis num_chan = shape[axis] TF_version = get_tf_version_tuple() freeze_bn_backward = not training and ctx.is_training if freeze_bn_backward: assert TF_version >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!" if ctx.is_main_training_tower: # only warn in first tower log_once("Some BatchNorm layer uses moving_mean/moving_variance in training.", func='warn') # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. 
do_sync_bn = (sync_statistics is not None) and training if not do_sync_bn: # Use the builtin layer for anything except for sync-bn coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable( {'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA'}): tf_args = dict( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429 fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" use_fp16 = inputs.dtype == tf.float16 if use_fp16: # non-fused does not support fp16; fused does not support all layouts. # we made our best guess here tf_args['fused'] = True layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # Add EMA variables to the correct collection if ctx.is_main_training_tower: for v in layer.non_trainable_variables: if isinstance(v, tf.Variable): tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) if not do_ema_update: restore_collection(coll_bk) if do_ema_update and ema_update == "internal": # Implement "internal" update. restore_collection(coll_bk) assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': num_dev = ctx.total if num_dev == 1: logger.warn("BatchNorm(sync_statistics='nccl') is used with only one tower!") else: assert six.PY2 or TF_version >= (1, 10), \ "Cross-GPU BatchNorm is only supported in TF>=1.10 ." 
\ "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360" if TF_version <= (1, 12): try: from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so except Exception: pass else: _validate_and_load_nccl_so() from tensorflow.contrib.nccl.ops import gen_nccl_ops else: from tensorflow.python.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd if hvd.size() == 1: logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!") else: import horovod hvd_version = tuple(map(int, horovod.__version__.split('.')[:3])) assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !" batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization( inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization( inputs, batch_mean, batch_var, beta, gamma, epsilon) if do_ema_update: ret = internal_update_bn_ema( xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
def main(input_path_train, input_path_validation, downsampling_fact, downsampling_mode, channels, data_format, label_id, blocks, weights, image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, fs_type, optimizer, batch, batchnorm, num_epochs, dtype, chkpt, filter_sz, growth, disable_checkpoints, disable_imsave, tracing, trace_dir, output_sampling, scale_factor): #init horovod nvtx.RangePush("init horovod", 1) comm_rank = 0 comm_local_rank = 0 comm_size = 1 comm_local_size = 1 if horovod: hvd.init() comm_rank = hvd.rank() comm_local_rank = hvd.local_rank() comm_size = hvd.size() #not all horovod versions have that implemented try: comm_local_size = hvd.local_size() except: comm_local_size = 1 if comm_rank == 0: print("Using distributed computation with Horovod: {} total ranks". format(comm_size, comm_rank)) nvtx.RangePop() # init horovod #downsampling? recompute image dims image_height = image_height_orig // downsampling_fact image_width = image_width_orig // downsampling_fact #parameters per_rank_output = False loss_print_interval = 10 #session config sess_config = tf.ConfigProto( inter_op_parallelism_threads=6, #1 intra_op_parallelism_threads=1, #6 log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(comm_local_rank) sess_config.gpu_options.force_gpu_compatible = True #get data training_graph = tf.Graph() if comm_rank == 0: print("Loading data...") trn_data = load_data(input_path_train, True, trn_sz, horovod) val_data = load_data(input_path_validation, False, val_sz, horovod) if comm_rank == 0: print("Shape of trn_data is {}".format(trn_data.shape[0])) print("done.") #print some stats if comm_rank == 0: print("Num workers: {}".format(comm_size)) print("Local batch size: {}".format(batch)) if dtype == tf.float32: print("Precision: {}".format("FP32")) else: print("Precision: {}".format("FP16")) print("Batch normalization: {}".format(batchnorm)) print("Blocks: {}".format(blocks)) print("Growth rate: {}".format(growth)) print("Filter size: {}".format(filter_sz)) print("Channels: {}".format(channels)) print("Loss type: {}".format(loss_type)) print("Loss weights: {}".format(weights)) print("Loss scale factor: {}".format(scale_factor)) print("Output sampling target: {}".format(output_sampling)) #print optimizer parameters for k, v in optimizer.items(): print("Solver Parameters: {k}: {v}".format(k=k, v=v)) #print("Optimizer type: {}".format(optimizer['opt_type'])) print("Num training samples: {}".format(trn_data.shape[0])) print("Num validation samples: {}".format(val_data.shape[0])) print("Disable checkpoints: {}".format(disable_checkpoints)) print("Disable image save: {}".format(disable_imsave)) print("Downsampling factor: {}".format(downsampling_fact)) print("Downsampling mode: {}".format(downsampling_mode)) #compute epochs and stuff: if fs_type == "local": num_samples = trn_data.shape[0] // comm_local_size else: num_samples = trn_data.shape[0] // comm_size num_steps_per_epoch = num_samples // batch num_steps = num_epochs * num_steps_per_epoch if per_rank_output: print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch)) with training_graph.as_default(): nvtx.RangePush("TF Init", 3) #create readers trn_reader = h5_input_reader(input_path_train, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, label_id=label_id, sample_target=output_sampling) val_reader = h5_input_reader(input_path_validation, channels, weights, dtype, normalization_file="stats.h5", 
update_on_read=False, data_format=data_format, label_id=label_id) #create datasets if fs_type == "local": trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs, comm_local_size, comm_local_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, val_data, batch, 1, comm_local_size, comm_local_rank, dtype, shuffle=False) else: trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs, comm_size, comm_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, val_data, batch, 1, comm_size, comm_rank, dtype, shuffle=False) #create iterators handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder") iterator = tf.data.Iterator.from_string_handle( handle, (dtype, tf.int32, dtype, tf.string), ((batch, len(channels), image_height_orig, image_width_orig) if data_format == "channels_first" else (batch, image_height_orig, image_width_orig, len(channels)), (batch, image_height_orig, image_width_orig), (batch, image_height_orig, image_width_orig), (batch))) next_elem = iterator.get_next() #if downsampling, do some preprocessing if downsampling_fact != 1: if downsampling_mode == "scale": #do downsampling rand_select = tf.cast(tf.one_hot(tf.random_uniform( (batch, image_height, image_width), minval=0, maxval=downsampling_fact * downsampling_fact, dtype=tf.int32), depth=downsampling_fact * downsampling_fact, axis=-1), dtype=tf.int32) next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \ tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \ [1, downsampling_fact, downsampling_fact, 1], \ [1, downsampling_fact, downsampling_fact, 1], \ [1,1,1,1], 'VALID'), rand_select), axis=-1), \ tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \ next_elem[3]) elif downsampling_mode == "center-crop": #some parameters length = 1. / float(downsampling_fact) offset = length / 2. 
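# (Worked example, not from the original source.) tf.image.crop_and_resize below
# takes normalized boxes [y1, x1, y2, x2] in [0, 1]. With downsampling_fact = 2:
#     length = 1 / 2   = 0.5
#     offset = 0.5 / 2 = 0.25
#     box    = [0.25, 0.25, 0.75, 0.75]
# i.e. the central half of the image along each spatial dimension, which is then
# sampled at crop_size = [image_height, image_width], the downsampled resolution.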
boxes = [[offset, offset, offset + length, offset + length] ] * batch box_ind = list(range(0, batch)) crop_size = [image_height, image_width] #be careful with data order if data_format == "channels_first": next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]), next_elem[1], next_elem[2], next_elem[3]) #crop next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \ ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \ tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \ next_elem[3]) #be careful with data order if data_format == "channels_first": next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]), next_elem[1], next_elem[2], next_elem[3]) elif downsampling_mode == "random-crop": #some parameters crop_size = [ batch, image_height, image_width, len(channels) + 2 ] #concatenate input, crop, split apart crop_input = tf.concat([next_elem[0] if data_format=="channels_last" else tf.transpose(next_elem[0], perm=[0,2,3,1]), \ ensure_type(tf.expand_dims(next_elem[1], axis=-1), tf.float32), \ tf.expand_dims(next_elem[2], axis=-1)], \ axis = -1) crop_output = tf.image.random_crop(crop_input, crop_size) #restore iterator output crop_image = crop_output[:, :, :, :len(channels)] crop_label = ensure_type(crop_output[:, :, :, len(channels)], tf.int32) crop_weight = crop_output[:, :, :, len(channels) + 1] next_elem = (crop_image if data_format=="channels_last" else tf.transpose(crop_image, perm=[0,3,1,2]), \ crop_label, crop_weight, next_elem[3]) else: raise ValueError( "Error, downsampling mode {} not supported. Supported are [center-crop, random-crop, scale]" .format(downsampling_mode)) #create init handles #trn trn_iterator = trn_dataset.make_initializable_iterator() trn_handle_string = trn_iterator.string_handle() trn_init_op = iterator.make_initializer(trn_dataset) #val val_iterator = val_dataset.make_initializable_iterator() val_handle_string = val_iterator.string_handle() val_init_op = iterator.make_initializer(val_dataset) #compute the input filter number based on number of channels used num_channels = len(channels) nb_filter = 64 #set up model logit, prediction = create_tiramisu(3, next_elem[0], image_height, image_width, num_channels, loss_weights=weights, nb_layers_per_block=blocks, p=0.2, wd=1e-4, dtype=dtype, batchnorm=batchnorm, growth_rate=growth, nb_filter=nb_filter, filter_sz=filter_sz, median_filter=False, data_format=data_format) #prediction_argmax = median_pool(prediction_argmax, 3, strides=[1,1,1,1]) #set up loss loss = None if loss_type == "weighted": #cast weights to FP32 w_cast = ensure_type(next_elem[2], tf.float32) loss = tf.losses.sparse_softmax_cross_entropy( labels=next_elem[1], logits=logit, weights=w_cast, reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS) if scale_factor != 1.0: loss *= scale_factor elif loss_type == "focal": labels_one_hot = tf.contrib.layers.one_hot_encoding( next_elem[1], 3) labels_one_hot = ensure_type(labels_one_hot, dtype) loss = focal_loss(onehot_labels=labels_one_hot, logits=logit, alpha=1., gamma=2.) 
else: raise ValueError("Error, loss type {} not supported.", format(loss_type)) #determine flops flops = graph_flops.graph_flops( format="NHWC" if data_format == "channels_last" else "NCHW", batch=batch, sess_config=sess_config) flops *= comm_size if comm_rank == 0: print('training flops: {:.3f} TF/step'.format(flops * 1e-12)) #number of trainable parameters if comm_rank == 0: num_params = get_number_of_trainable_parameters() print('number of trainable parameters: {} ({} MB)'.format( num_params, num_params * (4 if dtype == tf.float32 else 2) * (2**-20))) if horovod: loss_avg = hvd.allreduce(ensure_type(loss, tf.float32)) else: loss_avg = tf.identity(loss) #set up global step - keep on CPU with tf.device('/device:CPU:0'): global_step = tf.train.get_or_create_global_step() #set up optimizer if optimizer['opt_type'].startswith("LARC"): if comm_rank == 0: print("Enabling LARC") train_op, lr = get_larc_optimizer(optimizer, loss, global_step, num_steps_per_epoch, horovod) else: train_op, lr = get_optimizer(optimizer, loss, global_step, num_steps_per_epoch, horovod) #set up streaming metrics iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1], predictions=tf.argmax( prediction, axis=3), num_classes=3, weights=None, metrics_collections=None, updates_collections=None, name="iou_score") iou_reset_op = tf.variables_initializer([ i for i in tf.local_variables() if i.name.startswith('iou_score/') ]) if horovod: iou_avg = hvd.allreduce(iou_op) else: iou_avg = tf.identity(iou_op) #hooks #these hooks are essential. regularize the step hook by adding one additional step at the end hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)] #bcast init for bcasting the model after start if horovod: init_bcast = hvd.broadcast_global_variables(0) #initializers: init_op = tf.global_variables_initializer() init_local_op = tf.local_variables_initializer() #checkpointing if comm_rank == 0: checkpoint_save_freq = 5 * num_steps_per_epoch checkpoint_saver = tf.train.Saver(max_to_keep=1000) if (not disable_checkpoints): hooks.append( tf.train.CheckpointSaverHook( checkpoint_dir=checkpoint_dir, save_steps=checkpoint_save_freq, saver=checkpoint_saver)) #create image dir if not exists if not os.path.isdir(image_dir): os.makedirs(image_dir) if tracing is not None: import tracehook tracing_hook = tracehook.TraceHook(steps_to_trace=tracing, cache_traces=True, trace_dir=trace_dir) hooks.append(tracing_hook) # instead of averaging losses over an entire epoch, use a moving # window average recent_losses = [] loss_window_size = 10 #start session with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess: #initialize sess.run([init_op, init_local_op]) #restore from checkpoint: if comm_rank == 0 and not disable_checkpoints: load_model(sess, checkpoint_saver, checkpoint_dir) #broadcast loaded model variables if horovod: sess.run(init_bcast) #create iterator handles trn_handle, val_handle = sess.run( [trn_handle_string, val_handle_string]) #init iterators sess.run(trn_init_op, feed_dict={handle: trn_handle}) sess.run(val_init_op, feed_dict={handle: val_handle}) nvtx.RangePop() # TF Init # figure out what step we're on (it won't be 0 if we are # restoring from a checkpoint) so we can count from there train_steps = sess.run([global_step])[0] #do the training epoch = 1 step = 1 t_sustained_start = time.time() nvtx.RangePush("Training Loop", 4) nvtx.RangePush("Epoch", epoch) start_time = time.time() while not sess.should_stop(): #training loop try: nvtx.RangePush("Step", step) #construct feed dict 
t_inst_start = time.time() _, tmp_loss = sess.run( [train_op, (loss if per_rank_output else loss_avg)], feed_dict={handle: trn_handle}) t_inst_end = time.time() train_steps += 1 train_steps_in_epoch = train_steps % num_steps_per_epoch recent_losses = [tmp_loss ] + recent_losses[0:loss_window_size - 1] train_loss = sum(recent_losses) / len(recent_losses) nvtx.RangePop() # Step step += 1 #print step report eff_steps = train_steps_in_epoch if ( train_steps_in_epoch > 0) else num_steps_per_epoch if (train_steps % loss_print_interval) == 0: if per_rank_output: print( "REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}" .format(comm_rank, train_steps, num_steps, train_loss, time.time() - start_time)) else: if comm_rank == 0: print( "REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}" .format( train_steps, num_steps, train_loss, time.time() - start_time, 1e-12 * flops / (t_inst_end - t_inst_start))) #do the validation phase if train_steps_in_epoch == 0: end_time = time.time() #print epoch report if per_rank_output: print( "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}" .format( comm_rank, epoch, num_epochs, train_loss, time.time() - start_time, 1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start))) else: if comm_rank == 0: print( "COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}" .format( epoch, num_epochs, train_loss, time.time() - start_time, 1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start))) #evaluation loop eval_loss = 0. eval_steps = 0 nvtx.RangePush("Eval Loop", 7) while True: try: #construct feed dict _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run( [ iou_update_op, (loss if per_rank_output else loss_avg), prediction, next_elem[1], next_elem[3] ], feed_dict={handle: val_handle}) #print some images if comm_rank == 0 and not disable_imsave: if have_imsave: imsave( image_dir + '/test_pred_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', np.argmax( val_model_predictions[0, ...], axis=2) * 100) imsave( image_dir + '/test_label_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', val_model_labels[0, ...] * 100) imsave( image_dir + '/test_combined_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', colormap[ val_model_labels[0, ...], np.argmax( val_model_predictions[0, ...], axis=2)]) else: np.savez( image_dir + '/test_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npz', prediction=np.argmax( val_model_predictions[0, ...], axis=2) * 100, label=val_model_labels[0, ...] 
* 100, filename=val_model_filenames[0]) eval_loss += tmp_loss eval_steps += 1 except tf.errors.OutOfRangeError: eval_steps = np.max([eval_steps, 1]) eval_loss /= eval_steps if per_rank_output: print( "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}" .format(comm_rank, epoch, num_epochs, eval_loss)) else: if comm_rank == 0: print( "COMPLETED: evaluation loss for epoch {} (of {}) is {}" .format(epoch, num_epochs, eval_loss)) if per_rank_output: iou_score = sess.run(iou_op) print( "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}" .format(comm_rank, epoch, num_epochs, iou_score)) else: iou_score = sess.run(iou_avg) if comm_rank == 0: print( "COMPLETED: evaluation IoU for epoch {} (of {}) is {}" .format(epoch, num_epochs, iou_score)) sess.run(iou_reset_op) sess.run(val_init_op, feed_dict={handle: val_handle}) break nvtx.RangePop() # Eval Loop #reset counters epoch += 1 step = 0 t_sustained_start = time.time() nvtx.RangePop() # Epoch nvtx.RangePush("Epoch", epoch) except tf.errors.OutOfRangeError: break nvtx.RangePop() # Epoch nvtx.RangePop() # Training Loop # write any cached traces to disk if tracing is not None: tracing_hook.write_traces()
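The step reports above smooth the training loss with a fixed-size moving window (recent_losses / loss_window_size) rather than averaging over a whole epoch. A minimal stand-alone sketch of that bookkeeping, with names invented for illustration:

from collections import deque

class MovingAverage:
    """Mean of the most recent `window` values, mirroring the
    recent_losses / loss_window_size logic in the training loop above."""
    def __init__(self, window=10):
        self.values = deque(maxlen=window)

    def update(self, value):
        self.values.append(value)
        return sum(self.values) / len(self.values)

avg = MovingAverage(window=10)
for step, loss in enumerate([0.9, 0.7, 0.8, 0.6]):
    print("step", step, "windowed loss", avg.update(loss))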
if __name__ == '__main__': args = get_args() if args.mixed_precision: policy = tf.keras.mixed_precision.Policy("mixed_float16") tf.keras.mixed_precision.set_global_policy(policy) local_rank = os.getenv("OMPI_COMM_WORLD_RANK") os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank) hvd.init() dense_variables, vocabulary_tensors, samples, labels = generate_data(args) sok_loss_list = run_sok_model(args, dense_variables, vocabulary_tensors, samples, labels) # compute_average_loss for i in range(args.iter_num): sok_loss_list[i] = hvd.allreduce(sok_loss_list[i]) if hvd.local_rank() == 0: tf_loss_list = run_tf_model(args, dense_variables, vocabulary_tensors, samples, labels) if hvd.local_rank() == 0: for i in range(args.iter_num): print('Iteration: {}, sok={}, tf={}'.format( i, sok_loss_list[i], tf_loss_list[i]))
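The comparison block above averages each iteration's SOK loss over all workers with `hvd.allreduce` before printing it next to the single-process TF loss. A minimal eager-mode sketch of that reduction (the loss values are made up, and the script would be launched with horovodrun/mpirun):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each rank contributes its own local loss; allreduce averages across ranks by default.
local_loss = tf.constant(0.5 + 0.1 * hvd.rank())
global_loss = hvd.allreduce(local_loss)

if hvd.rank() == 0:
    print("averaged loss:", float(global_loss))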
def hvd_barrier(): hvd.allreduce(tf.random.normal([1]))
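This helper works as a barrier because an allreduce cannot complete until every rank has entered it; reducing a throwaway one-element tensor therefore blocks each process until all processes reach the same point. Note this relies on eager execution (in TF1 graph mode the returned op would additionally have to be run in a session). A self-contained usage sketch, assuming the common pattern of per-rank output files (paths and the shard-writing step are invented for illustration):

import os
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

def hvd_barrier():
    # Same trick as above: an allreduce cannot finish until every rank joins it.
    hvd.allreduce(tf.random.normal([1]))

# Hypothetical per-rank work: each worker writes its own shard.
shard_path = "/tmp/shard_{}.txt".format(hvd.rank())
with open(shard_path, "w") as f:
    f.write("rank {} done\n".format(hvd.rank()))

hvd_barrier()  # no rank continues until all ranks have finished writing

if hvd.rank() == 0:
    shards = ["/tmp/shard_{}.txt".format(r) for r in range(hvd.size())]
    print("all shards present:", all(os.path.exists(p) for p in shards))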
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False, sync_statistics=None): """ Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful) in the following: 1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten. 4. Support the `internal_update` option, which cover more use cases than the standard collection-based update. 5. Support the `sync_statistics` option, which is very useful in small-batch models. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. They are very similar in speed, but `internal_update=True` is recommended and can be helpful when: 1. BatchNorm is used inside dynamic control flow. The collection-based update does not support dynamic control flows. 2. BatchNorm layer is sometimes unused (e.g., when you have two networks to train alternatively). Putting all update ops into a single collection will waste a lot of compute. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 sync_statistics (str or None): one of None, "nccl", or "horovod". By default (None), it uses statistics of the input tensor to normalize. This is the standard way BatchNorm was done in most frameworks. When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. Note that on single machine this is significantly slower than the "nccl" implementation. If not None, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute global mean & variance. Therefore each GPU needs to have the same batch size. The synchronization is based on the current variable scope + the name of the layer (`BatchNorm('name', input)`). Therefore, you need to make sure that: 1. The BatchNorm layer on different GPUs needs to have the same name, so that statistics can be synchronized. If names do not match, this layer will hang. 2. Different BatchNorm layers in one tower cannot share the same name. 3. A BatchNorm layer needs to be executed for the same number of times by all GPUs. If different GPUs execute one BatchNorm layer for different number of times (e.g., if some GPUs do not execute it), this layer may hang. This option only has effect in standard training mode. This option is also known as "Cross-GPU BatchNorm" as mentioned in: `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222. When `sync_statistics` is enabled, `internal_update` will be set to True automatically. This is to avoid running `UPDATE_OPS`, which requires synchronization. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. * ``mean/EMA``: the moving average of mean. 
* ``variance/EMA``: the moving average of variance. Note: Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ # parse shapes data_format = get_data_format(data_format, keras_mode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if sync_statistics is not None: sync_statistics = sync_statistics.lower() assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics if axis is None: if ndims == 2: axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 assert axis in [1, 3], axis num_chan = shape[axis] # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_tuple() freeze_bn_backward = not training and ctx.is_training if freeze_bn_backward: assert TF_version >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!" if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. if sync_statistics is None or not (training and ctx.is_training): coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): tf_args = dict( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429 fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" use_fp16 = inputs.dtype == tf.float16 if use_fp16: # non-fused does not support fp16; fused does not support all layouts. # we made our best guess here tf_args['fused'] = True layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. # because during training, EMA isn't used if ctx.is_main_training_tower: for v in layer.non_trainable_variables: if isinstance(v, tf.Variable): tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ( [0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... 
if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': num_dev = ctx.total if num_dev == 1: logger.warn( "BatchNorm(sync_statistics='nccl') is used with only one tower!" ) else: assert six.PY2 or TF_version >= (1, 10), \ "Cross-GPU BatchNorm is only supported in TF>=1.10 ." \ "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360" if TF_version <= (1, 12): try: from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so except Exception: pass else: _validate_and_load_nccl_so() from tensorflow.contrib.nccl.ops import gen_nccl_ops else: from tensorflow.python.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd if hvd.size() == 1: logger.warn( "BatchNorm(sync_statistics='horovod') is used with only one process!" ) else: import horovod hvd_version = tuple(map(int, horovod.__version__.split('.'))) assert hvd_version >= ( 0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !" batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) if ctx.is_main_training_tower: ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
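The NCCL path keys each cross-GPU reduction on the current variable-scope name with the per-tower prefix stripped, so that the same BatchNorm layer on every GPU resolves to the same shared_name and the reductions can be matched up. A quick demonstration of that stripping (the scope names are invented examples):

import re

scopes = ["tower0/block1/bn", "tower3/block1/bn", "block1/bn"]
for s in scopes:
    print(s, "->", re.sub('tower[0-9]+/', '', s))
# tower0/block1/bn -> block1/bn
# tower3/block1/bn -> block1/bn
# block1/bn        -> block1/bn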
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False, sync_statistics=None): """ Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful) in the following: 1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten. 4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals. 5. Support the `sync_statistics` option, which is very useful in small-batch models. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. They are very similar in speed, but `internal_update=True` can be used when you have conditionals in your model, or when you have multiple networks to train. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 sync_statistics (str or None): one of None, "nccl", or "horovod". By default (None), it uses statistics of the input tensor to normalize. This is the standard way BatchNorm was done in most frameworks. When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. Note that on single machine this is significantly slower than the "nccl" implementation. This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute global mean & variance. Therefore each GPU needs to have the same batch size. This option has no effect when not training. This option is also known as "Cross-GPU BatchNorm" as mentioned in: `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. 
""" # parse shapes data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if sync_statistics is not None: sync_statistics = sync_statistics.lower() assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics if axis is None: if ndims == 2: data_format = 'NHWC' axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 else: data_format = 'NCHW' if axis == 1 else 'NHWC' num_chan = shape[axis] # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_tuple() freeze_bn_backward = not training and ctx.is_training if freeze_bn_backward: assert TF_version >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!" if ctx.is_main_training_tower: # only warn in first tower logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. if sync_statistics is None or not (training and ctx.is_training): coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable( {'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA'}): tf_args = dict( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429 fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. # because during training, EMA isn't used if ctx.is_main_training_tower: for v in layer.non_trainable_variables: if isinstance(v, tf.Variable): tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': num_dev = ctx.total if num_dev == 1: logger.warn("BatchNorm(sync_statistics='nccl') is used with only one tower!") else: assert six.PY2 or TF_version >= (1, 10), \ "Cross-GPU BatchNorm is only supported in TF>=1.10 ." 
\ "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360" from tensorflow.contrib.nccl.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd if hvd.size() == 1: logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!") else: import horovod hvd_version = tuple(map(int, horovod.__version__.split('.'))) assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !" batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization( inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization( inputs, batch_mean, batch_var, beta, gamma, epsilon) if ctx.is_main_training_tower: ret = update_bn_ema( xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
def _bn_train(self, inputs): """BN used at training time.""" # compute mean/variance over all axes except self.axis target_axes = self._get_target_axes(ndims=inputs.shape.rank) stat_axes = [ a for a in range(inputs.shape.rank) if a not in target_axes ] # compute mean and variance x = tf.cast(inputs, tf.float32) if tf.version.VERSION.startswith("2.2"): # workaround x = tf.debugging.assert_all_finite(x, "x") mean = tf.math.reduce_mean(x, axis=stat_axes) squared_mean = tf.math.reduce_mean(tf.math.square(x), axis=stat_axes) # if tf.version.VERSION.startswith("2.2"): # workaround mean = tf.debugging.assert_all_finite(mean, "mean") squared_mean = tf.debugging.assert_all_finite(squared_mean, "squared_mean") # Sync BN if tk.hvd.initialized(): import horovod.tensorflow as _hvd mean = _hvd.allreduce(mean, op=_hvd.Average) squared_mean = _hvd.allreduce(squared_mean, op=_hvd.Average) else: replica_context = tf.distribute.get_replica_context() if replica_context is not None: mean = replica_context.all_reduce(tf.distribute.ReduceOp.MEAN, mean) squared_mean = replica_context.all_reduce( tf.distribute.ReduceOp.MEAN, squared_mean) else: strategy = tf.distribute.get_strategy() mean = strategy.reduce(tf.distribute.ReduceOp.MEAN, mean, axis=None) squared_mean = strategy.reduce(tf.distribute.ReduceOp.MEAN, squared_mean, axis=None) var = squared_mean - tf.math.square(mean) if tf.version.VERSION.startswith("2.2"): # workaround mean = tf.debugging.assert_all_finite(mean, "reduced mean") var = tf.debugging.assert_all_finite(var, "reduced var") # exponential moving average: # m_new = m_old * 0.99 + x * 0.01 # m_new - m_old = (x - m_old) * 0.01 decay = 1 - self.momentum self.add_update([ self.moving_mean.assign_add( (mean - self.moving_mean) * decay, read_value=False, ), self.moving_variance.assign_add( (var - self.moving_variance) * decay, read_value=False, ), ]) # y = (x - mean) / (sqrt(var) + epsilon) * gamma + beta # = x * gamma / (sqrt(var) + epsilon) + (beta - mean * gamma / (sqrt(var) + epsilon)) # = x * a + (beta - mean * a) a = self.gamma * tf.math.rsqrt(var + self.epsilon) b = self.beta - mean * a return K.cast(x * a + b, K.dtype(inputs))
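The return statement above folds the normalization into a single affine transform y = x * a + b, as derived in the comments. A quick NumPy check (not from the source; values are illustrative scalars rather than per-channel tensors) that the folded form matches the direct normalization:

import numpy as np

x = np.random.randn(32, 8).astype(np.float32)
gamma, beta, eps = 1.5, 0.2, 1e-5
mean, var = x.mean(axis=0), x.var(axis=0)

# Direct form: (x - mean) / sqrt(var + eps) * gamma + beta
y_direct = (x - mean) / np.sqrt(var + eps) * gamma + beta

# Folded form used above: y = x * a + b with a = gamma * rsqrt(var + eps)
a = gamma / np.sqrt(var + eps)
b = beta - mean * a
y_folded = x * a + b

assert np.allclose(y_direct, y_folded, atol=1e-5)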
def main(device, input_path_train, input_path_validation, dummy_data, downsampling_fact, downsampling_mode, channels, data_format, label_id, weights, image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, model, decoder, fs_type, optimizer, batch, batchnorm, num_epochs, dtype, disable_checkpoints, disable_imsave, tracing, trace_dir, output_sampling, scale_factor, intra_threads, inter_threads): #init horovod comm_rank = 0 comm_local_rank = 0 comm_size = 1 comm_local_size = 1 if horovod: hvd.init() comm_rank = hvd.rank() comm_local_rank = hvd.local_rank() comm_size = hvd.size() #not all horovod versions have that implemented try: comm_local_size = hvd.local_size() except: comm_local_size = 1 if comm_rank == 0: print("Using distributed computation with Horovod: {} total ranks". format(comm_size, comm_rank)) #downsampling? recompute image dims image_height = image_height_orig // downsampling_fact image_width = image_width_orig // downsampling_fact #parameters per_rank_output = False loss_print_interval = 1 #session config sess_config = tf.ConfigProto( inter_op_parallelism_threads=inter_threads, #6 intra_op_parallelism_threads=intra_threads, #1 log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(comm_local_rank) sess_config.gpu_options.force_gpu_compatible = True #get data training_graph = tf.Graph() if comm_rank == 0: print("Loading data...") train_files = load_data(input_path_train, True, trn_sz, horovod) valid_files = load_data(input_path_validation, False, val_sz, horovod) #print some stats if comm_rank == 0: print("Num workers: {}".format(comm_size)) print("Local batch size: {}".format(batch)) if dtype == tf.float32: print("Precision: {}".format("FP32")) else: print("Precision: {}".format("FP16")) print("Decoder: {}".format(decoder)) print("Batch normalization: {}".format(batchnorm)) print("Channels: {}".format(channels)) print("Loss type: {}".format(loss_type)) print("Loss weights: {}".format(weights)) print("Loss scale factor: {}".format(scale_factor)) print("Output sampling target: {}".format(output_sampling)) #print optimizer parameters for k, v in optimizer.items(): print("Solver Parameters: {k}: {v}".format(k=k, v=v)) print("Num training samples: {}".format(train_files.shape[0])) print("Num validation samples: {}".format(valid_files.shape[0])) if dummy_data: print("Using synthetic dummy data") print("Disable checkpoints: {}".format(disable_checkpoints)) print("Disable image save: {}".format(disable_imsave)) #compute epochs and stuff: if fs_type == "local": num_samples = train_files.shape[0] // comm_local_size else: num_samples = train_files.shape[0] // comm_size print("num_samples: {} batch: {}".format(num_samples, batch)) num_steps_per_epoch = num_samples // batch num_steps = num_epochs * num_steps_per_epoch if comm_rank == 0: print("Number of steps per epoch: {}".format(num_steps_per_epoch)) print("Number of steps in total: {}".format(num_steps)) if per_rank_output: print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch)) with training_graph.as_default(): if dummy_data: dummy_data_args = dict(batchsize=batch, data_format=data_format, dtype=dtype) trn_dataset = create_dummy_dataset(n_samples=trn_sz, num_epochs=num_epochs, **dummy_data_args) val_dataset = create_dummy_dataset(n_samples=val_sz, num_epochs=1, **dummy_data_args) else: #create readers trn_reader = h5_input_reader(input_path_train, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, 
label_id=label_id, sample_target=output_sampling) val_reader = h5_input_reader(input_path_validation, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, label_id=label_id) #create datasets if fs_type == "local": trn_dataset = create_dataset(trn_reader, train_files, batch, num_epochs, comm_local_size, comm_local_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, valid_files, batch, 1, comm_local_size, comm_local_rank, dtype, shuffle=False) else: trn_dataset = create_dataset(trn_reader, train_files, batch, num_epochs, comm_size, comm_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, valid_files, batch, 1, comm_size, comm_rank, dtype, shuffle=False) #create iterators handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder") iterator = tf.data.Iterator.from_string_handle( handle, (dtype, tf.int32, dtype, tf.string), ((batch, len(channels), image_height_orig, image_width_orig) if data_format == "channels_first" else (batch, image_height_orig, image_width_orig, len(channels)), (batch, image_height_orig, image_width_orig), (batch, image_height_orig, image_width_orig), (batch))) next_elem = iterator.get_next() #if downsampling, do some preprocessing if downsampling_fact != 1: if downsampling_mode == "scale": #do downsampling rand_select = tf.cast(tf.one_hot(tf.random_uniform( (batch, image_height, image_width), minval=0, maxval=downsampling_fact * downsampling_fact, dtype=tf.int32), depth=downsampling_fact * downsampling_fact, axis=-1), dtype=tf.int32) next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \ tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \ [1, downsampling_fact, downsampling_fact, 1], \ [1, downsampling_fact, downsampling_fact, 1], \ [1,1,1,1], 'VALID'), rand_select), axis=-1), \ tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \ next_elem[3]) elif downsampling_mode == "center-crop": #some parameters length = 1. / float(downsampling_fact) offset = length / 2. boxes = [[offset, offset, offset + length, offset + length] ] * batch box_ind = list(range(0, batch)) crop_size = [image_height, image_width] #be careful with data order if data_format == "channels_first": next_elem[0] = tf.transpose(next_elem[0], perm=[0, 2, 3, 1]) #crop next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \ ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \ tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \ next_elem[3]) #be careful with data order if data_format == "channels_first": next_elem[0] = tf.transpose(next_elem[0], perm=[0, 3, 1, 2]) else: raise ValueError( "Error, downsampling mode {} not supported. 
Supported are [center-crop, scale]" .format(downsampling_mode)) #create init handles #trn trn_iterator = trn_dataset.make_initializable_iterator() trn_handle_string = trn_iterator.string_handle() trn_init_op = iterator.make_initializer(trn_dataset) #val val_iterator = val_dataset.make_initializable_iterator() val_handle_string = val_iterator.string_handle() val_init_op = iterator.make_initializer(val_dataset) #compute the input filter number based on number of channels used num_channels = len(channels) #set up model model = deeplab_v3_plus_generator(num_classes=3, output_stride=8, base_architecture=model, decoder=decoder, batchnorm=batchnorm, pre_trained_model=None, batch_norm_decay=None, data_format=data_format) logit, prediction = model(next_elem[0], True, dtype) #set up loss loss = None #cast the logits to fp32 logit = ensure_type(logit, tf.float32) if loss_type == "weighted": #cast weights to FP32 w_cast = ensure_type(next_elem[2], tf.float32) loss = tf.losses.sparse_softmax_cross_entropy( labels=next_elem[1], logits=logit, weights=w_cast, reduction=tf.losses.Reduction.SUM) if scale_factor != 1.0: loss *= scale_factor elif loss_type == "weighted_mean": #cast weights to FP32 w_cast = ensure_type(next_elem[2], tf.float32) loss = tf.losses.sparse_softmax_cross_entropy( labels=next_elem[1], logits=logit, weights=w_cast, reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS) if scale_factor != 1.0: loss *= scale_factor elif loss_type == "focal": #one-hot-encode labels_one_hot = tf.contrib.layers.one_hot_encoding( next_elem[1], 3) #cast to FP32 labels_one_hot = ensure_type(labels_one_hot, tf.float32) loss = focal_loss(onehot_labels=labels_one_hot, logits=logit, alpha=1., gamma=2.) else: raise ValueError("Error, loss type {} not supported.", format(loss_type)) #determine flops flops = graph_flops.graph_flops( format="NHWC" if data_format == "channels_last" else "NCHW", verbose=False, batch=batch, sess_config=sess_config) flops *= comm_size if comm_rank == 0: print('training flops: {:.3f} TF/step'.format(flops * 1e-12)) #number of trainable parameters if comm_rank == 0: num_params = get_number_of_trainable_parameters() print('number of trainable parameters: {} ({} MB)'.format( num_params, num_params * (4 if dtype == tf.float32 else 2) * (2**-20))) if horovod: loss_avg = hvd.allreduce(ensure_type(loss, tf.float32)) else: loss_avg = tf.identity(loss) tmpl = (loss if per_rank_output else loss_avg) #set up global step - keep on CPU with tf.device('/device:CPU:0'): global_step = tf.train.get_or_create_global_step() #set up optimizer if optimizer['opt_type'].startswith("LARC"): if comm_rank == 0: print("Enabling LARC") train_op, lr = get_larc_optimizer(optimizer, loss, global_step, num_steps_per_epoch, horovod) else: train_op, lr = get_optimizer(optimizer, loss, global_step, num_steps_per_epoch, horovod) #set up streaming metrics iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1], predictions=tf.argmax( prediction, axis=3), num_classes=3, weights=None, metrics_collections=None, updates_collections=None, name="iou_score") iou_reset_op = tf.variables_initializer([ i for i in tf.local_variables() if i.name.startswith('iou_score/') ]) if horovod: iou_avg = hvd.allreduce(iou_op) else: iou_avg = tf.identity(iou_op) if "gpu" in device.lower(): with tf.device(device): mem_usage_ops = [ tf.contrib.memory_stats.MaxBytesInUse(), tf.contrib.memory_stats.BytesLimit() ] #hooks #these hooks are essential. 
regularize the step hook by adding one additional step at the end #hooks = [tf.train.StopAtStepHook(last_step=3)] #hooks = [tf.train.StopAtStepHook(num_steps=3)] hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)] nvtx_callback = NVTXHook(skip_n_steps=0, name='TTTTTrain') hooks.append(nvtx_callback) #bcast init for bcasting the model after start if horovod: init_bcast = hvd.broadcast_global_variables(0) #initializers: init_op = tf.global_variables_initializer() init_local_op = tf.local_variables_initializer() #checkpointing if comm_rank == 0: checkpoint_save_freq = 5 * num_steps_per_epoch checkpoint_saver = tf.train.Saver(max_to_keep=1000) if (not disable_checkpoints): hooks.append( tf.train.CheckpointSaverHook( checkpoint_dir=checkpoint_dir, save_steps=checkpoint_save_freq, saver=checkpoint_saver)) #create image dir if not exists if not os.path.isdir(image_dir): os.makedirs(image_dir) #tracing if tracing is not None: import tracehook tracing_hook = tracehook.TraceHook(steps_to_trace=tracing, cache_traces=True, trace_dir=trace_dir) hooks.append(tracing_hook) print("############ tracing enabled") # instead of averaging losses over an entire epoch, use a moving # window average recent_losses = [] loss_window_size = 10 #start session with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess: #initialize sess.run([init_op, init_local_op]) #restore from checkpoint: if comm_rank == 0 and not disable_checkpoints: load_model(sess, checkpoint_saver, checkpoint_dir) #broadcast loaded model variables if horovod: sess.run(init_bcast) #create iterator handles trn_handle, val_handle = sess.run( [trn_handle_string, val_handle_string]) #init iterators sess.run(trn_init_op, feed_dict={handle: trn_handle}) sess.run(val_init_op, feed_dict={handle: val_handle}) # figure out what step we're on (it won't be 0 if we are # restoring from a checkpoint) so we can count from there train_steps = sess.run([global_step])[0] #do the training epoch = 1 step = 1 prev_mem_usage = 0 t_sustained_start = time.time() r_peak = 0 #warmup loops print("### Warmup for 5 steps") start_time = time.time() #while not sess.should_stop(): for _ in range(5): #try: print('warmup train_steps is {}'.format(train_steps)) if train_steps == 5: # if have_pycuda: # pyc.driver.start_profiler() print(train_steps) _ = sess.run([train_op], feed_dict={handle: trn_handle}) #tmp_loss = sess.run([(loss if per_rank_output else loss_avg)],feed_dict={handle: trn_handle}) if train_steps == 5: # if have_pycuda: # pyc.driver.stop_profiler() print(train_steps) train_steps += 1 end_time = time.time() print("### Warmup time: {:0.2f}".format(end_time - start_time)) ### Start profiling print('Begin training loop') #if have_cupy: #cupy.cuda.profiler.start() # if have_pycuda: # pyc.driver.start_profiler() #while not sess.should_stop(): for _ in range(1): try: print('train_steps is {}'.format(train_steps)) if train_steps == 5: if have_pycuda: pyc.driver.start_profiler() print(train_steps) _ = sess.run([tmpl], feed_dict={handle: trn_handle}) # _ = sess.run([train_op],feed_dict={handle: trn_handle}) if train_steps == 5: if have_pycuda: pyc.driver.stop_profiler() print(train_steps) train_steps += 1 except tf.errors.OutOfRangeError: break # if have_pycuda: # pyc.driver.stop_profiler() ### End of profiling #if have_cupy: # cupy.cuda.profiler.stop() # write any cached traces to disk if tracing is not None: tracing_hook.write_traces() print('All done')
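The profiling run above deliberately executes a few untimed warmup steps before the measured loop, so one-time costs (graph setup, data-pipeline startup) do not distort the per-step rate. A minimal stand-alone sketch of that warmup-then-measure pattern; the step callable and counts are invented stand-ins for sess.run(train_op, ...):

import time

def timed_steps(run_step, warmup=5, measure=20):
    """Run a few untimed warmup steps, then report the mean step time
    over `measure` timed steps."""
    for _ in range(warmup):
        run_step()
    t0 = time.time()
    for _ in range(measure):
        run_step()
    return (time.time() - t0) / measure

# Stand-in for a real training step; sleeps instead of running the graph.
mean_t = timed_steps(lambda: time.sleep(0.01))
print("mean step time: {:.3f} s".format(mean_t))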
def main(): script_start = time.time() hvd_init() mpi_comm = MPI.COMM_WORLD args = parse_args() if hvd.rank() == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(args), step='PARAMETER') if args.seed is not None: tf.random.set_random_seed(args.seed) np.random.seed(args.seed) cp.random.seed(args.seed) if args.amp: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1" if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \ and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1": args.fp16 = False if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '': os.makedirs(args.checkpoint_dir, exist_ok=True) final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt') # Load converted data and get statistics train_df = pd.read_pickle(args.data + '/train_ratings.pickle') test_df = pd.read_pickle(args.data + '/test_ratings.pickle') nb_users, nb_items = train_df.max() + 1 # Extract train and test feature tensors from dataframe pos_train_users = train_df.iloc[:, 0].values.astype(np.int32) pos_train_items = train_df.iloc[:, 1].values.astype(np.int32) pos_test_users = test_df.iloc[:, 0].values.astype(np.int32) pos_test_items = test_df.iloc[:, 1].values.astype(np.int32) # Negatives indicator for negatives generation neg_mat = np.ones((nb_users, nb_items), dtype=np.bool) neg_mat[pos_train_users, pos_train_items] = 0 # Get the local training/test data train_users, train_items, train_labels = get_local_train_data( pos_train_users, pos_train_items, args.negative_samples) test_users, test_items = get_local_test_data(pos_test_users, pos_test_items) # Create and run Data Generator in a separate thread data_generator = DataGenerator( args.seed, hvd.rank(), nb_users, nb_items, neg_mat, train_users, train_items, train_labels, args.batch_size // hvd.size(), args.negative_samples, test_users, test_items, args.valid_users_per_batch, args.valid_negative, ) # Create tensorflow session and saver config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if args.xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 sess = tf.Session(config=config) # Input tensors users = tf.placeholder(tf.int32, shape=(None, )) items = tf.placeholder(tf.int32, shape=(None, )) labels = tf.placeholder(tf.int32, shape=(None, )) is_dup = tf.placeholder(tf.float32, shape=(None, )) dropout = tf.placeholder_with_default(args.dropout, shape=()) # Model ops and saver hit_rate, ndcg, eval_op, train_op = ncf_model_ops( users, items, labels, is_dup, params={ 'fp16': args.fp16, 'val_batch_size': args.valid_negative + 1, 'top_k': args.topk, 'learning_rate': args.learning_rate, 'beta_1': args.beta1, 'beta_2': args.beta2, 'epsilon': args.eps, 'num_users': nb_users, 'num_items': nb_items, 'num_factors': args.factors, 'mf_reg': 0, 'layer_sizes': args.layers, 'layer_regs': [0. 
for i in args.layers], 'dropout': dropout, 'sigmoid': True, 'loss_scale': args.loss_scale }, mode='TRAIN' if args.mode == 'train' else 'EVAL') saver = tf.train.Saver() # Accuracy metric tensors hr_sum = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/total:0') hr_cnt = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/count:0') ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0') ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0') # Prepare evaluation data data_generator.prepare_eval_data() if args.load_checkpoint_path: saver.restore(sess, args.load_checkpoint_path) else: # Manual initialize weights sess.run(tf.global_variables_initializer()) # If test mode, run one eval if args.mode == 'test': sess.run(tf.local_variables_initializer()) eval_start = time.time() for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) eval_duration = time.time() - eval_start # Report results hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False)) hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False)) ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False)) ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False)) hit_rate = hit_rate_sum / hit_rate_cnt ndcg = ndcg_sum / ndcg_cnt if hvd.rank() == 0: eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration dllogger.log(step=tuple(), data={ 'eval_throughput': eval_throughput, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) return # Performance Metrics train_times = list() eval_times = list() # Accuracy Metrics first_to_target = None time_to_train = 0.0 best_hr = 0 best_epoch = 0 # Buffers for global metrics global_hr_sum = np.ones(1) global_hr_count = np.ones(1) global_ndcg_sum = np.ones(1) global_ndcg_count = np.ones(1) # Buffers for local metrics local_hr_sum = np.ones(1) local_hr_count = np.ones(1) local_ndcg_sum = np.ones(1) local_ndcg_count = np.ones(1) # Begin training begin_train = time.time() for epoch in range(args.epochs): # Train for one epoch train_start = time.time() data_generator.prepare_train_data() for user_batch, item_batch, label_batch \ in zip(data_generator.train_users_batches, data_generator.train_items_batches, data_generator.train_labels_batches): sess.run(train_op, feed_dict={ users: user_batch.get(), items: item_batch.get(), labels: label_batch.get() }) train_duration = time.time() - train_start # Only log "warm" epochs if epoch >= 1: train_times.append(train_duration) # Evaluate if epoch > args.eval_after: eval_start = time.time() sess.run(tf.local_variables_initializer()) for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) # Compute local metrics local_hr_sum[0] = sess.run(hr_sum) local_hr_count[0] = sess.run(hr_cnt) local_ndcg_sum[0] = sess.run(ndcg_sum) local_ndcg_count[0] = sess.run(ndcg_cnt) # Reduce metrics across all workers mpi_comm.Reduce(local_hr_count, global_hr_count) mpi_comm.Reduce(local_hr_sum, global_hr_sum) mpi_comm.Reduce(local_ndcg_count, global_ndcg_count) mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum) # Calculate metrics hit_rate = global_hr_sum[0] / global_hr_count[0] ndcg = global_ndcg_sum[0] / global_ndcg_count[0] eval_duration = 
time.time() - eval_start # Only log "warm" epochs if epoch >= 1: eval_times.append(eval_duration) if hvd.rank() == 0: dllogger.log(step=(epoch, ), data={ 'train_time': train_duration, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) # Update summary metrics if hit_rate > args.target and first_to_target is None: first_to_target = epoch time_to_train = time.time() - begin_train if hit_rate > best_hr: best_hr = hit_rate best_epoch = epoch time_to_best = time.time() - begin_train if hit_rate > args.target: saver.save(sess, final_checkpoint_path) # Final Summary if hvd.rank() == 0: train_times = np.array(train_times) train_throughputs = pos_train_users.shape[0] * (args.negative_samples + 1) / train_times eval_times = np.array(eval_times) eval_throughputs = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_times dllogger.log(step=tuple(), data={ 'average_train_time_per_epoch': np.mean(train_times), 'average_train_throughput': np.mean(train_throughputs), 'average_eval_time_per_epoch': np.mean(eval_times), 'average_eval_throughput': np.mean(eval_throughputs), 'first_epoch_to_hit': first_to_target, 'time_to_train': time_to_train, 'time_to_best': time_to_best, 'best_hr': best_hr, 'best_epoch': best_epoch }) dllogger.flush() sess.close() return
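The per-rank metric buffers in the epoch loop above are combined with mpi4py's Reduce, whose defaults are op=MPI.SUM and root=0, so only rank 0 ends up with the global totals. A self-contained sketch of that pattern (buffer names and values are illustrative):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_hits = np.array([3.0])           # e.g. hits counted by this rank
global_hits = np.zeros(1)              # only meaningful on the root after Reduce
comm.Reduce(local_hits, global_hits)   # defaults: op=MPI.SUM, root=0
if comm.Get_rank() == 0:
    print("total hits across ranks:", global_hits[0])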
def all_reduce_fn(tensor):
    # Average `tensor` across all Horovod ranks, casting to float16 before the
    # allreduce and back to the input dtype afterwards to cut communication volume.
    return hvd.allreduce(tensor, compression=hvd.Compression.fp16)
def hvd_group_all_reduce(ts):
    import horovod.tensorflow as hvd
    return [hvd.allreduce(t, average=False) for t in ts]
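A hedged usage sketch for the two helpers above, in graph-mode TF1 style; the tensor shapes are arbitrary and per-rank GPU pinning is omitted for brevity:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
grads = [tf.random_uniform([4, 4]) for _ in range(3)]
compressed_avg = all_reduce_fn(grads[0])      # fp16 on the wire, averaged result
grouped_sums = hvd_group_all_reduce(grads)    # one summed allreduce per tensor
with tf.Session() as sess:
    print(sess.run([compressed_avg] + grouped_sums))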
def train_once( sess, step, ops, names=None, gen_feed_dict_fn=None, deal_results_fn=None, interval_steps=100, eval_ops=None, eval_names=None, gen_eval_feed_dict_fn=None, deal_eval_results_fn=melt.print_results, valid_interval_steps=100, print_time=True, print_avg_loss=True, model_dir=None, log_dir=None, is_start=False, num_steps_per_epoch=None, metric_eval_fn=None, metric_eval_interval_steps=0, summary_excls=None, fixed_step=None, # for epoch only, incase you change batch size eval_loops=1, learning_rate=None, learning_rate_patience=None, learning_rate_decay_factor=None, num_epochs=None, model_path=None, use_horovod=False, ): use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ #is_start = False # force not to evaluate at first step #print('-----------------global_step', sess.run(tf.train.get_or_create_global_step())) timer = gezi.Timer() if print_time: if not hasattr(train_once, 'timer'): train_once.timer = Timer() train_once.eval_timer = Timer() train_once.metric_eval_timer = Timer() melt.set_global('step', step) epoch = (fixed_step or step) / num_steps_per_epoch if num_steps_per_epoch else -1 if not num_epochs: epoch_str = 'epoch:%.3f' % (epoch) if num_steps_per_epoch else '' else: epoch_str = 'epoch:%.3f/%d' % ( epoch, num_epochs) if num_steps_per_epoch else '' melt.set_global('epoch', '%.2f' % (epoch)) info = IO() stop = False if eval_names is None: if names: eval_names = ['eval/' + x for x in names] if names: names = ['train/' + x for x in names] if eval_names: eval_names = ['eval/' + x for x in eval_names] is_eval_step = is_start or valid_interval_steps and step % valid_interval_steps == 0 summary_str = [] eval_str = '' if is_eval_step: # deal with summary if log_dir: if not hasattr(train_once, 'summary_op'): #melt.print_summary_ops() if summary_excls is None: train_once.summary_op = tf.summary.merge_all() else: summary_ops = [] for op in tf.get_collection(tf.GraphKeys.SUMMARIES): for summary_excl in summary_excls: if not summary_excl in op.name: summary_ops.append(op) print('filtered summary_ops:') for op in summary_ops: print(op) train_once.summary_op = tf.summary.merge(summary_ops) #train_once.summary_train_op = tf.summary.merge_all(key=melt.MonitorKeys.TRAIN) train_once.summary_writer = tf.summary.FileWriter( log_dir, sess.graph) tf.contrib.tensorboard.plugins.projector.visualize_embeddings( train_once.summary_writer, projector_config) # if eval ops then should have bee rank 0 if eval_ops: #if deal_eval_results_fn is None and eval_names is not None: # deal_eval_results_fn = lambda x: melt.print_results(x, eval_names) for i in range(eval_loops): eval_feed_dict = {} if gen_eval_feed_dict_fn is None else gen_eval_feed_dict_fn( ) #eval_feed_dict.update(feed_dict) # if use horovod let each rant use same sess.run! if not log_dir or train_once.summary_op is None or gezi.env_has( 'EVAL_NO_SUMMARY') or use_horovod: #if not log_dir or train_once.summary_op is None: eval_results = sess.run(eval_ops, feed_dict=eval_feed_dict) else: eval_results = sess.run(eval_ops + [train_once.summary_op], feed_dict=eval_feed_dict) summary_str = eval_results[-1] eval_results = eval_results[:-1] eval_loss = gezi.get_singles(eval_results) #timer_.print() eval_stop = False if use_horovod: sess.run(hvd.allreduce(tf.constant(0))) #if not use_horovod or hvd.local_rank() == 0: # @TODO user print should also use logging as a must ? 
#print(gezi.now_time(), epoch_str, 'eval_step: %d'%step, 'eval_metrics:', end='') eval_names_ = melt.adjust_names(eval_loss, eval_names) #if not use_horovod or hvd.rank() == 0: # logging.info2('{} eval_step:{} eval_metrics:{}'.format(epoch_str, step, melt.parse_results(eval_loss, eval_names_))) eval_str = 'valid:{}'.format( melt.parse_results(eval_loss, eval_names_)) # if deal_eval_results_fn is not None: # eval_stop = deal_eval_results_fn(eval_results) assert len(eval_loss) > 0 if eval_stop is True: stop = True eval_names_ = melt.adjust_names(eval_loss, eval_names) if not use_horovod or hvd.rank() == 0: melt.set_global('eval_loss', melt.parse_results(eval_loss, eval_names_)) elif interval_steps != valid_interval_steps: #print() pass metric_evaluate = False # if metric_eval_fn is not None \ # and (is_start \ # or (num_steps_per_epoch and step % num_steps_per_epoch == 0) \ # or (metric_eval_interval_steps \ # and step % metric_eval_interval_steps == 0)): # metric_evaluate = True if metric_eval_fn is not None \ and ((is_start or metric_eval_interval_steps \ and step % metric_eval_interval_steps == 0) or model_path): metric_evaluate = True if 'EVFIRST' in os.environ: if os.environ['EVFIRST'] == '0': if is_start: metric_evaluate = False else: if is_start: metric_evaluate = True if step == 0 or 'QUICK' in os.environ: metric_evaluate = False #print('------------1step', step, 'pre metric_evaluate', metric_evaluate, hvd.rank()) if metric_evaluate: if use_horovod: print('------------metric evaluate step', step, model_path, hvd.rank()) if not model_path or 'model_path' not in inspect.getargspec( metric_eval_fn).args: metric_eval_fn_ = metric_eval_fn else: metric_eval_fn_ = lambda: metric_eval_fn(model_path=model_path) try: l = metric_eval_fn_() if isinstance(l, tuple): num_returns = len(l) if num_returns == 2: evaluate_results, evaluate_names = l evaluate_summaries = None else: assert num_returns == 3, 'retrun 1,2,3 ok 4.. 
not ok' evaluate_results, evaluate_names, evaluate_summaries = l else: #return dict evaluate_results, evaluate_names = tuple(zip(*dict.items())) evaluate_summaries = None except Exception: logging.info('Do nothing for metric eval fn with exception:\n', traceback.format_exc()) if not use_horovod or hvd.rank() == 0: #logging.info2('{} valid_step:{} {}:{}'.format(epoch_str, step, 'valid_metrics' if model_path is None else 'epoch_valid_metrics', melt.parse_results(evaluate_results, evaluate_names))) logging.info2('{} valid_step:{} {}:{}'.format( epoch_str, step, 'valid_metrics', melt.parse_results(evaluate_results, evaluate_names))) if learning_rate is not None and (learning_rate_patience and learning_rate_patience > 0): assert learning_rate_decay_factor > 0 and learning_rate_decay_factor < 1 valid_loss = evaluate_results[0] if not hasattr(train_once, 'min_valid_loss'): train_once.min_valid_loss = valid_loss train_once.deacy_steps = [] train_once.patience = 0 else: if valid_loss < train_once.min_valid_loss: train_once.min_valid_loss = valid_loss train_once.patience = 0 else: train_once.patience += 1 logging.info2('{} valid_step:{} patience:{}'.format( epoch_str, step, train_once.patience)) if learning_rate_patience and train_once.patience >= learning_rate_patience: lr_op = ops[1] lr = sess.run(lr_op) * learning_rate_decay_factor train_once.deacy_steps.append(step) logging.info2( '{} valid_step:{} learning_rate_decay by *{}, learning_rate_decay_steps={}' .format(epoch_str, step, learning_rate_decay_factor, ','.join(map(str, train_once.deacy_steps)))) sess.run(tf.assign(lr_op, tf.constant(lr, dtype=tf.float32))) train_once.patience = 0 train_once.min_valid_loss = valid_loss if ops is not None: #if deal_results_fn is None and names is not None: # deal_results_fn = lambda x: melt.print_results(x, names) feed_dict = {} if gen_feed_dict_fn is None else gen_feed_dict_fn() # NOTICE ops[2] should be scalar otherwise wrong!! loss should be scalar #print('---------------ops', ops) if eval_ops is not None or not log_dir or not hasattr( train_once, 'summary_op') or train_once.summary_op is None or use_horovod: feed_dict[K.learning_phase()] = 1 results = sess.run(ops, feed_dict=feed_dict) else: ## TODO why below ? #try: feed_dict[K.learning_phase()] = 1 results = sess.run(ops + [train_once.summary_op], feed_dict=feed_dict) summary_str = results[-1] results = results[:-1] # except Exception: # logging.info('sess.run(ops + [train_once.summary_op], feed_dict=feed_dict) fail') # results = sess.run(ops, feed_dict=feed_dict) #print('------------results', results) # #--------trace debug # if step == 210: # run_metadata = tf.RunMetadata() # results = sess.run( # ops, # feed_dict=feed_dict, # options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), # run_metadata=run_metadata) # from tensorflow.python.client import timeline # trace = timeline.Timeline(step_stats=run_metadata.step_stats) # trace_file = open('timeline.ctf.json', 'w') # trace_file.write(trace.generate_chrome_trace_format()) #reults[0] assume to be train_op, results[1] to be learning_rate learning_rate = results[1] results = results[2:] #@TODO should support aver loss and other avg evaluations like test.. 
if print_avg_loss: if not hasattr(train_once, 'avg_loss'): train_once.avg_loss = AvgScore() #assume results[0] as train_op return, results[1] as loss loss = gezi.get_singles(results) train_once.avg_loss.add(loss) steps_per_second = None instances_per_second = None hours_per_epoch = None #step += 1 #if is_start or interval_steps and step % interval_steps == 0: interval_ok = not use_horovod or hvd.local_rank() == 0 if interval_steps and step % interval_steps == 0 and interval_ok: train_average_loss = train_once.avg_loss.avg_score() if print_time: duration = timer.elapsed() duration_str = 'duration:{:.2f} '.format(duration) melt.set_global('duration', '%.2f' % duration) info.write(duration_str) elapsed = train_once.timer.elapsed() steps_per_second = interval_steps / elapsed batch_size = melt.batch_size() num_gpus = melt.num_gpus() instances_per_second = interval_steps * batch_size / elapsed gpu_info = '' if num_gpus <= 1 else ' gpus:[{}]'.format( num_gpus) if num_steps_per_epoch is None: epoch_time_info = '' else: hours_per_epoch = num_steps_per_epoch / interval_steps * elapsed / 3600 epoch_time_info = '1epoch:[{:.2f}h]'.format( hours_per_epoch) info.write( 'elapsed:[{:.2f}] batch_size:[{}]{} batches/s:[{:.2f}] insts/s:[{:.2f}] {} lr:[{:.6f}]' .format(elapsed, batch_size, gpu_info, steps_per_second, instances_per_second, epoch_time_info, learning_rate)) if print_avg_loss: #info.write('train_avg_metrics:{} '.format(melt.value_name_list_str(train_average_loss, names))) names_ = melt.adjust_names(train_average_loss, names) #info.write('train_avg_metric:{} '.format(melt.parse_results(train_average_loss, names_))) info.write(' train:{} '.format( melt.parse_results(train_average_loss, names_))) #info.write('train_avg_loss: {} '.format(train_average_loss)) info.write(eval_str) #print(gezi.now_time(), epoch_str, 'train_step:%d'%step, info.getvalue(), end=' ') logging.info2('{} {} {}'.format(epoch_str, 'step:%d' % step, info.getvalue())) if deal_results_fn is not None: stop = deal_results_fn(results) summary_strs = gezi.to_list(summary_str) if metric_evaluate: if evaluate_summaries is not None: summary_strs += evaluate_summaries if step > 1: if is_eval_step: # deal with summary if log_dir: summary = tf.Summary() if eval_ops is None: if train_once.summary_op is not None: for summary_str in summary_strs: train_once.summary_writer.add_summary( summary_str, step) else: for summary_str in summary_strs: train_once.summary_writer.add_summary( summary_str, step) suffix = 'valid' if not eval_names else '' # loss/valid melt.add_summarys(summary, eval_results, eval_names_, suffix=suffix) if ops is not None: try: # loss/train_avg melt.add_summarys(summary, train_average_loss, names_, suffix='train_avg') except Exception: pass ##optimizer has done this also melt.add_summary(summary, learning_rate, 'learning_rate') melt.add_summary(summary, melt.batch_size(), 'batch_size', prefix='other') melt.add_summary(summary, melt.epoch(), 'epoch', prefix='other') if steps_per_second: melt.add_summary(summary, steps_per_second, 'steps_per_second', prefix='perf') if instances_per_second: melt.add_summary(summary, instances_per_second, 'instances_per_second', prefix='perf') if hours_per_epoch: melt.add_summary(summary, hours_per_epoch, 'hours_per_epoch', prefix='perf') if metric_evaluate: #melt.add_summarys(summary, evaluate_results, evaluate_names, prefix='eval') prefix = 'step_eval' if model_path: prefix = 'eval' if not hasattr(train_once, 'epoch_step'): train_once.epoch_step = 1 else: train_once.epoch_step += 1 step = 
train_once.epoch_step # eval/loss eval/auc .. melt.add_summarys(summary, evaluate_results, evaluate_names, prefix=prefix) train_once.summary_writer.add_summary(summary, step) train_once.summary_writer.flush() return stop elif metric_evaluate and log_dir: summary = tf.Summary() for summary_str in summary_strs: train_once.summary_writer.add_summary(summary_str, step) #summary.ParseFromString(evaluate_summaries) summary_writer = train_once.summary_writer prefix = 'step_eval' if model_path: prefix = 'eval' if not hasattr(train_once, 'epoch_step'): ## TODO.. restart will get 1 again.. #epoch_step = tf.Variable(0, trainable=False, name='epoch_step') #epoch_step += 1 #train_once.epoch_step = sess.run(epoch_step) valid_interval_epochs = 1. try: valid_interval_epochs = FLAGS.valid_interval_epochs except Exception: pass train_once.epoch_step = 1 if melt.epoch() <= 1 else int( int(melt.epoch() * 10) / int(valid_interval_epochs * 10)) logging.info('train_once epoch start step is', train_once.epoch_step) else: #epoch_step += 1 train_once.epoch_step += 1 step = train_once.epoch_step #melt.add_summarys(summary, evaluate_results, evaluate_names, prefix='eval') melt.add_summarys(summary, evaluate_results, evaluate_names, prefix=prefix) summary_writer.add_summary(summary, step) summary_writer.flush()
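train_once keeps its decay-on-plateau state in function attributes, which is hard to follow in this flattened form; the logic it implements is roughly the sketch below (the class and attribute names are illustrative, not the original API):

import tensorflow as tf

class PlateauDecay(object):
    # Illustrative restatement of the patience/decay bookkeeping in train_once.
    def __init__(self, patience, factor):
        self.patience = patience       # evaluations to wait before decaying
        self.factor = factor           # multiplicative decay, 0 < factor < 1
        self.best = float('inf')
        self.bad_evals = 0

    def step(self, valid_loss, sess, lr_var):
        if valid_loss < self.best:
            self.best = valid_loss
            self.bad_evals = 0
            return False
        self.bad_evals += 1
        if self.bad_evals >= self.patience:
            new_lr = sess.run(lr_var) * self.factor
            sess.run(tf.assign(lr_var, new_lr))
            self.best = valid_loss
            self.bad_evals = 0
            return True
        return False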
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.horovod: hvd.init() if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) validate_flags_or_throw(bert_config) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps hvd_rank = 0 hvd_local_rank = 0 config = tf.ConfigProto() learning_rate = FLAGS.learning_rate if FLAGS.horovod: tf.logging.info("Multi-GPU training with TF Horovod") tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * hvd.size( ) * FLAGS.num_accumulation_steps learning_rate = learning_rate * hvd.size() master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() hvd_local_rank = hvd.local_rank() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key))) tf.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append( LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps)) # Prepare Training Data if FLAGS.do_train: train_examples = read_squad_examples( input_file=FLAGS.train_file, is_training=True, version_2_with_negative=FLAGS.version_2_with_negative) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # Pre-shuffle the input to avoid having to make a very large shuffle # buffer in in the `input_fn`. rng = random.Random(12345) rng.shuffle(train_examples) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.local_size()) ] num_examples_per_local_rank = len( train_examples) // hvd.local_size() remainder = len(train_examples) % hvd.local_size() if hvd.local_rank() < remainder: start_index = hvd.local_rank() * (num_examples_per_local_rank + 1) end_index = start_index + num_examples_per_local_rank + 1 else: start_index = hvd.local_rank( ) * num_examples_per_local_rank + remainder end_index = start_index + (num_examples_per_local_rank) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: # We write to a temporary file to avoid storing very large constant tensors # in memory. 
train_writer = FeatureWriter(filename=tmp_filenames[hvd_local_rank], is_training=True) convert_examples_to_features( examples=train_examples[start_index:end_index], tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=True, output_fn=train_writer.process_feature, verbose_logging=FLAGS.verbose_logging) train_writer.close() tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", end_index - start_index) tf.logging.info(" Num split examples = %d", train_writer.num_features) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) tf.logging.info(" LR = %f", learning_rate) del train_examples if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: sess.run(barrier) train_input_fn = input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.logging.info("-----------------------------") tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.logging.info("-----------------------------") if FLAGS.export_trtis and master_process: export_model(estimator, FLAGS.output_dir, FLAGS.init_checkpoint) if FLAGS.do_predict and master_process: eval_examples = read_squad_examples( input_file=FLAGS.predict_file, is_training=False, version_2_with_negative=FLAGS.version_2_with_negative) # Perform evaluation on subset, useful for profiling if FLAGS.num_eval_iterations is not None: eval_examples = eval_examples[:FLAGS.num_eval_iterations * FLAGS.predict_batch_size] eval_writer = FeatureWriter(filename=os.path.join( FLAGS.output_dir, "eval.tf_record"), is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) eval_writer.process_feature(feature) convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature, verbose_logging=FLAGS.verbose_logging) eval_writer.close() tf.logging.info("***** Running predictions *****") tf.logging.info(" Num orig examples = %d", len(eval_examples)) tf.logging.info(" Num split examples = %d", len(eval_features)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_input_fn = input_fn_builder( input_file=eval_writer.filename, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) all_results = 
[] eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] eval_start_time = time.time() for result in estimator.predict(predict_input_fn, yield_single_examples=True, hooks=eval_hooks): if len(all_results) % 1000 == 0: tf.logging.info("Processing example: %d" % (len(all_results))) unique_id = int(result["unique_ids"]) start_logits = [float(x) for x in result["start_logits"].flat] end_logits = [float(x) for x in result["end_logits"].flat] all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time time_list = eval_hooks[-1].time_list time_list.sort() num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.logging.info("-----------------------------") tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size) tf.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size) tf.logging.info("Summary Inference Statistics") tf.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.logging.info("-----------------------------") output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file)
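Before estimator.train(), the Horovod branch above runs hvd.allreduce(tf.constant(0)) in a throwaway session as a barrier: every rank must enter the collective before any rank can leave it, so no worker starts training until all of them have finished writing their train.tf_record shards. The idiom in isolation:

import tensorflow as tf
import horovod.tensorflow as hvd

def horovod_barrier(config=None):
    # The reduced value is discarded; the collective itself is the sync point.
    barrier = hvd.allreduce(tf.constant(0))
    with tf.Session(config=config) as sess:
        sess.run(barrier)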
def train(gamma, double_q, n_step_q, exp_fraction, final_eps, kp_type, colour_input, patch_sizes, lsp_layers, batch_size, num_iters, learning_starts, train_freq, kpt_encoder_type, kpt_cnn_channels, agent_size, learning_rate, max_grad_norm, mask_threshold, tau, window_size, ckpts_prefix, ckpt_load_dir, vis_load, test_every, mp_num_steps, img_size, replay_buffer_size, seed, noise_type, _run): model_init_start = time.time() process_seed = seed + hvd.local_rank() # init Gym environments train_env = make_env(mode="train", seed=process_seed) if hvd.local_rank() == 0: # eval only on 1 node (horovod) eval_env = make_env(mode="eval", seed=20 * (process_seed + 1)) n_actions = train_env.action_space.n # build models vision_model_dict = build_vision_model() agent_model_dict = build_agent_model(n_actions=n_actions, kpt_cnn_channels=kpt_cnn_channels) target_agent_model_dict = build_agent_model( n_actions=n_actions, kpt_cnn_channels=kpt_cnn_channels) # Horovod: adjust learning rate based on number of GPUs. optimizer = get_optimizer(learning_rate=learning_rate * hvd.size()) # setting up ckpts for all the modules query_ckpt, attn_ckpt, pos_enc_ckpt, node_enc_ckpt, \ scene_ckpt, kpt_enc_ckpt = None, None, None, None, None, None policy_ckpt = tf.train.Checkpoint(optimizer=optimizer, model=agent_model_dict["agent_net"]) kpt_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer, model=agent_model_dict["kpt_encoder"]) if kpt_encoder_type == "gnn": node_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer, model=agent_model_dict["node_enc"]) pos_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer, model=agent_model_dict["pos_net"]) # load pre-trained vision module vision_model_dict = load_vision_model(vision_model_dict, kp_type, colour_input, batch_size, lsp_layers, patch_sizes, ckpt_load_dir, vis_load) if hvd.local_rank() == 0: print("initializing models and env took %4.5f s" % (time.time() - model_init_start)) def train_step(inputs): # Minimize the TD error on a batch sampled from replay buffer. with tf.GradientTape() as tape: step_loss, extra = q_learning( vision_model_dict, agent_model_dict, target_agent_model_dict, inputs, batch_size, kp_type, agent_size, mask_threshold, patch_sizes, kpt_encoder_type, mp_num_steps, img_size, lsp_layers, window_size, gamma, double_q, n_step_q) w_update_start = time.time() # Horovod: add Horovod Distributed GradientTape. 
tape = hvd.DistributedGradientTape(tape) # collecting trainable params of all modules params = [] for agent_model in agent_model_dict.values(): params = params + list(agent_model.trainable_variables) # compute grads grads = tape.gradient(step_loss, params) # apply grad clipping grads, global_norm = tf.clip_by_global_norm(grads, clip_norm=max_grad_norm) # update agent optimizer.apply_gradients(zip(grads, params)) # print("grad comp + weight updates take %4.5f" % (time.time() - w_update_start)) return step_loss, extra # load weights using var assignment source_vars, target_vars = update_target_networks(agent_model_dict, target_agent_model_dict, tau) # init replay buffer data_spec = (specs.TensorSpec([84, 84, 3], tf.int32, 'obs_tm1'), specs.TensorSpec([1], tf.int32, 'a_tm1'), specs.TensorSpec([1], tf.float32, 'r_tm1'), specs.TensorSpec([2], tf.float32, 'begin_end')) # each process has it's own smaller reply_buffer replay_buffer = EpisodicReplayBuffer( capacity=int(replay_buffer_size), buffer_size=8, dataset_drop_remainder=False, data_spec=data_spec, begin_episode_fn=lambda x: bool(x[3][0, 0]), end_episode_fn=lambda x: bool(x[3][0, 1])) # create tf.Dataset object from replay_buffer and sample rb_ds = replay_buffer.as_dataset(sample_batch_size=batch_size, num_steps=window_size + n_step_q + 1) # dataset iterator sampling trajectories from replay_buffer episode_ids = replay_buffer.create_episode_ids(1) rb_ds = rb_ds.prefetch(buffer_size=AUTOTUNE) rb_iterator = iter(rb_ds) episode_rewards = [0.0] obs = train_env.reset() reset = False # lists for logging exp results eps = 0.1 episode_timestep = 0 exploration = exploration_policy(num_iters, exp_fraction, final_eps) avg_td_error = 0.0 # init lstm_agent state c_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False) h_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False) best_eval_score = -float("inf") # TRAINING LOOP for t in range(int(num_iters)): # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. 
if t == 0: hvd.broadcast_variables(source_vars, root_rank=0) hvd.broadcast_variables(target_vars, root_rank=0) hvd.broadcast_variables(optimizer.variables(), root_rank=0) online_step_start = time.time() # convert obs to float and scale to 0-1 obs_float = np.asarray(obs[None, :, :, :], dtype=np.float32) / 255.0 # sometimes add distractors if noise_type is not "none": obs_float = add_noise(obs_float[0, :, :, :], noise_type) obs_float = obs_float[None, :, :, :] # exploration update_eps = tf.constant(exploration.value(t)) # compute forward pass of input obs over vision + attention modules bottom_up_masks, encoder_features, kpt_centers = vision_forward_pass( obs_float, vision_model_dict, lsp_layers, kp_type, patch_sizes, img_size) # compute keypoint encodings bottom_up_features = encode_keypoints( bottom_up_masks, encoder_features, kpt_centers, mask_threshold, kp_type, kpt_encoder_type, mp_num_steps, q_learn=False, pos_net=agent_model_dict.get("pos_net"), node_encoder=agent_model_dict.get("node_enc"), kpt_encoder=agent_model_dict.get( "kpt_encoder")) # passes None if not available # agent step action, h_t, c_t = agent_model_dict["agent_net"].step( bottom_up_features, [h_tm1, c_tm1], update_eps, training=True, stochastic=True) # env step new_obs, rew, done, _ = train_env.step(action) episode_timestep = episode_timestep + 1 episode_rewards[-1] += rew # store transitions in replay buffer store_exp_start = time.time() # making data_tuple compatible for add_batch() method obs = img_as_ubyte(np.array(obs_float[0, :, :, :], dtype=float)) action = np.array(action, dtype=np.int32) rew = np.array(rew, ndmin=1, dtype=np.float32) end = np.array(done, ndmin=1, dtype=np.float32) begin = np.array(reset, ndmin=1, dtype=np.float32) begin_end = np.concatenate((begin, end), axis=0) # converting from values = (obs, action, rew, begin_end) values_batched = tf.nest.map_structure(lambda b: tf.stack([b]), values) # add batch of transitions of episode_ids to replay_buffer episode_ids = replay_buffer.add_batch(values_batched, episode_ids) obs = new_obs h_tm1 = h_t c_tm1 = c_t reset = False # episode termination if done: # saving cummulative returns at end of episode print("Episode Return: %3.3f" % (episode_rewards[-1])) print(episode_ids.numpy(), update_eps.numpy()) obs = train_env.reset() episode_timestep = 0 # reset lstm state at episode end c_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False) h_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False) episode_rewards.append(0.0) reset = True # Q_LEARNING UPDATES BEGIN if t > learning_starts and t % train_freq == 0: batch_q_start = time.time() # sample a batch of trajectories from replay_buffer for recurrent-dqn inputs = next(rb_iterator) step_loss, extra = train_step(inputs) step_loss = hvd.allreduce(step_loss) # soft-update target networks update_start = time.time() source_vars, target_vars = update_target_networks( agent_model_dict, target_agent_model_dict, tau) # print("Target network updates take %4.5f" % (time.time() - update_start)) td_error = tf.reduce_mean(hvd.allreduce(extra.td_error), axis=0) if hvd.local_rank() == 0: print( "Iteration: %5d Step loss: %4.4f, TD_error: %3.4f took %4.5f s" % (t, step_loss, td_error, time.time() - batch_q_start)) # logging step losses to sacred add_sacred_log("train.t", int((t - learning_starts) / train_freq), _run) add_sacred_log("train.step_loss", float(step_loss), _run) add_sacred_log("train.step_td_error", float(td_error), _run) avg_td_error = avg_td_error + np.abs(td_error) # VALIDATION/CKPT if t > 
learning_starts and t % test_every == 0: # trigger evaluation run on only 1 node if hvd.local_rank() == 0: eval_start = time.time() mean_ep_rew, var_ep_rew, _, _ = eval_step( eval_env, vision_model_dict, agent_model_dict) avg_td_error = avg_td_error / float( (t - learning_starts) / train_freq) print( "Evaluation after: %5d steps avg_ep_return: %4.5f running_avg_td_error: %4.5f took %4.5f s" % (t, mean_ep_rew, avg_td_error, time.time() - eval_start)) # logging avg. episodic rewards to sacred add_sacred_log("test.t", int( (t - learning_starts) / train_freq), _run) add_sacred_log("test.mean_ep_return", float(mean_ep_rew), _run) add_sacred_log("test.var_ep_return", float(var_ep_rew), _run) add_sacred_log("test.avg_td_error", float(avg_td_error), _run) avg_td_error = 0.0 # ckpt model based on eval-run scores if mean_ep_rew > 0.95 * best_eval_score: best_eval_score = mean_ep_rew # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting it. policy_ckpt.save(ckpts_prefix + '_agent_net') kpt_enc_ckpt.save(ckpts_prefix + '_kpt_encoder') if kpt_encoder_type == "gnn": node_enc_ckpt.save(ckpts_prefix + '_node_enc') pos_enc_ckpt.save(ckpts_prefix + '_pos_net') if hvd.local_rank() == 0: print("Training complete!!!")
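update_target_networks() is not shown in this excerpt; given the tau argument, a soft (Polyak) update along the following lines would be the usual choice, but this is an assumption rather than the original implementation:

def soft_update(source_vars, target_vars, tau):
    # Assumed behaviour: move each target weight a fraction tau towards its
    # online counterpart (eager-mode tf.Variable assignment).
    for src, tgt in zip(source_vars, target_vars):
        tgt.assign(tau * src + (1.0 - tau) * tgt)
    return source_vars, target_vars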
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False, sync_statistics=None): """ Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful) in the following: 1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten. 4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals. 5. Support the `sync_statistics` option, which is very useful in small-batch models. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. They are very similar in speed, but `internal_update=True` can be used when you have conditionals in your model, or when you have multiple networks to train. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 sync_statistics (str or None): one of None "nccl", or "horovod". By default (None), it uses statistics of the input tensor to normalize. This is the standard way BatchNorm was done in most frameworks. When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. Note that on single machine this is significantly slower than the "nccl" implementation. This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute global mean & variance. Therefore each GPU needs to have the same batch size. This option has no effect when not training. This option is also known as "Cross-GPU BatchNorm" as mentioned in: `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. 
""" # parse shapes data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if sync_statistics is not None: sync_statistics = sync_statistics.lower() assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics if axis is None: if ndims == 2: data_format = 'NHWC' axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 else: data_format = 'NCHW' if axis == 1 else 'NHWC' num_chan = shape[axis] # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_tuple() if not training and ctx.is_training: assert TF_version >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. if sync_statistics is None or not (training and ctx.is_training): coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): tf_args = dict(axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, fused=(ndims == 4 and axis in [1, 3]), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. # because during training, EMA isn't used if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ( [0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': if six.PY3 and TF_version <= (1, 9) and ctx.is_main_training_tower: logger.warn( "A TensorFlow bug will cause cross-GPU BatchNorm to fail. " "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360" ) from tensorflow.contrib.nccl.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) num_dev = ctx.total if num_dev == 1: logger.warn( "BatchNorm(sync_statistics='nccl') is used with only one tower!" 
) else: batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) if ctx.is_main_training_tower: ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
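Distilled from the "horovod" branch above: each worker computes E[x] and E[x^2] over its own batch, averages both across ranks with allreduce, and recovers the global variance as E[x^2] - E[x]^2. This only yields the true global moments when every worker sees the same batch size, as the docstring notes. A minimal sketch:

import tensorflow as tf
import horovod.tensorflow as hvd

def cross_replica_moments(x, red_axis):
    # Per-worker moments, then averaged across all Horovod ranks.
    local_mean = tf.reduce_mean(x, axis=red_axis)
    local_mean_sq = tf.reduce_mean(tf.square(x), axis=red_axis)
    global_mean = hvd.allreduce(local_mean, average=True)
    global_mean_sq = hvd.allreduce(local_mean_sq, average=True)
    global_var = global_mean_sq - tf.square(global_mean)
    return global_mean, global_var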
def set_model(self, model): self.model = model self.graph = tf.Graph() with self.graph.as_default(): #Horovod added: Normal workflow config1 = tf.ConfigProto(log_device_placement=False) config1.gpu_options.allow_growth = True config1.gpu_options.visible_device_list = str(hvd.local_rank()) self.sess = tf.Session(config=config1) #Horovod end with self.sess.as_default(): initializer = tf.contrib.layers.xavier_initializer( uniform=True) with tf.variable_scope("model", reuse=None, initializer=initializer): self.trainModel = self.model(config=self) #Horovod added: Vary the learning rate, dist optimizer if self.optimizer != None: pass elif self.opt_method == "Adagrad" or self.opt_method == "adagrad": self.optimizer = tf.train.AdagradOptimizer( learning_rate=self.alpha * hvd.size(), initial_accumulator_value=1e-20) elif self.opt_method == "Adadelta" or self.opt_method == "adadelta": self.optimizer = tf.train.AdadeltaOptimizer( self.alpha * hvd.size()) elif self.opt_method == "Adam" or self.opt_method == "adam": self.optimizer = tf.train.AdamOptimizer(self.alpha * hvd.size()) else: self.optimizer = tf.train.GradientDescentOptimizer( self.alpha * hvd.size() * self.sync_after) ################################################################ # Fetch a list of our network's trainable parameters. self.trainable_vars = tf.trainable_variables() #print("Shape of trainable vars: {}".format(np.array(self.trainable_vars))) # Create variables to store accumulated gradients self.accumulators = [ tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in self.trainable_vars ] #print("Shape of accumulators: {}".format(np.array(self.accumulators))) # Create a variable for counting the number of accumulations self.accumulation_counter = tf.Variable(0.0, trainable=False) # Compute gradients; grad_pairs contains (gradient, variable) pairs self.grad_pairs = self.optimizer.compute_gradients( self.trainModel.loss, self.trainable_vars) # print("Shape of grad_pairs: {}".format(np.array(self.grad_pairs))) # for g, v in self.grad_pairs: # print("Shape of grad: {}".format(np.array(g))) # Create operations which add a variable's gradient to its accumulator. self.accumulate_ops = [ accumulator.assign_add(grad) for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs ) #if grad is not None ] # The final accumulation operation is to increment the counter self.accumulate_ops.append( self.accumulation_counter.assign_add(1.0)) # Update trainable variables by applying the accumulated gradients # divided by the counter. Note: apply_gradients takes in a list of # (grad, var) pairs # self.apply_step = self.optimizer.apply_gradients( # [(accumulator / self.accumulation_counter, var) \ # for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs)] # ) # Accumulators must be zeroed once the accumulated gradient is applied. 
self.zero_ops = [ accumulator.assign(tf.zeros_like(tv)) for (accumulator, tv) in zip(self.accumulators, self.trainable_vars) ] # Add one last op for zeroing the counter self.zero_ops.append(self.accumulation_counter.assign(0.0)) ################################################################/////////// # self.dist_optimizer = hvd.DistributedOptimizer(self.optimizer) # self.train_op = self.dist_optimizer.minimize(self.trainModel.loss) #Horovod end self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) if (hvd.rank() == 0): self.saver = tf.train.Saver() # self.logSummary = tf.summary.scalar('Train_loss', self.trainModel.loss) # self.train_writer = tf.summary.FileWriter('./train', self.sess.graph) self.sess.run(tf.global_variables_initializer()) #Horovod added: Normal workflow self.sess.run(hvd.broadcast_global_variables(0))
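A distilled, hedged sketch of how the accumulation ops built in set_model are meant to be driven each outer step (the actual driver is train_step further below; the helper name and the trainer/feeds arguments are illustrative):

import horovod.tensorflow as hvd

def run_accumulated_step(sess, trainer, feeds):
    sess.run(trainer.zero_ops)                    # clear accumulators and counter
    for feed_dict in feeds:                       # sync_after local mini-batches
        sess.run(trainer.accumulate_ops, feed_dict=feed_dict)
    averaged = [(hvd.allreduce(acc / trainer.accumulation_counter), var)
                for acc, (_, var) in zip(trainer.accumulators, trainer.grad_pairs)]
    sess.run(trainer.optimizer.apply_gradients(averaged))

Note that building hvd.allreduce and apply_gradients inside the step, as both this sketch and the original train_step do, adds new ops to the graph on every call; constructing them once outside the loop would be the more idiomatic TF1 pattern.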
def allreduce(backend, value, name, average):
    """Allreduce a tensor-compatible value across ranks and evaluate the result
    with the given backend."""
    return _eval(backend,
                 hvd.allreduce(tf.constant(value, name=name), average=average))
def train_step(self, batch_h, batch_t, batch_r, batch_y, counter): self.sess.run(self.zero_ops) allreduce_loss = 0.0 for i in range(self.sync_after): feed_dict = { self.trainModel.batch_h: np.append( batch_h[i * self.batch_size:(i + 1) * self.batch_size], batch_h[self.allreduce_batch_size + i * self.batch_size:self.allreduce_batch_size + (i + 1) * self.batch_size]), self.trainModel.batch_t: np.append( batch_t[i * self.batch_size:(i + 1) * self.batch_size], batch_t[self.allreduce_batch_size + i * self.batch_size:self.allreduce_batch_size + (i + 1) * self.batch_size]), self.trainModel.batch_r: np.append( batch_r[i * self.batch_size:(i + 1) * self.batch_size], batch_r[self.allreduce_batch_size + i * self.batch_size:self.allreduce_batch_size + (i + 1) * self.batch_size]), self.trainModel.batch_y: np.append( batch_y[i * self.batch_size:(i + 1) * self.batch_size], batch_y[self.allreduce_batch_size + i * self.batch_size:self.allreduce_batch_size + (i + 1) * self.batch_size]) } _, c = self.sess.run([self.accumulate_ops, self.trainModel.loss], feed_dict=feed_dict) allreduce_loss += c self.track_loss.append((counter, allreduce_loss)) self.sess.run(self.barrier) st1 = time.time() if hvd.size() > 1: averaged_gradients = [] with tf.name_scope("Allreduce"): for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs): #if tf.equal(accumulator, 0) if accumulator is not None: avg_grad = hvd.allreduce(accumulator / self.accumulation_counter) averaged_gradients.append((avg_grad, var)) else: averaged_gradients.append((None, var)) else: averaged_gradients = [] with tf.name_scope("Allreduce"): for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs): #print("Shape of accumulator: {}".format(np.array(accumulator))) if accumulator is not None: avg_grad = accumulator / self.accumulation_counter averaged_gradients.append((avg_grad, var)) else: averaged_gradients.append((None, var)) if (counter % 200 == 0): print('Averaging gradients for 200 batches took: {} secs'.format( time.time() - st1)) st2 = time.time() self.sess.run(self.optimizer.apply_gradients(averaged_gradients)) if (counter % 200 == 0): print('Applying gradients for 200 batches took: {} secs'.format( time.time() - st2)) return allreduce_loss