def train_step(inputs, labels, first_batch, epoch):
    """Run one optimization step for the multi-box detection model.

    Relies on module-level `model`, `optimizer`, `multi_box_loss`, `cfg`
    and (when cfg['distributed']) `hvd`.

    Args:
        inputs: batch of model inputs.
        labels: ground-truth targets consumed by `multi_box_loss`.
        first_batch: truthy on the first batch; gates the Horovod broadcast.
        epoch: current epoch; note it is used for truthiness below.

    Returns:
        (total_loss, losses): scalar total loss and the per-component loss
        dict with keys 'reg', 'loc', 'landm', 'class'.
    """
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)

        losses = {}
        # Regularization losses registered on the model.
        losses['reg'] = tf.reduce_sum(model.losses)
        losses['loc'], losses['landm'], losses['class'] = \
            multi_box_loss(labels, predictions)
        total_loss = tf.add_n([l for l in losses.values()])

    if cfg['distributed']:
        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # NOTE(review): `epoch` is tested for truthiness, so this broadcast is
    # skipped whenever epoch == 0 — confirm the caller passes a flag (or a
    # nonzero epoch) here, otherwise workers may never be synchronized.
    if cfg['distributed'] and first_batch and epoch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return total_loss, losses
def training_step(images, labels, first_batch):
    """Single MNIST training step with Horovod gradient averaging.

    Uses the module-level `mnist_model`, `loss`, `opt`, and `hvd`.
    Returns the batch loss.
    """
    with tf.GradientTape() as tape:
        predictions = mnist_model(images, training=True)
        loss_value = loss(labels, predictions)

    # Average gradients across workers via Horovod's tape wrapper.
    tape = hvd.DistributedGradientTape(tape)

    gradients = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(gradients, mnist_model.trainable_variables))

    # Broadcast rank 0's state once so all workers start identical, whether
    # weights were random or restored from a checkpoint. Done after the
    # first apply_gradients so the optimizer slot variables already exist.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value
def train_step(data, model, loss_fn, optimizer, first_batch, compress=True):
    """Run a single Horovod-distributed optimization step.

    Args:
        data: a (inputs, labels) pair.
        model: the Keras model to train.
        loss_fn: callable taking (labels, predictions).
        optimizer: optimizer applying the averaged gradients.
        first_batch: True on the very first step; triggers the one-time
            rank-0 variable broadcast.
        compress: when True, allreduce tensors are fp16-compressed.

    Returns:
        (loss, predictions) for this batch.
    """
    inputs, labels = data
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        step_loss = loss_fn(labels, predictions)

    # Pick the wire compression used during the gradient allreduce.
    if compress:
        algo = hvd.Compression.fp16
    else:
        algo = hvd.Compression.none

    # Horovod: average gradients across workers.
    tape = hvd.DistributedGradientTape(tape, compression=algo)
    gradients = tape.gradient(step_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Broadcast rank 0's variables after the first gradient step (so the
    # optimizer slots exist) to keep every worker consistent.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return step_loss, predictions
def _train_step(inputs, labels, first_batch):
    """One training step: SOK model-parallel embeddings plus Horovod
    data-parallel dense layers.

    Uses module-level `model`, `_replica_loss`, `sok`, `hvd`, `args`,
    `embedding_optimizer`, and `dense_optimizer`.
    """
    with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
        logit = model(inputs, training=True)
        replica_loss = _replica_loss(labels, logit)

    # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
    tape = hvd.DistributedGradientTape(tape)
    # There is no need to wrap the emb_tape because the communication is done by sok
    # emb_tape = hvd.DistributedGradientTape(emb_tape)

    emb_variable, other_variable = sok.split_embedding_variable_from_others(
        model.trainable_variables)
    # type(tape) here is hvd.DistributedGradientTape (wrapped above);
    # type(emb_tape) is still a plain tf.GradientTape.
    emb_grads = emb_tape.gradient(replica_loss, emb_variable)
    grads = tape.gradient(replica_loss, other_variable)

    if "plugin" not in args.optimizer:
        # Non-plugin optimizers need sok's OptimizerScope to update the
        # embedding variables in place.
        with sok.OptimizerScope(emb_variable):
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
    else:
        embedding_optimizer.apply_gradients(
            zip(emb_grads, emb_variable),
            experimental_aggregate_gradients=False)
    dense_optimizer.apply_gradients(zip(grads, other_variable))

    # Note: broadcast should be done after the first gradient step to ensure
    # the optimizer has been initialized. There is no need to broadcast
    # emb_variable and embedding_optimizer, because sok's parallel mode is
    # model parallel and the communication is done by sok itself.
    if first_batch:
        hvd.broadcast_variables(other_variable, root_rank=0)
        hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)
    return replica_loss
def train_step(model, opt, loss_func, images, labels, first_batch, fp32=False):
    """One Horovod training step with optional loss scaling.

    When `fp32` is False the optimizer is assumed to provide
    get_scaled_loss / get_unscaled_gradients (a loss-scale optimizer):
    the loss is scaled inside the tape and gradients unscaled afterwards.

    Returns:
        (loss_value, top_1_accuracy): unscaled loss and the count of
        top-1-correct predictions in the batch.
    """
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss_func(labels, probs)
        loss_value += tf.add_n(model.losses)
        if not fp32:
            # Scale inside the tape so the scaling op is recorded.
            scaled_loss_value = opt.get_scaled_loss(loss_value)

    # fp16-compressed allreduce of gradients across workers.
    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)

    if fp32:
        grads = tape.gradient(loss_value, model.trainable_variables)
    else:
        scaled_grads = tape.gradient(scaled_loss_value, model.trainable_variables)
        grads = opt.get_unscaled_gradients(scaled_grads)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    # One-time sync so every worker starts from rank 0's state.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    # Top-1 bookkeeping: number of correct predictions in this batch.
    top_1_pred = tf.squeeze(tf.math.top_k(probs, k=1)[1])
    sparse_labels = tf.cast(tf.math.argmax(labels, axis=1), tf.int32)
    top_1_accuracy = tf.math.reduce_sum(
        tf.cast(tf.equal(top_1_pred, sparse_labels), tf.int32))
    return loss_value, top_1_accuracy
def train_step(self, data):
    """Perform a single training step.

    Args:
        data: (x, beta) pair; `x` is the state batch, `beta` the inverse
            coupling passed through to the model call.

    Returns:
        (states.out.x, metrics): the proposed output states and an
        AttrDict of step metrics.
    """
    x, beta = data
    start = time.time()
    with tf.GradientTape() as tape:
        states, data = self((x, beta), training=True)
        # states, accept_prob, sumlogdet = self((x, beta), training=True)
        loss = self.calc_losses(states, data.accept_prob)

        # Optional auxiliary loss on a fresh Gaussian sample.
        if self.aux_weight > 0:
            z = tf.random.normal(x.shape, dtype=x.dtype)
            states_, accept_prob_, _ = self((z, beta), training=True)
            loss_ = self.calc_losses(states_, accept_prob_)
            loss += loss_

    if NUM_RANKS > 1 and HAS_HOROVOD:
        # Horovod: average gradients across ranks.
        tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    metrics = AttrDict({
        'dt': time.time() - start,
        'loss': loss,
        'accept_prob': data.accept_prob,
        # 'eps': self.eps,
        'beta': states.init.beta,
        'sumlogdet': data.sumlogdet,
        # 'sumlogdet': data.sumlogdet.out,
    })

    # NOTE(review): this check runs AFTER apply_gradients, which normally
    # increments optimizer.iterations — so `iterations == 0` may never
    # hold here and the broadcast may never fire; confirm.
    # if self.optimizer.iterations == 0 and NUM_RANKS > 1 and HAS_HOROVOD:
    if HAS_HOROVOD and NUM_RANKS > 1 and self.optimizer.iterations == 0:
        hvd.broadcast_variables(self.variables, root_rank=0)
        hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

    return states.out.x, metrics
def train_one_step(config, model, optimizer, features, init=False, clip_norm=1.0):
    """Single optimization step with optional AMP loss scaling and
    global-norm gradient clipping.

    Args:
        config: object with an `amp` flag enabling mixed precision.
        model: callable returning (total_loss, eval_fn_inputs).
        optimizer: optimizer (loss-scale optimizer when config.amp).
        features: batch of model inputs.
        init: when True, broadcast rank 0's variables after the step.
        clip_norm: global-norm clipping threshold.

    Returns:
        (unscaled_loss, eval_fn_inputs); the loss is detached from the
        graph via stop_gradient.
    """
    with tf.GradientTape() as tape:
        total_loss, eval_fn_inputs = model(features, is_training=True)
        # Keep an unscaled copy for reporting before AMP scaling.
        unscaled_loss = tf.stop_gradient(total_loss)
        if config.amp:
            total_loss = optimizer.get_scaled_loss(total_loss)

    # Allreduce gradients; sparse gradients are densified for the allreduce.
    tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)
    grads = tape.gradient(total_loss, model.trainable_variables)
    if config.amp:
        grads = optimizer.get_unscaled_gradients(grads)
    grads, _ = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # First-step sync of weights and optimizer slots from rank 0.
    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return unscaled_loss, eval_fn_inputs
def _train_step(inputs, labels, first_batch):
    """One training step: SOK model-parallel embeddings + Horovod
    data-parallel dense layers, with optional mixed-precision scaling.

    Uses module-level `model`, `_replica_loss`, `sok`, `hvd`, `args`,
    `embedding_optimizer`, and `dense_optimizer`.
    """
    with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
        logit = model(inputs, training=True)
        replica_loss = _replica_loss(labels, logit)
        if args.mixed_precision:
            # Scale the loss so fp16 gradients don't underflow.
            _loss = embedding_optimizer.get_scaled_loss(replica_loss)
        else:
            _loss = replica_loss

    # Only the dense-side tape is wrapped: sok handles the embedding
    # communication itself (model parallel), so emb_tape stays plain.
    tape = hvd.DistributedGradientTape(tape)

    emb_variable, other_variable = sok.split_embedding_variable_from_others(
        model.trainable_variables)
    emb_grads = emb_tape.gradient(_loss, emb_variable)
    grads = tape.gradient(_loss, other_variable)
    if args.mixed_precision:
        # NOTE(review): both gradient sets are unscaled through
        # embedding_optimizer — this assumes it carries the loss scale used
        # above; confirm the dense side shouldn't use dense_optimizer.
        emb_grads = embedding_optimizer.get_unscaled_gradients(emb_grads)
        grads = embedding_optimizer.get_unscaled_gradients(grads)

    if 'plugin' not in args.optimizer:
        # Non-plugin optimizers need sok's OptimizerScope for in-place
        # embedding updates.
        with sok.OptimizerScope(emb_variable):
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
    else:
        embedding_optimizer.apply_gradients(
            zip(emb_grads, emb_variable),
            experimental_aggregate_gradients=False)
    dense_optimizer.apply_gradients(zip(grads, other_variable))

    # Note: broadcast should be done after the first gradient step to ensure
    # optimizer initialization.
    if first_batch:
        hvd.broadcast_variables(other_variable, root_rank=0)
        hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)
    return replica_loss
def train_one_step(model, opt, x, y, step, EPOCH):
    """One training step for the segmentation model.

    Uses module-level `compute_loss`, `compression`, `compute_miou`,
    `compute_accuracy`, and `hvd`. Returns the scalar loss; the mIoU and
    accuracy metrics are updated as a side effect.
    """
    with tf.GradientTape() as tape:
        logits = model(x)
        loss = compute_loss(y, logits)

    # Horovod: add Horovod Distributed GradientTape.
    # Allreduce is pinned to the CPU for both sparse and dense tensors.
    tape = hvd.DistributedGradientTape(tape,
                                       device_sparse='/cpu:0',
                                       device_dense='/cpu:0',
                                       compression=compression)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    # NOTE(review): `step + EPOCH == 0` only holds when both are zero
    # (assuming non-negative values), i.e. the first batch of the first
    # epoch — confirm this is the intended broadcast trigger.
    if step + EPOCH == 0:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    # Update running metrics with this batch's hard predictions.
    pred = tf.argmax(logits, axis=-1)
    compute_miou(y, pred)
    compute_accuracy(y, pred)
    return loss
def train_step(inputs, first_batch):
    """One training step with manual fp16 loss scaling.

    Uses module-level `model`, `loss_func`, `precision`, `loss_scale`,
    `opt`, `train_top1`, `train_top5`, and `hvd`. Returns the loss value
    from before scaling was applied.
    """
    images, labels = inputs
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_func(labels, predictions)
        loss += tf.reduce_sum(model.losses)
        # Keep the pre-scaling value to report to the caller.
        loss_copy = loss
        # Scale the losses
        if precision == 'fp16':
            loss = loss * tf.cast(loss_scale, loss.dtype)

    tape = hvd.DistributedGradientTape(tape)
    old_grads = tape.gradient(loss, model.trainable_variables)
    # Unscale the grads (None gradients are passed through untouched)
    if precision == 'fp16':
        loss_scale_reciprocal = 1. / loss_scale
        grads = [
            g * tf.cast(loss_scale_reciprocal, g.dtype) if g is not None else None
            for g in old_grads
        ]
    else:
        grads = old_grads
    opt.apply_gradients(zip(grads, model.trainable_variables))
    train_top1.update_state(labels, predictions)
    train_top5.update_state(labels, predictions)

    # One-time broadcast from rank 0, only when running multi-worker.
    if hvd.size() > 1 and first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss_copy
def benchmark_step(first_batch):
    """One synthetic benchmark step.

    Uses module-level `args`, `model`, `data`, `target`, `opt`, and `hvd`;
    returns nothing (throughput is measured by the caller).
    """
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
def distributed_train_step(self, example: tf.train.Example) -> dict:
    """One Horovod-distributed training step; returns current metric results.

    NOTE(review): unlike the other step functions in this codebase there is
    no hvd.broadcast_variables() call here — confirm that initial variable
    sync is handled elsewhere (e.g. a broadcast callback).
    """
    # Unpack data
    image, label = example["image"], example["label"]

    with tf.GradientTape() as tape:
        # NOTE: the tape is wrapped before the forward pass; recording is
        # still done by the underlying tape entered by the `with` above.
        tape = hvd.DistributedGradientTape(tape)
        # Calculate prediction
        pred = self(image)
        # Calculate loss
        loss = self.loss(label, pred)

    # Compute gradients
    gradients = tape.gradient(loss, self.trainable_variables)
    # Update weights
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    # Update metrics
    self.train_loss_metric(loss)
    self.train_top_1_metric(label, pred)
    self.train_top_5_metric(label, pred)
    return {
        "loss": self.train_loss_metric.result(),
        "accuracy": self.train_top_1_metric.result(),
        "top 5": self.train_top_5_metric.result()
    }
def training_step(images, labels, first_batch):
    """One MNIST training step with optional AMP loss scaling and Horovod
    gradient compression.

    Uses module-level `args`, `mnist_model`, `loss`, `opt`, and `hvd`.

    Args:
        images: batch of input images.
        labels: ground-truth labels for `loss`.
        first_batch: True on the very first batch; triggers the one-time
            rank-0 variable broadcast.

    Returns:
        The loss value for this batch (scaled when AMP is enabled, as in
        the original code).
    """
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        # Fix: removed leftover debug code (`import sys` plus per-step
        # printing of the full labels/probs tensors to stderr), which
        # spammed stderr on every training step.
        loss_value = loss(labels, probs)
        if args.use_amp:
            # Scale inside the tape so unscaling the gradients below is
            # well-defined.
            loss_value = opt.get_scaled_loss(loss_value)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    if args.use_amp:
        grads = opt.get_unscaled_gradients(grads)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes. This is necessary to ensure consistent initialization of
    # all workers when training is started with random weights or restored
    # from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure
    # optimizer initialization.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value
def main():
    ''' simple starter program for tensorflow models. '''
    # ---- command line ----
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c','--config',dest='config_filename',help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,default=DEFAULT_CONFIG)
    parser.add_argument('--interop',type=int,help='set Tensorflow "inter_op_parallelism_threads" session config varaible [default: %s]' % DEFAULT_INTEROP,default=DEFAULT_INTEROP)
    parser.add_argument('--intraop',type=int,help='set Tensorflow "intra_op_parallelism_threads" session config varaible [default: %s]' % DEFAULT_INTRAOP,default=DEFAULT_INTRAOP)
    parser.add_argument('-l','--logdir',default=DEFAULT_LOGDIR,help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)
    parser.add_argument('--horovod', default=False, action='store_true', help="Use MPI with horovod")
    parser.add_argument('--profiler',default=False, action='store_true', help='Use TF profiler, needs CUPTI in LD_LIBRARY_PATH for Cuda')
    parser.add_argument('--profrank',default=0,type=int,help='set which rank to profile')
    parser.add_argument('--batch-term',dest='batch_term',type=int,help='if set, terminates training after the specified number of batches',default=0)
    parser.add_argument('--evaluate',help='evaluate a pre-trained model file on the test data set only.')
    parser.add_argument('--train-more',dest='train_more',help='load a pre-trained model file and continue training.')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true', help="Set Logger to DEBUG")
    parser.add_argument('--error', dest='error', default=False, action='store_true', help="Set Logger to ERROR")
    parser.add_argument('--warning', dest='warning', default=False, action='store_true', help="Set Logger to ERROR")
    parser.add_argument('--logfilename',dest='logfilename',default=None,help='if set, logging information will go to file')
    args = parser.parse_args()

    # ---- optional Horovod initialization ----
    hvd = None
    rank = 0
    nranks = 1
    logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    logging_level = logging.INFO

    if args.horovod:
        print('importing horovod')
        sys.stdout.flush()
        sys.stderr.flush()
        import horovod
        import horovod.tensorflow as hvd
        hvd.init()
        # Embed the zero-padded rank in the log format so interleaved
        # multi-rank output stays attributable.
        logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
            '%05d' % hvd.rank()) + ':%(name)s:%(message)s'
        rank = hvd.rank()
        nranks = hvd.size()
        # Quiet the non-root ranks.
        if rank > 0:
            logging_level = logging.WARNING

    # Setup Logging (flags are mutually exclusive by construction here)
    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    if hvd:
        logging.warning('host: %s rank: %5d size: %5d local rank: %5d local size: %5d',
                        socket.gethostname(), hvd.rank(), hvd.size(),
                        hvd.local_rank(), hvd.local_size())

    # ---- TF threading / GPU setup ----
    tf.config.threading.set_inter_op_parallelism_threads(args.interop)
    tf.config.threading.set_intra_op_parallelism_threads(args.intraop)

    # Setup GPUs
    gpus = tf.config.list_physical_devices('GPU')
    logger.info('number of gpus: %s', len(gpus))
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    # Pin one GPU per local rank when running under Horovod.
    if hvd and len(gpus) > 0:
        tf.config.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU')

    logging.info('using tensorflow version: %s (%s)', tf.__version__, tf.__git_version__)
    logging.info('using tensorflow from: %s', tf.__file__)
    if hvd:
        logging.info('using horovod version: %s', horovod.__version__)
        logging.info('using horovod from: %s', horovod.__file__)
    logging.info('logdir: %s', args.logdir)
    logging.info('interop: %s', args.interop)
    logging.info('intraop: %s', args.intraop)

    # this must be created after the config settings
    gtape = tf.GradientTape()
    if args.horovod:
        gtape = hvd.DistributedGradientTape(gtape)

    # ---- config file; CLI values override / augment the JSON ----
    config = json.load(open(args.config_filename))
    # config['device'] = device_str
    config['profrank'] = args.profrank
    config['profiler'] = args.profiler
    config['logdir'] = args.logdir
    config['rank'] = rank
    config['nranks'] = nranks
    config['evaluate'] = False
    config['batch_term'] = args.batch_term
    if args.batch_term > 0:
        # Early-termination benchmark mode: single epoch, capped status interval.
        config['training']['epochs'] = 1
        config['training']['status'] = 1 if args.batch_term < config['training']['status'] else config['training']['status']
    if args.evaluate is not None:
        config['evaluate'] = True
        config['model_file'] = args.evaluate
        config['training']['epochs'] = 1
        logger.info('evaluating model file: %s', args.evaluate)
    elif args.train_more is not None:
        config['train_more'] = True
        config['model_file'] = args.train_more
        logger.info('continuing model file: %s', args.train_more)

    # using mixed precision?
    if isinstance(config['model']['mixed_precision'], str):
        logger.info('using mixed precsion: %s', config['model']['mixed_precision'])
        tf.keras.mixed_precision.set_global_policy(config['model']['mixed_precision'])

    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    logger.info('%s = \n %s', args.config_filename, json.dumps(config, indent=4, sort_keys=True))
    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    config['hvd'] = hvd
    sys.stdout.flush()
    sys.stderr.flush()

    # ---- data, model, loss, optimizer ----
    trainds, testds = data_handler.get_datasets(config)

    logger.info('get model')
    net = model.get_model(config)
    loss_func = losses.get_loss(config)
    opt = get_optimizer(config)
    if isinstance(config['model']['mixed_precision'], str):
        opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

    # initialize and create the model
    # input_shape = [config['data']['batch_size'],config['data']['num_points'],config['data']['num_features']]
    # output = net(tf.random.uniform(input_shape))

    # load previous model weights
    if args.evaluate:
        net.load_weights(args.evaluate)
    elif args.train_more:
        net.load_weights(args.train_more)

    # # synchronize models across ranks
    # if hvd:
    #     hvd.broadcast_variables(net.variables, root_rank=0)
    #     hvd.broadcast_variables(opt.variables(), root_rank=0)

    # ---- summary writers (rank 0 only) ----
    train_summary_writer = None
    test_summary_writer = None
    test_jet_writer = None
    test_ele_writer = None
    test_bkg_writer = None
    test_mean_writer = None
    if rank == 0:
        train_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'train')
        test_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'test')
        test_jet_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'jet_iou')
        test_ele_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'ele_iou')
        test_bkg_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'bkg_iou')
        test_mean_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'mean_iou')

    #tf.keras.utils.plot_model(net, "network_model.png", show_shapes=True)
    #with train_summary_writer.as_default():
    #tf.summary.graph(train_step.get_concrete_function().graph)

    # ---- epoch loop ----
    batches_per_epoch = 0
    train_mIoU_sum = 0.
    test_mIoU_sum = 0.
    for epoch_num in range(config['training']['epochs']):
        logger.info('begin epoch %s', epoch_num)

        if not config['evaluate']:
            train_output = epoch_loop.one_train_epoch(config, trainds, net,
                                                      loss_func, opt, epoch_num,
                                                      train_summary_writer,
                                                      batches_per_epoch, gtape)
            batches_per_epoch = train_output['batches_per_epoch']
            train_mIoU_sum += train_output['mIoU']
            logger.info('train mIoU sum: %10.4f', train_mIoU_sum / (epoch_num + 1))

        test_output = epoch_loop.one_eval_epoch(config, testds, net,
                                                loss_func, opt, epoch_num,
                                                test_summary_writer,
                                                batches_per_epoch,
                                                test_jet_writer, test_ele_writer,
                                                test_bkg_writer, test_mean_writer)
        test_mIoU_sum += test_output['mIoU']
        logger.info('test mIoU sum: %10.4f', test_mIoU_sum / (epoch_num + 1))

        if rank == 0:
            with test_summary_writer.as_default():
                step = (epoch_num + 1) * batches_per_epoch
                tf.summary.scalar('metrics/mIoU_AOC', test_mIoU_sum / (epoch_num + 1), step=step)
def train_step(self, x, y, s=None, y_gt=None, flag=None, x_test=None, y_test=None, flag_test=None, **kwargs): """One training step. Args: x: [B, T, ...], inputs at each timestep. y: [B, T], label at each timestep. y_gt: [B, T], groundtruth at each timestep, if different from labels. x_test: [B, M, ...], inputs of the query set, optional. y_test: [B, M], groundtruth of the query set, optional. Returns: xent: Cross entropy loss. """ # tf.print('y', y[0], summarize=100) if self._distributed: import horovod.tensorflow as hvd if y_gt is None: y_gt = y with tf.GradientTape() as tape: if x_test is not None: # Additional query set (optional). assert y_test is not None logits, logits_test = self.forward( x, y, s=s, x_test=x_test, is_training=tf.constant(True)) logits_all = tf.concat([logits, logits_test], axis=1) # [B, T+N, Kmax] labels_all = tf.concat([y_gt, y_test], axis=1) # [B, T+N] else: logits = self.forward(x, y, s=s, is_training=tf.constant(True)) logits_all = logits labels_all = y_gt xent = self.compute_loss(logits_all, labels_all) # Cross entropy loss. if flag is not None: if flag_test is not None: flag_all = tf.concat([flag, flag_test], axis=1) else: flag_all = flag flag_ = tf.cast(flag_all, self.dtype) valid_sum = tf.reduce_sum(flag_) delta = tf.cast(tf.equal(valid_sum, 0.0), self.dtype) xent = tf.reduce_sum(xent * flag_) / (valid_sum + delta) else: xent = tf.reduce_mean(xent) # Regularizers. reg_loss = self._get_regularizer_loss(*self.regularized_weights()) loss = xent + reg_loss * self.wd # Apply gradients. if self._distributed: tape = hvd.DistributedGradientTape(tape) self.apply_gradients(loss, tape) return xent
def train_step(self,
               x,
               y,
               y_gt=None,
               flag=None,
               writer=None,
               first_batch=False,
               **kwargs):
    """One training step.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_unk: [B, T], binary label indicating unknown, used as groundtruth.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
    if self._distributed:
        import horovod.tensorflow as hvd
    if y_gt is None:
        y_gt = y
    B = tf.constant(x.shape[0])
    T = tf.constant(x.shape[1])
    with writer.as_default() if writer is not None else dummy_context_mgr(
    ) as gs:
        states = self.memory.get_initial_state(B, 64)
        # Truncated BPTT: the sequence is optimized in chunks of DT steps.
        DT = self.config.optimizer_config.inner_loop_truncate_steps
        # Data parallel training.
        xent_total = 0.0
        xent_unk_total = 0.0
        flag_total = tf.cast(tf.reduce_sum(flag), self.dtype)
        for t_start in range(0, self.config.num_steps, DT):
            t_end = tf.minimum(t_start + DT, T)
            with tf.GradientTape() as tape:
                loss, metric, states = self.compute_loss(
                    x[:, t_start:t_end], y[:, t_start:t_end],
                    y_gt[:, t_start:t_end], flag[:, t_start:t_end], t_start,
                    DT, *states, **kwargs)

            # Apply gradients.
            if self._distributed:
                tape = hvd.DistributedGradientTape(tape)
            self.apply_gradients(loss, tape)

            # Sync weights initialization (only on the very first chunk of
            # the very first batch, after the first optimizer step).
            if self._distributed and first_batch and tf.equal(t_start, 0):
                hvd.broadcast_variables(self.var_to_optimize(), root_rank=0)
                hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)
                if self.config.set_backbone_lr:
                    hvd.broadcast_variables(self._bb_optimizer.variables(), root_rank=0)

            # Accumulate chunk metrics weighted by the chunk's share of
            # valid timesteps.
            flag_total_ = tf.reduce_sum(
                tf.cast(flag[:, t_start:t_end], self.dtype))
            xent_total += metric['xent'] * flag_total_ / flag_total
            xent_unk_total += metric['xent_unk'] * flag_total_ / flag_total

        # Only rank 0 (or a non-distributed run) writes summaries.
        write_flag = self._distributed and hvd.rank() == 0
        write_flag = write_flag or (not self._distributed)
        if write_flag and writer is not None:
            if tf.equal(
                    tf.math.floormod(
                        self._step // self._ratio + 1,
                        self.config.train_config.steps_per_log), 0):
                tf.summary.scalar('xent_unk', xent_unk_total, step=self._step + 1)
                writer.flush()
    return xent_total
def train(model, train_db, validation_db, epochs, batch_size, learning_rate, model_dir):
    """Horovod training loop with per-epoch validation and rank-0 saving."""
    # Preprocess both datasets: map `process`, shuffle (larger buffer =
    # better shuffling), and batch.
    train_db = train_db.map(process).shuffle(
        10000, seed=np.random.randint(999)).batch(batch_size)
    validation_db = validation_db.map(process).shuffle(
        10000, seed=np.random.randint(999)).batch(batch_size)

    # Peek at one sample to report the data shapes.
    train_iter = iter(train_db)
    train_sample = next(train_iter)
    print('train dataset x shape {}, train dataset y shape {}'.format(
        train_sample[0].shape, train_sample[1].shape))

    # Horovod step 3: scale the learning rate by the number of workers.
    optimazer = optimizers.Adam(lr=learning_rate * hvd.size())

    for epoch in range(epochs):
        for step, (x, y) in enumerate(train_db):
            # Input already arrives as [b, 784], so no reshape is needed.
            # The tape records the forward pass so gradients can be taken.
            with tf.GradientTape() as tape:
                # Calling the model instance invokes __call__; the output is
                # already softmax-ed predictions.
                softmax = model(x)
                # One-hot encode y: y is [b,] while the logits are [b, 10].
                y_hot = tf.one_hot(y, depth=10)
                # Two losses are computed (MSE and cross entropy); for
                # classification, cross entropy is the one optimized.
                # from_logits=True is not needed since softmax was applied.
                loss_mse = tf.reduce_mean(
                    tf.losses.mean_squared_error(y_hot, softmax))
                loss_ce = tf.reduce_mean(
                    tf.losses.categorical_crossentropy(y_hot, softmax))
                # loss_ce = tf.reduce_mean(tf.losses.categorical_crossentropy(y_hot, logits, from_logits=True))

            # Horovod step 4: wrap the tape so gradients are allreduced
            # across processes before variables are updated.
            tape = hvd.DistributedGradientTape(tape)
            # Gradient of the cross-entropy loss only.
            grads = tape.gradient(loss_ce, model.trainable_variables)
            optimazer.apply_gradients(zip(grads, model.trainable_variables))

            # Horovod step 5: broadcast initial variables so every process
            # starts from the same point (first step of first epoch only).
            if epoch == 0 and step == 0:
                hvd.broadcast_variables(model.variables, root_rank=0)
                hvd.broadcast_variables(optimazer.variables(), root_rank=0)

            # Log every 100 steps, rank 0 only.
            if step % 100 == 0 and hvd.rank() == 0:
                print('epoch:{}\t step:{}\t loss_mse:{}\t loss_ce:{}\t'.format(
                    epoch, step, float(loss_mse), float(loss_ce)))

        # Compute validation accuracy once per epoch.
        total_corrects = 0  # running count of correct predictions
        total_number = 0    # running count of examples seen
        for x, y in validation_db:
            # Predicted class = argmax over the softmax outputs.
            probs = model(x)
            preds = tf.cast(tf.argmax(probs, axis=1), dtype=tf.int32)
            corrects = tf.equal(y, preds)
            corrects = tf.reduce_sum(tf.cast(corrects, dtype=tf.int32))
            total_corrects += corrects
            total_number += x.shape[0]
        acc = total_corrects / total_number
        if hvd.rank() == 0:
            print('accuracy={};'.format(acc))

    # Either saving API works; low level would be:
    # tf.saved_model.save(model, model_dir+'/'+datetime.now().strftime('%Y%m%d%H%M%S'))
    # The model path must be a numeric string.
    # Horovod step 6: only rank 0 saves checkpoints/models.
    if hvd.rank() == 0:
        model.save(model_dir + '/' + datetime.now().strftime('%Y%m%d%H%M%S'))
def train_step(self,
               x,
               y,
               s=None,
               y_gt=None,
               flag=None,
               x_test=None,
               y_test=None,
               flag_test=None,
               writer=None,
               **kwargs):
    """One training step.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_unk: [B, T], binary label indicating unknown, used as groundtruth.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
    if self._distributed:
        import horovod.tensorflow as hvd
    if y_gt is None:
        y_gt = y
    with writer.as_default() if writer is not None else dummy_context_mgr(
    ) as gs:
        with tf.GradientTape() as tape:
            loss, metric = self.compute_loss(
                x,
                y,
                y_gt,
                s=s,
                flag=flag,
                x_test=x_test,
                y_test=y_test,
                flag_test=flag_test,
                **kwargs)

        # Data parallel training.
        if self._distributed:
            # Average the scalar xent across workers via allgather (the
            # tf.zeros([1]) + value trick gives it a gatherable shape).
            xent_sync = tf.reduce_mean(
                hvd.allgather(
                    tf.zeros([1], dtype=tf.float32) + metric['xent'],
                    name='xent'))
            tape = hvd.DistributedGradientTape(tape)
        else:
            xent_sync = metric['xent']

        # Apply gradients.
        # if not tf.math.is_nan(xent_sync):
        self.apply_gradients(loss, tape)

        # Only rank 0 (or a non-distributed run) writes summaries.
        write_flag = self._distributed and hvd.rank() == 0
        write_flag = write_flag or (not self._distributed)
        if write_flag and writer is not None:
            if tf.equal(
                    tf.math.floormod(self._step + 1,
                                     self.config.train_config.steps_per_log),
                    0):
                for name, val in metric.items():
                    if name != 'xent':
                        tf.summary.scalar(name, val, step=self._step + 1)
                if self._ssl_store is not None:
                    tf.summary.scalar(
                        'ssl write',
                        tf.reduce_mean(tf.cast(self._ssl_store, tf.float32)),
                        step=self._step + 1)
                writer.flush()
    return xent_sync
def main(_argv):
    """YOLOv3 training entry point with Horovod data parallelism."""
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    # Model / anchor selection.
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    # Training pipeline (falls back to a fake dataset when none given).
    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset,
                                                      FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    # Validation pipeline.
    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset,
                                                    FLAGS.classes)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))

    # Transfer-learning setup.
    if FLAGS.transfer != 'none':
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        # NOTE(review): this tests FLAGS.mode, while every sibling branch
        # tests FLAGS.transfer — likely meant FLAGS.transfer == 'frozen';
        # confirm before relying on this branch.
        elif FLAGS.mode == 'frozen':
            # freeze everything
            freeze_all(model)
        else:
            # reset top layers
            if FLAGS.tiny:  # get initial weights
                init_model = YoloV3Tiny(FLAGS.size, training=True)
            else:
                init_model = YoloV3(FLAGS.size, training=True)

            if FLAGS.transfer == 'darknet':
                # Keep darknet weights, reset the other yolo_ layers and
                # freeze everything else.
                for l in model.layers:
                    if l.name != 'yolo_darknet' and l.name.startswith('yolo_'):
                        l.set_weights(
                            init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)
            elif FLAGS.transfer == 'no_output':
                # Reset only the output heads; freeze the rest.
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        l.set_weights(
                            init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = tf.optimizers.Adam(FLAGS.learning_rate * hvd.size())
    # Horovod: add Horovod DistributedOptimizer.
    ###############################################
    loss = [YoloLoss(anchors[mask]) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            # Each worker consumes its share of the 5717-image epoch.
            for batch, (images, labels) in enumerate(
                    train_dataset.take(5717 // hvd.size())):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                # Horovod: add Horovod Distributed GradientTape.
                tape = hvd.DistributedGradientTape(tape)

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                # Horovod: broadcast initial variable states from rank 0 to
                # all other processes. This is necessary to ensure consistent
                # initialization of all workers when training is started with
                # random weights or restored from a checkpoint.
                #
                # Note: broadcast should be done after the first gradient step
                # to ensure optimizer initialization.
                if batch == 0:
                    hvd.broadcast_variables(model.variables, root_rank=0)
                    hvd.broadcast_variables(optimizer.variables(), root_rank=0)
                #############################

                if hvd.rank() == 0:
                    logging.info("{}_train_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                ###########################
                avg_loss.update_state(total_loss)

            # Validation pass (no gradient updates).
            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                if hvd.rank() == 0:
                    logging.info("{}_val_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            if hvd.rank() == 0:
                logging.info("{}, train: {}, val: {}".format(
                    epoch,
                    avg_loss.result().numpy(),
                    avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()

            # Only rank 0 writes checkpoints.
            if hvd.rank() == 0:
                model.save_weights(
                    'checkpoints/horovod_yolov3_train_{}.tf'.format(epoch))
    else:
        # Graph-mode training via Keras fit.
        model.compile(optimizer=optimizer,
                      loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
def train_step(model, inputs, loss, amp, opt, init, v2=False, loss_class=None, fp16=False, clip_norm=1.0):
    """Run one Horovod-distributed training step for a SQuAD-style QA model.

    Args:
        model: Callable returning at least (start_logits, end_logits, cls_logits).
        inputs: Sequence of 8 tensors: input_ids, input_mask, segment_ids,
            start_positions, end_positions, cls_index, p_mask, is_impossible.
        loss: Loss callable for the start/end position heads.
        amp: If True, scale the loss via ``opt.get_scaled_loss`` and unscale the
            gradients afterwards (mixed-precision training).
        opt: Optimizer; when amp=True it must expose get_scaled_loss /
            get_unscaled_gradients (LossScaleOptimizer-style API).
        init: If True, broadcast model and optimizer state from rank 0 after
            this step (done after apply_gradients so optimizer slots exist).
        v2: SQuAD v2 mode — also trains the answerability (cls) head.
        loss_class: Loss callable for the cls head; required when v2=True.
        fp16: If True, compress Horovod allreduce tensors to fp16.
        clip_norm: Global gradient-norm clipping threshold.

    Returns:
        The unscaled scalar loss (wrapped in tf.stop_gradient).
    """
    with tf.GradientTape() as tape:
        [
            input_ids, input_mask, segment_ids, start_positions, end_positions,
            cls_index, p_mask, is_impossible
        ] = inputs
        if not v2:
            # SQuAD v1 has no unanswerable questions; drop the labels so the
            # model skips the answerability head.
            is_impossible = None
        start_logits, end_logits, cls_logits = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            p_mask=p_mask,
            is_impossible=is_impossible,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            training=True,
        )[0:3]
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.shape) > 1:
            start_positions = tf.squeeze(start_positions, axis=-1,
                                         name="squeeze_start_positions")
        if len(end_positions.shape) > 1:
            end_positions = tf.squeeze(end_positions, axis=-1,
                                       name="squeeze_end_positions")
        if is_impossible is not None and len(
                is_impossible.shape) > 1 and v2 and cls_logits is not None:
            is_impossible = tf.squeeze(is_impossible, axis=-1,
                                       name="squeeze_is_impossible")
        # sometimes the start/end positions are outside our model inputs, we
        # ignore these terms
        ignored_index = start_logits.shape[1]
        start_positions = tf.clip_by_value(start_positions, 0, ignored_index,
                                           name="clip_start_positions")
        end_positions = tf.clip_by_value(end_positions, 0, ignored_index,
                                         name="clip_end_positions")
        # Logits are cast to float32 so the loss is computed in full precision
        # even when the model runs in fp16.
        start_loss = loss(y_true=start_positions,
                          y_pred=tf.cast(start_logits, tf.float32))
        end_loss = loss(y_true=end_positions,
                        y_pred=tf.cast(end_logits, tf.float32))
        loss_value = (start_loss + end_loss) / 2
        if v2:
            # Answerability head contributes with weight 0.5.
            cls_loss_value = loss_class(y_true=is_impossible,
                                        y_pred=tf.cast(cls_logits, tf.float32))
            loss_value += cls_loss_value * 0.5
        # Keep an unscaled copy for reporting; stop_gradient keeps it out of
        # the backward pass.
        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)
    # Horovod: average gradients across workers; sparse_as_dense densifies
    # embedding gradients for allreduce.
    tape = hvd.DistributedGradientTape(
        tape,
        sparse_as_dense=True,
        compression=Compression.fp16 if fp16 else Compression.none)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        # Undo the loss scaling before clipping so clip_norm acts on true
        # gradient magnitudes.
        gradients = opt.get_unscaled_gradients(gradients)
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm)
    opt.apply_gradients(zip(gradients, model.trainable_variables))  # , clip_norm=1.0)
    if init:
        # Broadcast after the first apply_gradients so the optimizer's
        # lazily-created slot variables exist on every worker.
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return unscaled_loss  # , outputs#, tape.gradient(loss_value, model.trainable_variables)
def main():
    """Training driver: per-sample eager training loop over pre-loaded arrays.

    Relies on module-level globals not visible in this chunk: ``im`` / ``lb``
    (input images/labels), ``model``, ``partial_losses``, ``PARSER``,
    ``parse_args``, ``metrics_names`` — TODO confirm their definitions.
    """
    hvd.init()
    n_epochs = 10
    batch_size = 5
    # Number of batches available in the dataset.
    step = len(im) // batch_size
    params = parse_args(PARSER.parse_args())
    optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
    # NOTE(review): ce_loss, f1_loss and checkpoint are created but never
    # used/saved below — confirm whether metric tracking and checkpointing
    # were meant to be wired in.
    ce_loss = tf.keras.metrics.Mean(name='ce_loss')
    f1_loss = tf.keras.metrics.Mean(name='dice_loss')
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    pb_i = Progbar(step, stateful_metrics=metrics_names)
    count = 0
    for epoch in range(n_epochs):
        # NOTE(review): count is reset here but never incremented anywhere,
        # so this guard is dead code as written.
        if count >= step:
            count = 0
        # NOTE(review): the slice index uses `epoch`, not a per-epoch batch
        # counter, so only the first n_epochs * batch_size samples are ever
        # visited — presumably a batch loop was intended; verify.
        features = im[epoch * batch_size:(epoch * batch_size) + batch_size]
        # Reshape to NHWC; assumes each element of `im` is (C, H, W)-like —
        # TODO confirm the source layout.
        features = np.reshape(features,
                              (len(features), features[0].shape[1],
                               features[0].shape[2], features[0].shape[0]))
        features = features.astype('float32')
        labels = lb[epoch * batch_size:(epoch * batch_size) + batch_size]
        labels = np.reshape(
            labels, (len(labels), labels[0].shape[0], labels[0].shape[1], 1))
        labels = labels.astype('float32')
        print(features.shape, labels.shape)
        print('Epoch {} out of epochs {}'.format(epoch, n_epochs))
        for i, (features_, labels_) in enumerate(zip(features, labels)):
            with tf.GradientTape() as tape:
                # NOTE(review): the loop variables features_/labels_ are never
                # used — the whole mini-batch `features`/`labels` is re-run on
                # every inner iteration. Looks like a bug; confirm intent.
                output_map = model(features)
                crossentropy_loss, dice_loss = partial_losses(
                    output_map, labels)
                added_losses = tf.add(crossentropy_loss, dice_loss,
                                      name='total_loss_ref')
                values = [('Xent', crossentropy_loss),
                          ('added_losses', added_losses)]
                pb_i.add(1, values=values)
            # calculate the gradients using our tape and then update the
            # model weights
            tape = hvd.DistributedGradientTape(tape)
            gradients = tape.gradient(added_losses, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))
            # NOTE(review): no hvd.broadcast_variables call after the first
            # step — workers may start from divergent weights; compare the
            # other training loops in this file.
            # Calculate something wrong here
            # val_total_loss = 0
            # val_total_acc = 0
            # total_val_num = 0
            # for bIdx, (val_X, val_y) in enumerate(val_batch):
            #     if bIdx >= features.shape[0]:
            #         break
            #     y_pred = model(val_X, training=False)
        print('Xen: ', crossentropy_loss, dice_loss, added_losses)
def train_step(self, x, y, y_gt=None, flag=None, writer=None, **kwargs):
    """One training step, with truncated backpropagation through time.

    The sequence of length T is processed in windows of DT steps; gradients
    are computed and applied per window, with memory states carried across
    windows (but not backpropagated through).

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      flag: [B, T], binary mask weighting each timestep's contribution —
        TODO confirm semantics against compute_loss.
      writer: Optional tf.summary writer for periodic scalar logging.
      **kwargs: Forwarded to compute_loss.

    Returns:
      xent: Flag-weighted cross entropy loss accumulated over all windows.
    """
    if self._distributed:
        # Imported lazily so non-distributed runs don't require Horovod.
        import horovod.tensorflow as hvd
    if y_gt is None:
        y_gt = y
    B = tf.constant(x.shape[0])
    T = tf.constant(x.shape[1])
    # DT: truncation window length; LOGSTEP: summary logging period.
    DT = self.config.oml_config.inner_loop_truncate_steps
    LOGSTEP = self.config.train_config.steps_per_log
    assert DT > 1
    with writer.as_default() if writer is not None else dummy_context_mgr(
    ) as gs:
        states = self.memory.get_initial_state(B)
        states_shape = [s.shape for s in states]
        xent_total = 0.0
        xent_unk_total = 0.0
        # Total mask weight over the full sequence; used to normalize the
        # per-window contributions.
        flag_total = tf.cast(tf.reduce_sum(flag), self.dtype)
        for t_start in tf.range(0, T, DT):
            # tf.print('t_start', t_start)
            with tf.GradientTape() as tape:
                # if tf.equal(t_start, 0):
                #     states = self.memory.get_initial_state(B)
                #     [s.set_shape(ss) for s, ss in zip(states, states_shape)]
                t_end = tf.minimum(t_start + DT, T)
                # tf.print('start', t_start, 'end', t_end)
                # States returned here are fed into the next window, carrying
                # memory forward without backprop across windows.
                loss, metric, states = self.compute_loss(
                    x[:, t_start:t_end], y[:, t_start:t_end], t_end - t_start,
                    y_gt[:, t_start:t_end], flag[:, t_start:t_end], *states,
                    **kwargs)
            if self._distributed:
                # Average gradients across Horovod workers.
                tape = hvd.DistributedGradientTape(tape)
            # Apply gradients.
            self.apply_gradients(loss, tape, add_step=tf.equal(t_start, 0))
            # Weight this window's metrics by its share of the total mask.
            flag_total_ = tf.reduce_sum(
                tf.cast(flag[:, t_start:t_end], self.dtype))
            xent_total += metric['xent'] * flag_total_ / flag_total
            xent_unk_total += metric['xent_unk'] * flag_total_ / flag_total
            # tf.print('xent unk total', xent_unk_total)
        # Log xent unk
        if writer is not None:
            NSTEP = len(tf.range(0, T, DT))
            # Log on the first step and every LOGSTEP steps thereafter.
            cond = tf.logical_or(
                tf.equal(tf.math.floormod(self._step, LOGSTEP), 0),
                tf.equal(self._step, 1))
            if cond:
                tf.summary.scalar('xent_unk', xent_unk_total, self._step)
                writer.flush()
        return xent_total
def main(_):
    """MNIST training entry point (TF1 eager mode + Horovod).

    Pins one GPU per local rank, trains a small CNN with allreduce-averaged
    gradients, and synchronizes all workers' state from rank 0 after the
    first optimizer step.

    Fix vs. original: the rank-0 broadcast used to run *before* the first
    apply_gradients and only covered the model variables, so the optimizer's
    lazily-created slot variables were never synchronized across workers.
    The broadcast now runs after the first gradient step and includes
    opt.variables(), matching the other training loops in this file.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Make sure the Fetcher worked
    mnist_filename = 'mnist.npz'
    mnist_path = os.path.join(cache_dir, mnist_filename)
    if not os.path.isfile(mnist_path):
        raise FileNotFoundError("Dataset not found. Looked in " + mnist_path)

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path=mnist_filename)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images, labels)) in enumerate(
            dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        #
        # Note: broadcast is done after the first gradient step so that the
        # optimizer's slot variables (created lazily by apply_gradients) exist
        # and are synchronized as well.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        if batch % 50 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
            emit({"batch": str(batch), "train_loss": "%.6f" % loss_value})
def train():
    """DCGAN training loop on CelebA (TF1 eager + TensorLayer + Horovod).

    Trains a generator/discriminator pair with per-worker allreduce-averaged
    gradients; logging and checkpointing happen only on rank 0.

    Fix vs. original: the rank-0 logging guard was written as
    ``if hvd.rank() == 0 print(...)`` — missing its colon, which is a syntax
    error. It is now a proper guarded block.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.enable_eager_execution(config=config)

    # Horovod: adjust number of steps based on number of GPUs.
    images, images_path = get_celebA(FLAGS.output_size,
                                     FLAGS.n_epoch // hvd.size(),
                                     FLAGS.batch_size)
    G = get_generator([None, FLAGS.z_dim])
    D = get_discriminator(
        [None, FLAGS.output_size, FLAGS.output_size, FLAGS.c_dim])
    G.train()
    D.train()

    # Horovod linear scaling rule: scale the learning rate by worker count.
    d_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate * hvd.size(),
                                         beta1=FLAGS.beta1)
    g_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate * hvd.size(),
                                         beta1=FLAGS.beta1)
    step_counter = tf.train.get_or_create_global_step()
    n_step_epoch = int(len(images_path) // FLAGS.batch_size)

    for step, batch_images in enumerate(images):
        step_time = time.time()
        # persistent=True: the same tape yields gradients for both D and G.
        with tf.GradientTape(persistent=True) as tape:
            z = tf.contrib.distributions.Normal(0., 1.).sample(
                [FLAGS.batch_size, FLAGS.z_dim])  #tf.placeholder(tf.float32, [None, z_dim], name='z_noise')
            d_logits = D(G(z))
            d2_logits = D(batch_images)
            # discriminator: real images are labelled as 1
            d_loss_real = tl.cost.sigmoid_cross_entropy(
                d2_logits, tf.ones_like(d2_logits), name='dreal')
            # discriminator: images from generator (fake) are labelled as 0
            d_loss_fake = tl.cost.sigmoid_cross_entropy(
                d_logits, tf.zeros_like(d_logits), name='dfake')
            # cost for updating discriminator
            d_loss = d_loss_real + d_loss_fake
            # generator: try to make the the fake images look real (1)
            g_loss = tl.cost.sigmoid_cross_entropy(d_logits,
                                                   tf.ones_like(d_logits),
                                                   name='gfake')

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        # NOTE(review): this runs before the first apply_gradients, so Adam's
        # slot variables are not broadcast — confirm whether optimizer-state
        # sync matters here (compare other loops in this file).
        if step == 0:
            hvd.broadcast_variables(G.weights, root_rank=0)
            hvd.broadcast_variables(D.weights, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grad = tape.gradient(d_loss, D.weights)
        d_optimizer.apply_gradients(
            zip(grad, D.weights),
            global_step=tf.train.get_or_create_global_step())
        grad = tape.gradient(g_loss, G.weights)
        g_optimizer.apply_gradients(
            zip(grad, G.weights),
            global_step=tf.train.get_or_create_global_step())

        # Horovod: print logging only on worker 0
        if hvd.rank() == 0:
            print("Epoch: [{}/{}] [{}/{}] took: {:3f}, d_loss: {:5f}, g_loss: {:5f}".format(
                step // n_step_epoch, FLAGS.n_epoch, step, n_step_epoch,
                time.time() - step_time, d_loss, g_loss))

        # Horovod: save checkpoints only on worker 0
        if hvd.rank() == 0 and np.mod(step, FLAGS.save_step) == 0:
            G.save_weights('{}/G.npz'.format(FLAGS.checkpoint_dir),
                           format='npz')
            D.save_weights('{}/D.npz'.format(FLAGS.checkpoint_dir),
                           format='npz')
            result = G(z)
            # num_tiles is assumed to be a module-level global — TODO confirm.
            tl.visualize.save_images(
                result.numpy(), [num_tiles, num_tiles],
                '{}/train_{:02d}_{:04d}.png'.format(FLAGS.sample_dir,
                                                    step // n_step_epoch,
                                                    step))
# rank 0 to the orther ranks hvd.broadcast_variables([slope, offset], root_rank=0) print( 'rank', hvd.rank(), 'inital slope = %12.6f\n initial offset = %12.6f' % (slope.numpy(), offset.numpy())) for xtr, ytr in dataset: with tf.GradientTape() as tape: yhat = slope * xtr + offset loss = tf.losses.mean_squared_error(yhat, ytr) # replace tensorflows' GradientTape for Horovod's # so that the gradients from all ranks are averaged tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(loss, [slope, offset]) opt.apply_gradients(zip(grads, [slope, offset]), global_step=tf.train.get_or_create_global_step()) history.append([slope.numpy(), offset.numpy(), loss.numpy()]) # tf.print('loss = %f (rank-%d)' % (loss, hvd.rank())) # saving arrays for plotting np.save('slope_hist_%s' % hvd.rank(), np.array(history)[:, 0]) np.save('offset_hist_%s' % hvd.rank(), np.array(history)[:, 1]) if hvd.rank() == 0: np.save('x_train', x_train) np.save('y_train', y_train)
def train_step(self, data):
    """Train step.

    Args:
      data: Tuple of (images, labels). Image tensor with shape [batch_size,
        height, width, 3]. The height and width are fixed and equal. Input
        labels in a dictionary. The labels include class targets and box
        targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py.

    Returns:
      A dict record loss info.
    """
    images, labels = data
    with tf.GradientTape() as tape:
        # Dispatch on configured heads: detection, segmentation, or both.
        if len(self.config.heads) == 2:
            cls_outputs, box_outputs, seg_outputs = self(images,
                                                         training=True)
        elif 'object_detection' in self.config.heads:
            cls_outputs, box_outputs = self(images, training=True)
        elif 'segmentation' in self.config.heads:
            seg_outputs, = self(images, training=True)
        total_loss = 0
        loss_vals = {}
        if 'object_detection' in self.config.heads:
            # _detection_loss also records its components into loss_vals.
            det_loss = self._detection_loss(cls_outputs, box_outputs, labels,
                                            loss_vals)
            total_loss += det_loss
        if 'segmentation' in self.config.heads:
            seg_loss_layer = self.loss['seg_loss']
            seg_loss = seg_loss_layer(labels['image_masks'], seg_outputs)
            total_loss += seg_loss
            loss_vals['seg_loss'] = seg_loss
        reg_l2_loss = self._reg_l2_loss(self.config.weight_decay)
        loss_vals['reg_l2_loss'] = reg_l2_loss
        total_loss += reg_l2_loss
        if isinstance(self.optimizer,
                      tf.keras.mixed_precision.LossScaleOptimizer):
            # Mixed precision: scale the loss; gradients are unscaled below.
            scaled_loss = self.optimizer.get_scaled_loss(total_loss)
            # NOTE(review): reaches into the private inner optimizer to read
            # learning_rate/iterations — confirm this stays valid across
            # Keras versions.
            optimizer = self.optimizer._optimizer
        else:
            scaled_loss = total_loss
            optimizer = self.optimizer
    # Horovod: average gradients across workers; compress the allreduce to
    # fp16 when the compute dtype is float16.
    compress = get_mixed_precision_policy().compute_dtype == 'float16'
    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16 \
        if compress else hvd.Compression.none)
    loss_vals['loss'] = total_loss
    # learning_rate is assumed to be a schedule callable — TODO confirm.
    loss_vals['learning_rate'] = optimizer.learning_rate(
        optimizer.iterations)
    trainable_vars = self._freeze_vars()
    scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
    if isinstance(self.optimizer,
                  tf.keras.mixed_precision.LossScaleOptimizer):
        gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
    else:
        gradients = scaled_gradients
    if self.config.clip_gradients_norm > 0:
        clip_norm = abs(self.config.clip_gradients_norm)
        # NOTE(review): gradients are clipped twice — per-tensor clip_by_norm
        # followed by a global clip with the same threshold. Confirm this
        # double clipping is intentional and not a leftover.
        gradients = [
            tf.clip_by_norm(g, clip_norm) if g is not None else None
            for g in gradients
        ]
        gradients, _ = tf.clip_by_global_norm(gradients, clip_norm)
        loss_vals['gradient_norm'] = tf.linalg.global_norm(gradients)
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))
    return loss_vals
def train_step(self, data):
    """Perform a single training step.

    Runs the sampler forward on (x, beta), accumulates plaquette/charge
    losses (plus an optional auxiliary pass on fresh Gaussian noise), applies
    Horovod-averaged gradients, and assembles a metrics dict.

    Args:
      data: Tuple (x, beta) — configuration tensor and inverse coupling.

    Returns:
      Tuple of (states.out.x, metrics) — the updated configuration and an
      AttrDict of diagnostics.
    """
    start = time.time()
    with tf.GradientTape() as tape:
        x, beta = data
        # x may arrive as a plain tensor; watch it explicitly so gradients
        # w.r.t. the input are tracked if needed.
        tape.watch(x)
        states, data = self((x, beta), training=True)
        accept_prob = data.get('accept_prob', None)
        ploss, qloss = self.calc_losses(states, accept_prob)
        loss = ploss + qloss
        if self.aux_weight > 0:
            # Auxiliary pass from fresh Gaussian noise to regularize the
            # sampler on out-of-distribution starting points.
            z = tf.random.normal(x.shape, dtype=x.dtype)
            states_, data_ = self((z, beta), training=True)
            accept_prob_ = data_.get('accept_prob', None)
            ploss_, qloss_ = self.calc_losses(states_, accept_prob_)
            loss += ploss_ + qloss_
    if HAS_HOROVOD:
        # Average gradients across workers.
        tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(
        zip(grads, self.trainable_variables), )
    metrics = AttrDict({
        'lr': self._get_lr(),
        'dt': time.time() - start,
        'loss': loss,
    })
    if self.plaq_weight > 0 and self.charge_weight > 0:
        metrics.update({'ploss': ploss, 'qloss': qloss})
    if self.aux_weight > 0:
        metrics.update({'ploss_aux': ploss_, 'qloss_aux': qloss_})
    metrics.update({
        'accept_prob': accept_prob,
        'eps': self.eps,
        'beta': states.init.beta,
    })
    if self._verbose:
        # Per-leapfrog-step energies and log-determinants (start / midpoint /
        # end of the trajectory) for both directions.
        metrics.update({
            'Hf_start': data.forward.energies[0],
            'Hf_mid': data.forward.energies[self.config.num_steps // 2],
            'Hf_end': data.forward.energies[-1],
            'Hb_start': data.backward.energies[0],
            'Hb_mid': data.backward.energies[self.config.num_steps // 2],
            'Hb_end': data.backward.energies[-1],
            #
            'ld_f_start': data.forward.logdets[0],
            'ld_f_mid': data.forward.logdets[self.config.num_steps // 2],
            'ld_f_end': data.forward.logdets[-1],
            #
            'ld_b_start': data.backward.logdets[0],
            'ld_b_mid': data.backward.logdets[self.config.num_steps // 2],
            'ld_b_end': data.backward.logdets[-1],
            # 'sumlogdet': sumlogdet.out,
        })
    observables = self.calc_observables(states)
    metrics.update(**observables)
    metrics.update({
        'lr': self._get_lr(),
    })
    # Horovod:
    # Broadcast initial variable states from rank 0 to all other
    # processes. This is necessary to ensure consistent initialization
    # of all workers when training is started with random weights or
    # restored from a checkpoint.
    # NOTE:
    # Broadcast should be done after the first gradient step to ensure
    # optimizer intialization.
    # NOTE(review): apply_gradients above has already incremented
    # optimizer.iterations by the time this check runs, so `== 0` may never
    # be true and the broadcast may never fire — confirm against the
    # tf.function tracing semantics this relies on.
    if self.optimizer.iterations == 0 and HAS_HOROVOD and NUM_WORKERS > 1:
        hvd.broadcast_variables(self.variables, root_rank=0)
        hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)
    return states.out.x, metrics
def main(_):
    """MNIST training entry point (TF1 eager + Horovod) with checkpointing.

    Fix vs. original: the broadcast call was
    ``hvd.broadcast_variables(0, mnist_model.variables)`` — the arguments are
    reversed. Horovod's signature is ``broadcast_variables(variables,
    root_rank)``, so the original passed ``0`` as the variable list. Corrected
    to match the other call sites in this file.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Per-rank filename avoids download races on a shared filesystem.
    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255, tf.float32),
         tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    checkpoint_dir = './checkpoints'
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model,
                                     optimizer=opt,
                                     step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images, labels)) in enumerate(
            dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
def get_distributed_tape(tape):
    """Wrap *tape* in a Horovod DistributedGradientTape.

    The returned tape averages gradients across all Horovod workers on each
    ``gradient`` call; the original tape's recorded operations are reused.
    """
    distributed = hvd.DistributedGradientTape(tape)
    return distributed